# -*- coding: utf-8 -*- from datetime import datetime import time import csv import sys import json import os.path import subprocess from textacy import Vectorizer, viz from miscellaneous import * import textacy from scipy import * import os csv.field_size_limit(sys.maxsize) FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &" # load config config_ini = FILEPATH + "config.ini" config = ConfigParser.ConfigParser() with open(config_ini) as f: config.read_file(f) def label2ID(label, labeldict): return labeldict.get(label, len(labeldict)) def generate_labled_lines(textacyCorpus, labeldict): for doc in textacyCorpus: # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True): logprint(str("ngrams: {0}".format(ngrams))) logprint(str("min_df: {0}".format(min_df))) logprint(str("max_df: {0}".format(max_df))) logprint(str("named_entities: {0}".format(named_entities))) # printlog("vectorize corpi...") vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus) doc_term_matrix = vectorizer.fit_transform(terms_list) id2term = vectorizer.__getattribute__("id_to_term") for t in terms_list: print(t) logprint("doc_term_matrix: {0}".format(doc_term_matrix)) logprint("id2term: {0}".format(id2term)) def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda', named_entities=False): logprint( "############################################ Topic Modeling {0} #############################################".format( topicModel)) print("\n\n") logprint(str("ngrams: {0}".format(ngrams))) logprint(str("min_df: {0}".format(min_df))) logprint(str("max_df: {0}".format(max_df))) logprint(str("n_topics: {0}".format(n_topics))) logprint(str("named_entities: {0}".format(named_entities))) start = time.time() top_topic_words = 7 top_document_labels_per_topic = 5 # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix weighting = ('tf' if topicModel == 'lda' else 'tfidf') ####################'#################### # printlog("vectorize corpi...") vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus) doc_term_matrix = vectorizer.fit_transform(terms_list) id2term = vectorizer.__getattribute__("id_to_term") # printlog("terms_list: {0}".format(list(terms_list))) # printlog("doc_term_matrix: {0}".format(doc_term_matrix)) ##################### LSA, LDA, NMF Topic Modeling via Textacy ############################################## # Initialize and train a topic model # printlog("Initialize and train a topic model..") model = textacy.tm.TopicModel(topicModel, n_topics=n_topics) model.fit(doc_term_matrix) # Transform the corpi and interpret our model: # printlog("Transform the corpi and interpret our model..") doc_topic_matrix = model.transform(doc_term_matrix) print() for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words): logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms))) print() for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic): logprint(topic_idx) for j in top_docs: logprint(corpus[j].metadata['categoryName']) print() ##################################################################################################################### print() print() # termite plot n_terms = int(n_topics*top_topic_words) sort_terms_by = 'seriation' #'seriation', 'weight', 'index', 'alphabetical' rank_terms_by = 'corpus' # 'corpus', 'topic' model.termite_plot(doc_term_matrix, id2term, n_terms=n_terms, sort_terms_by=sort_terms_by, rank_terms_by=rank_terms_by+'_weight', save="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/results/{}_{}_{}_{}_{}.png".format(topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by)) end = time.time() logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel)) def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=False): ##################### LLDA Topic Modeling via JGibbsLabledLDA ############################################## start = time.time() # build dictionary of ticketcategories labelist = [] for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): labelist.append(texdoc.metadata["categoryName"]) labeldict = {k: v for v, k in enumerate(labelist)} if add_default_topic: n_topics = len(labeldict) + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic else: n_topics = len(labeldict) # + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/" LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) #dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root) dict_path = FILEPATH +"results/labeldict.txt".format(jgibbsLLDA_root) # printlog(str("LABELDICT: {0}".format(labeldict))) #logprint(str("LABELDICT-length: {0}".format(len(labeldict)))) with open(dict_path, 'w') as file: file.write(json.dumps(labeldict)) # for line in generate_labled_lines(de_corpus,labeldict): # print(line) # create file textacy.fileio.write_file_lines(generate_labled_lines(corpus, labeldict), filepath=LLDA_filepath) # wait for file to exist while not os.path.exists(LLDA_filepath): time.sleep(1) #top_topic_words=1 logprint("") logprint("start LLDA:") # run JGibsslda file FNULL = open(os.devnull, 'w') # supress output cmd_jgibbs_java = ["java", "-cp", "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format( jgibbsLLDA_root), "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile", "tickets.gz", "-twords", str(top_topic_words), "-ntopics", str(n_topics)] subprocess.call(cmd_jgibbs_java, stdout=FNULL) # ANMERKUNG: Dateien sind versteckt. zu finden in models/ # twords """ subprocess.call(["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]) """ cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)] """ proc = subprocess.Popen(cmd_gzip, stdout=subprocess.PIPE) process = subprocess.Popen(cmd_gzip, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # wait for the process to terminate out, err = process.communicate() errcode = process.returncode result = subprocess.check_output(cmd_gzip) #result = proc.stdout.read() result = proc.communicate() out=[] for line in result: out.append(line) """ output = subprocess.check_output(cmd_gzip).decode("utf-8") reverse_labeldict = {v: k for k, v in labeldict.items()} result = [] regex = re.compile(r'Topic [0-9]*') for line in output.splitlines(): findall = regex.findall(line) if len(findall) != 0: try: index = int(findall[0].split()[1]) result.append("Topic {} {}:".format(index, reverse_labeldict[index])) except: result.append(line) else: result.append(line) textacy.fileio.write_file_lines(result, path2save_results) ##################################################################################################################### #todo llda termite plot """ topic_inds=[] #: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] # get topic and term labels # : ('topic 0', 'topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5', 'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10', 'topic 11', 'topic 12', 'topic 13', 'topic 14') topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds) # : ('hardware', 'raum', 'adresse', 'gebaeude', 'tu', 'uni', 'ticket', 'email', 'account', 'nummer', 'mail', 'outlook', 'karte', 'drucker', 'server', 'service', 'antwort', 'verbindung', 'herzliche', 'einrichten', 'vergessen', 'wenden', 'ews', 'anhang', 'form', 'konto', 'nachricht', 'unterstuetzung', 'passwort', 'unicard', 'semester', 'system', 'aenderung', 'rueckmeldung', 'meldung', 'zugreifen', 'login', 'adressat', 'sender', 'kurs', 'made', 'mittwoch', 'note', 'our', 'korrespondenz', 'unbeschadet', 'boss', 'unterrichten', 'telefax', 'zugang', 'probleme', 'zugriff', 'mitarbeiterin', 'internet', 'daten', 'anmeldung', 'aendern', 'unterschrift', 'loeschen', 'anmelden', 'datei', 'message', 'laptop', 'benoetigt', 'link', 'montag', 'programm', 'ordner', 'personal', 'rechner', 'veranstaltung', 'august', 'lizenz', 'anschluss', 'mitarbeiter', 'erwuenscht', 'umzug', 'pc', 'uniaccount', 'amt', 'fax', 'it', 'institut', 'nutzer', 'bild', 'type', 'prof', 'verantwortlicher', 'bemerkung', 'antragsteller', 'element', 'hahn', 'eintrag', 'telefonbuch', 'ansprechpartner', 'universitaet', 'physik', 'abteilung', 'fakultaet', 'software', 'dezernat', 'einrichtung', 'telefon', 'lehrstuhl', 'buero') term_labels = tuple(id2term[term_ind] for term_ind in term_inds) # get topic-term weights to size dots #[[ 0.02721858 -0.03898025 0.00047936 ..., 0.05862538 -0.07742336 0.04761928] # [ 0.14977875 -0.24192522 -0.00620335 ..., -0.0497216 0.08269951 -0.05715901] # [ 0.04977951 0.02296709 0.01214562 ..., 0.11444371 -0.15212482 0.21481788] # ..., # [ term_topic_weights = np.array([self.model.components_[topic_ind][term_inds] for topic_ind in topic_inds]).T viz.draw_termite_plot( term_topic_weights, topic_labels, term_labels, save=path2save_results) """ logprint("") end = time.time() logprint("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60)) def main(use_raw=False, algorithm="llda"): logprint("Topic Modeling: {0}".format(datetime.now())) corpus_de_path = FILEPATH + config.get("de_corpus", "path") corpus_en_path = FILEPATH + config.get("en_corpus", "path") if use_raw: # fehler Unknown document label ( X ) for document 352. preCorpus_name = "de" + "_raw_ticket" resultspath = FILEPATH + "results/raw" else: preCorpus_name = "de" + "_pre_ticket" resultspath = FILEPATH + "results/pre" # load raw corpus and create new one de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path) logprint("Corpus loaded: {0}".format(de_corpus.lang)) # idee http://bigartm.org/ # idee http://wiki.languagetool.org/tips-and-tricks # idee https://en.wikipedia.org/wiki/Noisy_text_analytics # idee https://gate.ac.uk/family/ # todo llda topics zusammenfassen # idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics # frage lda wieviele tickets pro topic? """ ngrams = 1 min_df = 1 max_df = 1.0 weighting = 'tf' # weighting ='tfidf' named_entities = False printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting) printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting) printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting) printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting) printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting) printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting) """ if algorithm == "llda": top_topic_words = 5 add_default_topic = False path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "") jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, add_default_topic=add_default_topic) top_topic_words = 5 add_default_topic = True path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "") jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, add_default_topic=add_default_topic) top_topic_words = 10 add_default_topic = False path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "") jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, add_default_topic=add_default_topic) top_topic_words = 10 add_default_topic = True path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "") jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, add_default_topic=add_default_topic) # no_below = 20 # no_above = 0.5 # n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic else: # build dictionary of ticketcategories labelist = [] for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): labelist.append(texdoc.metadata["categoryName"]) labeldict = {k: v for v, k in enumerate(labelist)} textacyTopicModeling(ngrams = 1, min_df = 1, max_df = 0.9, topicModel = algorithm, n_topics =15, corpus=de_corpus) textacyTopicModeling(ngrams=1, min_df=1, max_df=0.9, topicModel=algorithm, n_topics=20, corpus=de_corpus) textacyTopicModeling(ngrams=1, min_df=1, max_df=0.9, topicModel=algorithm, n_topics=25, corpus=de_corpus) textacyTopicModeling(ngrams=1, min_df=1, max_df=0.9, topicModel=algorithm, n_topics=30, corpus=de_corpus) textacyTopicModeling(ngrams=(1, 2), min_df=1, max_df=0.9, topicModel=algorithm, n_topics=15, corpus=de_corpus) textacyTopicModeling(ngrams = (1,2), min_df = 1, max_df = 0.9, topicModel = algorithm, n_topics =20, corpus=de_corpus) textacyTopicModeling(ngrams = (1,2), min_df = 1, max_df = 0.9, topicModel = algorithm, n_topics =25, corpus=de_corpus) textacyTopicModeling(ngrams = (1,2), min_df = 1, max_df = 0.9, topicModel = algorithm, n_topics =30, corpus=de_corpus) """ textacyTopicModeling(ngrams = (1,2), min_df = 1, max_df = 0.8, topicModel = algorithm, n_topics =n_topics, corpus=de_corpus) """ """ textacyTopicModeling(ngrams = 1, min_df = 0.1, max_df = 0.6, topicModel = algorithm, n_topics =n_topics, corpus=de_corpus) textacyTopicModeling(ngrams = (1,2), min_df = 1, max_df = 1.0, topicModel = algorithm, n_topics =n_topics, corpus=de_corpus) textacyTopicModeling(ngrams = (1,2), min_df = 0.1, max_df = 0.6, topicModel = algorithm, n_topics =n_topics, corpus=de_corpus) textacyTopicModeling(ngrams = (1,2), min_df = 0.2, max_df = 0.8, topicModel = algorithm, n_topics = 20, corpus=de_corpus) """ if __name__ == "__main__": main()