# -*- coding: utf-8 -*-
from datetime import datetime

import draw
import draw1
import time
import numpy as np
import csv
import sys
import json
import os.path
import re
import subprocess
from textacy import Vectorizer, viz
from miscellaneous import *
import textacy
from scipy import *
import os

csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
# note: ConfigParser, logprint, normalize and load_corpus are expected to be
# provided by the star import from miscellaneous
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


def textacyTopicModeling(corpus,
                         n_topics=15, top_topic_words=7, top_document_labels_per_topic=5,
                         ngrams=1, min_df=1, max_df=0.9,
                         topicModel='lda'):

    n_terms = int(n_topics * top_topic_words)
    sort_terms_by = 'seriation'  # 'seriation', 'weight', 'index', 'alphabetical'
    rank_terms_by = 'corpus'     # 'corpus', 'topic'

    logprint("#### Topic Modeling {0}".format(topicModel))
    logprint("ngrams: {0}".format(ngrams))
    logprint("min_df: {0}".format(min_df))
    logprint("max_df: {0}".format(max_df))
    logprint("n_topics: {0}".format(n_topics))
    logprint("\n")

    start = time.time()

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')

    ###### vectorize corpus
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.id_to_term

    ####### initialize and train a topic model
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)

    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
        logprint('{0}: {1}'.format(topic_idx, " ".join(top_terms)))

    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
        logprint(topic_idx)
        for j in top_docs:
            logprint(corpus[j].metadata['categoryName'])

    ####################### termite plot ###################################################################
    grams_label = "uni" if ngrams == 1 else "bi"
    draw1.termite_plot(model, doc_term_matrix, id2term,
                       n_terms=n_terms,
                       sort_terms_by=sort_terms_by,
                       rank_terms_by=rank_terms_by + '_weight',
                       save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(
                           grams_label, topicModel, n_topics, n_terms, sort_terms_by, rank_terms_by))

    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}: {0} min\n\n".format((end - start) / 60, topicModel))
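# Illustrative usage sketch (comment only, not executed): textacyTopicModeling is
# invoked from main() below on a preprocessed textacy corpus; the vectorizer
# weighting switches from 'tf' (for 'lda') to 'tfidf' for other model types:
#
#   de_corpus, parser = load_corpus(corpus_name="de_pre_ticket",
#                                   corpus_path=FILEPATH + config.get("de_corpus", "path"))
#   textacyTopicModeling(corpus=de_corpus, ngrams=1, topicModel='lda', n_topics=15)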
def jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=7):

    labeldict_rev = {v: k for k, v in labeldict.items()}

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

    textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)

    # wait for the file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    # run JGibbsLLDA
    n_topics = len(labeldict)  # +1 #default-topic

    FNULL = open(os.devnull, 'w')  # suppress output
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
                       "-dfile", "tickets.gz", "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
    subprocess.call(cmd_jgibbs_java, stdout=FNULL)

    # NOTE: the output files are hidden; they can be found in models/
    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")

    topic_regex = re.compile(r'Topic [0-9]*')

    #####################################
    # todo: save results to a file based on `results`
    result = []
    for line in output.splitlines():
        findall = topic_regex.findall(line)
        if len(findall) != 0:
            try:
                index = int(findall[0].split()[1])
                result.append("Topic {} {}:".format(index, labeldict_rev[index]))
            except (IndexError, ValueError, KeyError):
                result.append(line)
        else:
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results + ".txt")
    #####################################

    results = []
    res_dict = {}
    for line in output.splitlines():
        findall = topic_regex.findall(line)
        if len(findall) != 0:
            if len(res_dict) != 0:
                results.append(res_dict)  # append the previous topic dict (it is complete at this point)
            index = int(findall[0].split()[1])
            res_dict = {index: str(labeldict_rev[index])}
        else:
            splitted = line.split()
            res_dict[splitted[0]] = float(splitted[1])

    if len(res_dict) != 0:
        results.append(res_dict)  # append the last topic dict

    # collect every term in the results in a list ...
    terms = []
    for res in results:
        for key, value in res.items():
            if not isinstance(key, int) and key not in terms:
                terms.append(key)

    term2id = {t: i for i, t in enumerate(terms)}  # ... and in a dict

    ################# termite plot #####################################################################
    topic_labels = list(range(len(labeldict)))
    term_labels = list(range(len(term2id)))

    term_topic_weights = np.zeros((len(term2id), len(topic_labels)))

    for i, res in enumerate(results):
        for key, value in res.items():
            if not isinstance(key, int):
                term_topic_weights[term2id[key]][i] = value
                term_labels[term2id[key]] = key
            else:
                topic_labels[i] = labeldict_rev[key]

    draw.draw_termite(term_topic_weights, topic_labels, term_labels,
                      save=path2save_results + ".png")

    draw.draw_termite(term_topic_weights, topic_labels, term_labels,
                      save=path2save_results + "_spaced.png", pow_x=0.78, pow_y=0.87)

    # save labeldict
    labeldict_path = path2save_results + "_labeldict.json"
    with open(labeldict_path, 'w') as file:
        file.write(json.dumps(labeldict))
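# Sketch of the structure the parsing loops in jgibbsLLDA assume for the
# decompressed .twords output (hedged: terms, weights and spacing are hypothetical;
# only the "Topic <n>" headers followed by "<term> <weight>" lines matter here):
#
#   Topic 0:
#       drucker 0.1234
#       toner 0.0456
#   Topic 1:
#       ...
#
# This yields `results` as one dict per topic, e.g.
# [{0: 'Drucker', 'drucker': 0.1234, 'toner': 0.0456}, ...], mixing the integer
# topic id (mapped back to its label via labeldict_rev) with the term/weight pairs.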
def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):

    start = time.time()
    logprint("")
    logprint("start Category-LLDA:")

    # build dictionary of ticket categories
    labelist = []
    for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
        labelist.append(texdoc.metadata["categoryName"])

    labelist = list(set(labelist))
    print("len(labelist): {}".format(len(labelist)))

    labeldict = {k: v for v, k in enumerate(labelist)}

    def gen_cat_lines(textacyCorpus, labeldict):
        """generates lines of the form "[topic1 topic2 ...] tok1 tok2 tok3" out of the corpus"""
        for doc in textacyCorpus:
            yield "[" + str(labeldict.get(doc.metadata["categoryName"], len(labeldict))) + "] " + doc.text

    line_gen = gen_cat_lines(corpus, labeldict)

    path2save_results = path2save_results + "_kb_cat_llda_{}".format("top" + str(top_topic_words))

    jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words)

    end = time.time()
    logprint("\n\n\nTime Elapsed Category-LLDA: {0} min\n\n".format((end - start) / 60))


def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=False):
    """ticket_ID -> KB_ID -> keywords / subject -> llda"""

    start = time.time()
    logprint("")
    logprint("start {}-LLDA:".format("Keyword" if kb_keywords else "Subject"))

    # ticket2kb_dict
    kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")

    ticket2kb_dict = {}
    for line in kb2ticket_gen:
        ticket_id = line[0]
        kb_id = line[1]
        ticket2kb_dict[ticket_id] = kb_id
    # {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...}

    kb_entries_used = len(list(set(ticket2kb_dict.values())))
    print("kb_entries_used: {}".format(kb_entries_used))

    # kb2keywords_dict
    kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
    next(kb2keywords_gen, None)  # skip first line ("ArticleID";"Subject";"Keywords";...)

    kb2keywords_dict = {}
    for line in kb2keywords_gen:
        kb_id = line[0]
        subject = line[1]
        keywords = line[2]
        keywords_list = [normalize(x) for x in str(keywords).split(",")]

        if kb_id not in kb2keywords_dict.keys():
            kb2keywords_dict[kb_id] = []

        if kb_keywords:
            for item in keywords_list:
                if item != "":
                    kb2keywords_dict[kb_id].append(item)
        else:
            kb2keywords_dict[kb_id].append(subject)

    # remove all empty items
    kb2keywords_dict = {k: v for k, v in kb2keywords_dict.items() if len(v) != 0}
    # {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...}

    # keywords2kb_dict
    keywords2kb_dict = {}
    for kb_id, lst in kb2keywords_dict.items():
        for l in lst:
            if l not in keywords2kb_dict.keys():
                keywords2kb_dict[l] = [kb_id]
            else:
                keywords2kb_dict[l].append(kb_id)
    # {'unicard namensaenderung': ['KBA10276'], 'vpn': ['KBA10063'], 'outlook_exchange': ['KBA10181'], ...}

    # look for keywords that are actually used
    used_keywords = []
    for doc in corpus:
        ticket_number = doc.metadata["TicketNumber"]
        kb_id = ticket2kb_dict.get(ticket_number, None)
        keywords = kb2keywords_dict.get(kb_id, None)
        if keywords and kb_id:
            used_keywords.append(list(map(normalize, keywords)))

    labelist = [item for sublist in used_keywords for item in sublist]  # flatten list
    labelist = list(set(labelist))
    print("len(labelist): {}".format(len(labelist)))

    labeldict = {k: v for v, k in enumerate(labelist)}

    def gen_KB_lines(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict):
        for doc in corpus:
            ticket_number = doc.metadata["TicketNumber"]
            kb_number = ticket2kb_dict.get(ticket_number, None)
            keywords = kb2keywords_dict.get(kb_number, None)
            if keywords:
                label = ""
                for kw in keywords:
                    label = label + str(labeldict.get(normalize(str(kw)), len(labeldict))) + " "
                yield "[ " + label + "] " + doc.text

    line_gen = gen_KB_lines(corpus, labeldict, ticket2kb_dict, kb2keywords_dict)

    path2save_results = path2save_results + "_kb_{}_llda_{}".format("keys" if kb_keywords else "subs",
                                                                    "top" + str(top_topic_words))

    jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words)

    end = time.time()
    logprint("\n\n\nTime Elapsed {1}-LLDA: {0} min\n\n".format((end - start) / 60,
                                                               "Keyword" if kb_keywords else "Subject"))
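# Sketch (hypothetical values) of the labeled training lines that gen_cat_lines and
# gen_KB_lines yield for jgibbsLLDA: a bracketed prefix holding the integer label
# id(s) from labeldict, followed by the ticket text, e.g.
#
#   gen_cat_lines: "[3] bitte um zuruecksetzung des passworts ..."
#   gen_KB_lines:  "[ 12 7 ] unicard defekt risse ..."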
def main(algorithm="llda"):

    logprint("Topic Modeling: {0}".format(datetime.now()))

    corpus_de_path = FILEPATH + config.get("de_corpus", "path")
    corpus_en_path = FILEPATH + config.get("en_corpus", "path")

    preCorpus_name = "de" + "_pre_ticket"
    resultspath = FILEPATH + "results/pre"

    # load corpus
    de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
    logprint("Corpus loaded: {0}".format(de_corpus.lang))

    if algorithm == "llda":
        top_topic_words = 5

        jgibbsLLDA_category(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)

        kb_keywords = False
        jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words,
                      kb_keywords=kb_keywords)

        kb_keywords = True
        jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words,
                      kb_keywords=kb_keywords)

        """
        top_topic_words = 10
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        top_topic_words = 15
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        top_topic_words = 20
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
        """

    else:
        textacyTopicModeling(ngrams=1, topicModel=algorithm, corpus=de_corpus)
        """
        textacyTopicModeling(ngrams=1, min_df=1, max_df=0.9, topicModel=algorithm, n_topics=20, corpus=de_corpus)
        textacyTopicModeling(ngrams=1, min_df=1, max_df=0.9, topicModel=algorithm, n_topics=25, corpus=de_corpus)
        textacyTopicModeling(ngrams=1, min_df=1, max_df=0.9, topicModel=algorithm, n_topics=30, corpus=de_corpus)
        """

        textacyTopicModeling(ngrams=(1, 2), topicModel=algorithm, corpus=de_corpus)
        """
        textacyTopicModeling(ngrams=(1, 2), min_df=1, max_df=0.9, topicModel=algorithm, n_topics=20, corpus=de_corpus)
        textacyTopicModeling(ngrams=(1, 2), min_df=1, max_df=0.9, topicModel=algorithm, n_topics=25, corpus=de_corpus)
        textacyTopicModeling(ngrams=(1, 2), min_df=1, max_df=0.9, topicModel=algorithm, n_topics=30, corpus=de_corpus)
        """


if __name__ == "__main__":
    main()
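# Usage sketch: running this module trains the three LLDA variants (category,
# KB subject, KB keyword) by default; calling main() with another algorithm name,
# e.g. main("lda"), runs the textacy topic models instead.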