# -*- coding: utf-8 -*-

from datetime import datetime

import draw
import draw1
import time
import numpy as np
import csv
import sys
import json
import re  # used for the "Topic ..." regexes below
import os.path
import subprocess

from textacy import Vectorizer, viz
from miscellaneous import *

import textacy
from scipy import *
import os

csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()  # ConfigParser is assumed to be provided by the star import from miscellaneous
with open(config_ini) as f:
    config.read_file(f)


def label2ID(label, labeldict):
    return labeldict.get(label, len(labeldict))


def generate_labelID_lines(textacyCorpus, labeldict):
    for doc in textacyCorpus:
        # generate "[topic1, topic2, ...] tok1 tok2 tok3" lines out of the corpus
        yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text


"""
def printvectorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
    logprint(str("ngrams: {0}".format(ngrams)))
    logprint(str("min_df: {0}".format(min_df)))
    logprint(str("max_df: {0}".format(max_df)))
    logprint(str("named_entities: {0}".format(named_entities)))

    # printlog("vectorize corpus...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True)
                  for doc in de_corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.__getattribute__("id_to_term")

    for t in terms_list:
        print(t)

    logprint("doc_term_matrix: {0}".format(doc_term_matrix))
    logprint("id2term: {0}".format(id2term))
"""


def textacyTopicModeling(corpus,
                         n_topics=15,
                         top_topic_words=7,
                         top_document_labels_per_topic=5,
                         ngrams=1,
                         min_df=1,
                         max_df=1.0,
                         topicModel='lda'):

    n_terms = int(n_topics * top_topic_words)
    sort_terms_by = 'seriation'  # 'seriation', 'weight', 'index', 'alphabetical'
    rank_terms_by = 'corpus'     # 'corpus', 'topic'

    logprint("############### Topic Modeling {0} ###########################".format(topicModel))
    logprint(str("ngrams: {0}".format(ngrams)))
    logprint(str("min_df: {0}".format(min_df)))
    logprint(str("max_df: {0}".format(max_df)))
    logprint(str("n_topics: {0}".format(n_topics)))
    logprint("\n")

    start = time.time()

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')

    #################### vectorize corpus ####################

    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True)
                  for doc in corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.__getattribute__("id_to_term")

    # printlog("terms_list: {0}".format(list(terms_list)))
    # printlog("doc_term_matrix: {0}".format(doc_term_matrix))

    ##################### Initialize and train a topic model ##############################################

    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)

    doc_topic_matrix = model.transform(doc_term_matrix)

    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
        logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))

    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
        logprint(topic_idx)
        for j in top_docs:
            logprint(corpus[j].metadata['categoryName'])

    ####################### termite plot ###################################################################
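    # A termite plot lays out topics against terms; the marker size reflects
    # each term's weight within a topic, so strongly associated terms stand out.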
    grams_label = "uni" if ngrams == 1 else "bi"

    """
    model.termite_plot(doc_term_matrix, id2term,
                       n_terms=n_terms,
                       sort_terms_by=sort_terms_by,
                       rank_terms_by=rank_terms_by + '_weight',
                       save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label, topicModel, n_topics,
                                                                              n_terms, sort_terms_by, rank_terms_by))
    """

    draw1.termite_plot(model, doc_term_matrix, id2term,
                       n_terms=n_terms,
                       sort_terms_by=sort_terms_by,
                       rank_terms_by=rank_terms_by + '_weight',
                       save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label, topicModel, n_topics,
                                                                              n_terms, sort_terms_by, rank_terms_by))

    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}: {0} min\n\n".format((end - start) / 60, topicModel))


def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):

    start = time.time()

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

    # build dictionary of ticket categories
    labelist = []
    for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
        labelist.append(texdoc.metadata["categoryName"])

    labeldict = {k: v for v, k in enumerate(labelist)}
    reverse_labeldict = {v: k for k, v in labeldict.items()}

    # ... and save it
    labeldict_path = FILEPATH + "results/labeldict.txt"
    with open(labeldict_path, 'w') as file:
        file.write(json.dumps(labeldict))

    n_topics = len(labeldict)  # +1 #default-topic

    # create file with label IDs (input for LLDA)
    textacy.fileio.write_file_lines(generate_labelID_lines(corpus, labeldict), filepath=LLDA_filepath)

    # wait for file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    logprint("")
    logprint("start LLDA:")

    # run JGibbsLLDA
    FNULL = open(os.devnull, 'w')  # suppress output
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
                       "-dfile", "tickets.gz",
                       "-twords", str(top_topic_words),
                       "-ntopics", str(n_topics)]
    subprocess.call(cmd_jgibbs_java, stdout=FNULL)

    # NOTE: the output files are hidden; they can be found in models/
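    # Expected layout of the decompressed .twords.gz (inferred from the parsing
    # code below, not from JGibbsLLDA documentation): one block per topic of the form
    #   Topic <n>:
    #   <term> <weight>
    #   ...
    # with <top_topic_words> term/weight pairs per topic; topic indices are
    # mapped back to category names via reverse_labeldict.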
    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")

    topic_regex = re.compile(r'Topic [0-9]*')

    #####################################
    # TODO: save results to a file derived from `results`
    result = []

    for line in output.splitlines():
        findall = topic_regex.findall(line)
        if len(findall) != 0:
            try:
                index = int(findall[0].split()[1])
                result.append("Topic {} {}:".format(index, reverse_labeldict[index]))
            except:
                result.append(line)
        else:
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results + ".txt")
    #####################################

    results = []
    res_dict = {}
    count = 0
    for line in output.splitlines():
        findall = topic_regex.findall(line)
        if len(findall) != 0:
            if len(res_dict) != 0:
                results.append(res_dict)  # append the previous topic dict (it is complete at this point)
            index = int(findall[0].split()[1])
            res_dict = {index: str(reverse_labeldict[index])}
        else:
            splitted = line.split()
            res_dict[splitted[0]] = float(splitted[1])

            """
            ### print terms that are topics
            for s in list(res_dict.values()):
                if isinstance(s, str) and splitted[0] in s:
                    vals = list(res_dict.values())
                    keys = list(res_dict.keys())
                    for v in vals:
                        if not isinstance(v, float):
                            print("{}".format(v))
                    print("{}".format(splitted[0]))
                    count += 1
                    print()
            ###
            """

    if len(res_dict) != 0:
        results.append(res_dict)  # append the last topic dict to the list

    # print(count)
    # print(float(count) / float(len(labelist)))

    # {0: 'betrieb', 'service': 0.24162679425837305, 'support': 0.24162679425837305, 'browser': 0.24162679425837305, 'unicard': 0.24162679425837305, 'telefon': 0.0023923444976076593}

    # collect every term in the results into a list
    terms = []
    for res in results:
        for key, value in res.items():
            if not isinstance(key, int) and key not in terms:
                terms.append(key)

    term2id = {t: i for i, t in enumerate(terms)}  # ... and into a dict

    ################# termite plot #####################################################################
    # term_topic_weights.shape = (len(term_ids), len(topic_ids))
    # topic_labels = tuple(labelist)

    topic_labels = list(range(len(labelist)))
    term_labels = list(range(len(term2id)))  # tuple([key for key in term2id.keys()])

    term_topic_weights = np.zeros((len(term2id), len(topic_labels)))

    for i, res in enumerate(results):
        for key, value in res.items():
            if not isinstance(key, int):
                term_topic_weights[term2id[key]][i] = value
                term_labels[term2id[key]] = key
            else:
                topic_labels[i] = reverse_labeldict[key]

    # viz.draw_termite_plot(term_topic_weights, topic_labels, term_labels, save=path2save_results + ".png")
    draw.draw_termite(term_topic_weights, topic_labels, term_labels, save=path2save_results + ".png")

    end = time.time()
    logprint("Time Elapsed Topic Modeling JGibbsLLDA: {0} min\n".format((end - start) / 60))


def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=False):

    start = time.time()  # mirrors the elapsed-time logging of jgibbsLLDA_category

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

    # ticket2kb_dict
    kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")

    ticket2kb_dict = {}  # {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...}

    for line in kb2ticket_gen:
        ticket_id = line[0]
        kb_id = line[1]
        ticket2kb_dict[ticket_id] = kb_id
    #############

    # kb2keywords_dict
    kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv",
                                              delimiter=";")  # "ArticleID";"Subject";"Keywords";.....
    next(kb2keywords_gen, None)  # skip header row

    kb2keywords_dict = {}

    for lino in kb2keywords_gen:
        kb_id = lino[0]
        kb2keywords_dict[kb_id] = []
        subject = lino[1]
        keywords = lino[2]
        keywords_list = [x.lower().strip() for x in map(replaceRockDots_lambda(), str(keywords).split(","))]

        if kb_keywords:
            for item in keywords_list:
                if item != "":
                    kb2keywords_dict[kb_id].append(item)
        else:
            kb2keywords_dict[kb_id].append(subject)

    # remove all empty items
    kb2keywords_dict = {k: v for k, v in kb2keywords_dict.items() if len(v) != 0}
    ###############

    # keywords2kb_dict
    keywords2kb_dict = {}
    for kb_id, lst in kb2keywords_dict.items():
        for l in lst:
            if l not in keywords2kb_dict.keys():
                keywords2kb_dict[l] = [kb_id]
            else:
                keywords2kb_dict[l].append(kb_id)
    ############

    # idea: topic_ID -> KB_ID -> keywords / subject -> llda
    # ticket2kb_dict    {'INC65627': 'KBA10044', 'INC66057': 'KBA10009', ...}
    # kb2keywords_dict  {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...}
    # keywords2kb_dict  {'unicard namensaenderung': ['KBA10276'], 'vpn': ['KBA10063'], 'outlook_exchange': ['KBA10181'], ...}

    # look for keywords that are actually used
    used_keywords = []
    for doc in corpus:
        ticket_number = doc.metadata["TicketNumber"]
        kb_number = ticket2kb_dict.get(ticket_number, None)
        keywords = kb2keywords_dict.get(kb_number, None)
        if keywords and kb_number:
            used_keywords.append(list(map(normalize, keywords)))

    kb_entries_used = len(list(set([kb for kb in ticket2kb_dict.values()])))
    print("kb_entries_used: {}".format(kb_entries_used))

    labelist = [item for sublist in used_keywords for item in sublist]
    labelist = list(set(labelist))
    print("len(labelist): {}".format(len(labelist)))

    labeldict = {k: v for v, k in enumerate(labelist)}
    labeldict_rev = {v: k for k, v in labeldict.items()}
    print("labeldict created")

    def genos_linos(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict):
        for doc in textacyCorpus:
            ticket_number = doc.metadata["TicketNumber"]
            kb_number = ticket2kb_dict.get(ticket_number, None)
            keywords = kb2keywords_dict.get(kb_number, None)

            if keywords and kb_number:
                label = ""
                for kw in keywords:
                    label = label + str(labeldict.get(normalize(str(kw)), len(labeldict))) + " "

                yield "[ " + label + "] " + doc.text

    line_gen = genos_linos(corpus, labeldict, ticket2kb_dict, kb2keywords_dict)

    textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)

    # wait for file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    logprint("")
    logprint("start LLDA:")

    # run JGibbsLLDA
    n_topics = len(labeldict)  # +1 #default-topic

    FNULL = open(os.devnull, 'w')  # suppress output
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
                       "-dfile", "tickets.gz",
                       "-twords", str(top_topic_words),
                       "-ntopics", str(n_topics)]
    subprocess.call(cmd_jgibbs_java, stdout=FNULL)

    # NOTE: the output files are hidden; they can be found in models/
    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")

    topic_regex = re.compile(r'Topic [0-9]*')

    #####################################
    # TODO: save results to a file derived from `results`
    result = []

    for line in output.splitlines():
        findall = topic_regex.findall(line)
        if len(findall) != 0:
            try:
                index = int(findall[0].split()[1])
                result.append("Topic {} {}:".format(index, labeldict_rev[index]))
            except:
                result.append(line)
        else:
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results + ".txt")
    #####################################

    results = []
    res_dict = {}
    count = 0
    for line in output.splitlines():
        findall = topic_regex.findall(line)
        if len(findall) != 0:
            if len(res_dict) != 0:
                results.append(res_dict)  # append the previous topic dict (it is complete at this point)
            index = int(findall[0].split()[1])
            res_dict = {index: str(labeldict_rev[index])}
        else:
            splitted = line.split()
            res_dict[splitted[0]] = float(splitted[1])

    if len(res_dict) != 0:
        results.append(res_dict)  # append the last topic dict to the list

    # collect every term in the results into a list
    terms = []
    for res in results:
        for key, value in res.items():
            if not isinstance(key, int) and key not in terms:
                terms.append(key)

    term2id = {t: i for i, t in enumerate(terms)}  # ... and into a dict

    ################# termite plot #####################################################################
    topic_labels = list(range(len(labelist)))
    term_labels = list(range(len(term2id)))  # tuple([key for key in term2id.keys()])

    term_topic_weights = np.zeros((len(term2id), len(topic_labels)))

    for i, res in enumerate(results):
        for key, value in res.items():
            if not isinstance(key, int):
                term_topic_weights[term2id[key]][i] = value
                term_labels[term2id[key]] = key
            else:
                topic_labels[i] = labeldict_rev[key]

    draw.draw_termite(term_topic_weights, topic_labels, term_labels, save=path2save_results + ".png")

    end = time.time()
    logprint("Time Elapsed Topic Modeling JGibbsLLDA_KB: {0} min\n".format((end - start) / 60))


def main(use_cleaned=False, algorithm="llda"):

    logprint("Topic Modeling: {0}".format(datetime.now()))

    corpus_de_path = FILEPATH + config.get("de_corpus", "path")
    corpus_en_path = FILEPATH + config.get("en_corpus", "path")

    if use_cleaned:
        preCorpus_name = "de" + "_clean_ticket"
        resultspath = FILEPATH + "results/clean"
    else:
        preCorpus_name = "de" + "_pre_ticket"
        resultspath = FILEPATH + "results/pre"

    # load cleaned corpus
    de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
    logprint("Corpus loaded: {0}".format(de_corpus.lang))

    """
    ngrams = 1
    min_df = 1
    max_df = 1.0
    weighting = 'tf'
    # weighting = 'tfidf'
    named_entities = False

    printvectorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
    printvectorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
    printvectorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting)

    printvectorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting)
    printvectorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
    printvectorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
    """

    if algorithm == "llda":
        top_topic_words = 5

        path2save_results = resultspath + "_cat_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA_category(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        kb_keywords = False
        path2save_results = resultspath + "_kb_{}_{}_{}".format("keys" if kb_keywords else "subs", algorithm,
                                                                "top" + str(top_topic_words))
        jgibbsLLDA_KB(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                      kb_keywords=kb_keywords)

        kb_keywords = True
        path2save_results = resultspath + "_kb_{}_{}_{}".format("keys" if kb_keywords else "subs", algorithm,
                                                                "top" + str(top_topic_words))
        jgibbsLLDA_KB(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                      kb_keywords=kb_keywords)

        """
        top_topic_words = 10
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        top_topic_words = 15
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        top_topic_words = 20
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
        """

    else:
        textacyTopicModeling(ngrams=1,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=15,
                             corpus=de_corpus)
        """
        textacyTopicModeling(ngrams=1, min_df=1, max_df=0.9, topicModel=algorithm, n_topics=20, corpus=de_corpus)
        textacyTopicModeling(ngrams=1, min_df=1, max_df=0.9, topicModel=algorithm, n_topics=25, corpus=de_corpus)
        textacyTopicModeling(ngrams=1, min_df=1, max_df=0.9, topicModel=algorithm, n_topics=30, corpus=de_corpus)
        """

        textacyTopicModeling(ngrams=(1, 2),
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=15,
                             corpus=de_corpus)
        """
        textacyTopicModeling(ngrams=(1, 2), min_df=1, max_df=0.9, topicModel=algorithm, n_topics=20, corpus=de_corpus)
        textacyTopicModeling(ngrams=(1, 2), min_df=1, max_df=0.9, topicModel=algorithm, n_topics=25, corpus=de_corpus)
        textacyTopicModeling(ngrams=(1, 2), min_df=1, max_df=0.9, topicModel=algorithm, n_topics=30, corpus=de_corpus)
        """


if __name__ == "__main__":
    main()
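# ---------------------------------------------------------------------------
# Hedged sketch (illustration only, never executed by the pipeline above):
# a tiny stand-alone demo of the tickets.gz line format that
# generate_labelID_lines() produces for JGibbsLLDA. The FakeDoc class,
# category names and ticket text below are invented for illustration.
#
# class FakeDoc(object):
#     def __init__(self, category, text):
#         self.metadata = {"categoryName": category}
#         self.text = text
#
# demo_docs = [FakeDoc("vpn", "vpn verbindung bricht ab"),
#              FakeDoc("unicard", "unicard ist defekt")]
# demo_labeldict = {"vpn": 0, "unicard": 1}
# for line in generate_labelID_lines(demo_docs, demo_labeldict):
#     print(line)  # -> "[0] vpn verbindung bricht ab", then "[1] unicard ist defekt"
#
# genos_linos() in jgibbsLLDA_KB emits the same shape, but may list several
# label IDs per ticket, e.g. "[ 0 1 ] ..." for a ticket with two keywords.
# ---------------------------------------------------------------------------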