diff --git a/cleaning.py b/cleaning.py
index 082cbf7..fd96d02 100644
--- a/cleaning.py
+++ b/cleaning.py
@@ -90,6 +90,7 @@ def autocorrectWord(word):
 
 
+
 def clean(stringstream,autocorrect=False):
 
     for string in stringstream:
@@ -165,6 +166,8 @@ autocorrect = config.getboolean("preprocessing", "autocorrect")
 
 def cleanCorpus(corpus_path, clean_in_meta, lang="de", printrandom=10,autocorrect=False):
 
+    autocorrect = False #todo STELLSCHRAUBE
+
     logprint("Clean {0}_corpus at {1}".format(lang, datetime.now()))
 
     rawCorpus_name = lang + "_raw_ticket"
diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz
index ea94e7e..d845c77 100644
Binary files a/java_LabledLDA/models/tickets/.others.gz and b/java_LabledLDA/models/tickets/.others.gz differ
diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz
index f8b9a95..48f38c8 100644
Binary files a/java_LabledLDA/models/tickets/.tassign.gz and b/java_LabledLDA/models/tickets/.tassign.gz differ
diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz
index 8db650c..195c1ec 100644
Binary files a/java_LabledLDA/models/tickets/.theta.gz and b/java_LabledLDA/models/tickets/.theta.gz differ
diff --git a/java_LabledLDA/models/tickets/.twords.gz b/java_LabledLDA/models/tickets/.twords.gz
index 8e81db4..f87b35c 100644
Binary files a/java_LabledLDA/models/tickets/.twords.gz and b/java_LabledLDA/models/tickets/.twords.gz differ
diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz
index 5a09245..f0c924d 100644
Binary files a/java_LabledLDA/models/tickets/.wordmap.gz and b/java_LabledLDA/models/tickets/.wordmap.gz differ
diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz
index 1ece2cc..6c3506f 100644
Binary files a/java_LabledLDA/models/tickets/tickets.gz and b/java_LabledLDA/models/tickets/tickets.gz differ
diff --git a/main.py b/main.py
index 2df1d14..cf1c57b 100644
--- a/main.py
+++ b/main.py
@@ -3,7 +3,7 @@ import matplotlib
 matplotlib.use('Agg')
 import time
 import init
-
+from datetime import datetime
 import corporization
 import preprocessing
 import topicModeling
@@ -21,6 +21,8 @@ start = time.time()
 
+
+# idee häufige n-gramme raus (zB damen und herren)
 # idee llda topics zusammenfassen
 # idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics
 # frage welche mitarbeiter bearbeiteten welche Topics?
 # idee topics mit mitarbeiternummern erstzen
@@ -29,9 +31,10 @@ start = time.time()
 # todo modelle testen
 
+logprint("main.py started at {}".format(datetime.now()))
+
 """
-
 init.main()
 logprint("")
@@ -41,13 +44,13 @@ logprint("")
 cleaning.main()
 logprint("")
 
-preprocessing.main() # ~5h
+preprocessing.main()
 logprint("")
 
-
-
 """
+
+
 
 #topicModeling.main(algorithm="lsa")
 logprint("")
@@ -56,16 +59,17 @@ logprint("")
 logprint("")
 
-topicModeling.main(algorithm="llda")
-logprint("")
-
-
 #topicModeling.main(algorithm="llda")
 logprint("")
 
+topicModeling.main(algorithm="lda")
+logprint("")
+
+
+
+
 end = time.time()
 
+logprint("main.py finished at {}".format(datetime.now()))
 logprint("Total Time Elapsed: {0} min".format((end - start) / 60))
-
-#800*400
\ No newline at end of file
diff --git a/preprocessing.py b/preprocessing.py
index 6327c50..422667f 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -49,6 +49,10 @@ def filterTokens(tokens, funclist):
     for f in funclist:
         tokens = list(filter(f, tokens))
 
+    for tok in tokens:
+        if tok.pos_ =="NOUN":
+            x=0
+
     return tokens
 
@@ -57,7 +61,9 @@ def keepPOS(pos_list):
 
 def keepNouns(noun_list=NOUNS):
-    return lambda tok: tok.lower_ in noun_list
+    #return lambda tok: tok.lower_ in noun_list
+    return lambda tok: tok.lower_ in noun_list or tok.pos_ == "NOUN"
+
 
 def removePOS(pos_list):
@@ -204,8 +210,8 @@ def processContentstream(textstream, parser, token_filterlist=None):
             tokens = filterTokens(tokens, token_filterlist)
 
         # post parse
-        tokens = [postparse(tok) for tok in tokens] #todo: informationsverlust von pos,tag etc.!
-
+        #todo STELLSCHRAUBE tokens = [postparse(tok) for tok in tokens] #todo: informationsverlust von pos,tag etc.!
+        tokens = [tok.lower_ for tok in tokens]
 
         yield " ".join(tokens)
 
 def preparse(stringstream):
@@ -360,16 +366,13 @@ def main():
 
         keepNouns(NOUNS),
 
-        remove_words_containing_Numbers(),
+        removeWords(DE_STOP_WORDS + custom_words + VORNAMEN),
 
         removePOS(["PUNCT", "SPACE", "NUM"]),
 
-        removeWords(DE_STOP_WORDS + custom_words + VORNAMEN),
-        #removeWords(DE_STOP_WORDS),
-
-        remove_long_words(),
-        remove_short_words(),
-        remove_first_names()
+        #todo STELLSCHRAUBE remove_words_containing_Numbers(),
+        #todo STELLSCHRAUBE remove_long_words(),
+        #todo STELLSCHRAUBE remove_short_words()
 
     ]
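
Note on the keepNouns change above: a token is now kept if it is either in the NOUNS whitelist or tagged NOUN by the parser, instead of whitelist-only. A minimal, self-contained sketch of that filter logic (using a stand-in Token type instead of spaCy tokens; the names and sample values are illustrative only):

from collections import namedtuple

# Stand-in for a spaCy token: only the two attributes the filter uses.
Token = namedtuple("Token", ["lower_", "pos_"])

def keep_nouns(noun_list):
    # keep whitelisted tokens OR anything the tagger marks as a noun
    return lambda tok: tok.lower_ in noun_list or tok.pos_ == "NOUN"

def filter_tokens(tokens, funclist):
    # same shape as filterTokens() above: apply each predicate in turn
    for f in funclist:
        tokens = list(filter(f, tokens))
    return tokens

tokens = [Token("drucker", "NOUN"), Token("defekt", "ADJ"), Token("unicard", "X")]
print([t.lower_ for t in filter_tokens(tokens, [keep_nouns({"unicard"})])])
# -> ['drucker', 'unicard']  ('defekt' is neither whitelisted nor a noun)
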
diff --git a/test.py b/test.py
index 5f431e8..37ba096 100644
--- a/test.py
+++ b/test.py
@@ -27,16 +27,189 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")
 preCorpus_name = "de" + "_pre_ticket"
 
 corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
 logprint("Corpus loaded: {0}".format(corpus.lang))
-#
+
+
+
+
+
 #todo randomize
 
-split_index = int(float(len(corpus)) * 0.8)
+
+split = 0.8
+weighting = "tf"
+min_df = 0
+max_df = 1
+ngrams = 1
+n_topics = 3
+top_n = 7
+
+
+
+split_index = int(float(len(corpus)) * split)
 corpus_train = corpus[0:split_index]
 corpus_test = corpus[split_index:len(corpus)-1]
 
+
+
+###### Initialize and train a topic model
+vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
+terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus_train)
+doc_term_matrix = vectorizer.fit_transform(terms_list)
+id2term = vectorizer.__getattribute__("id_to_term")
+model = textacy.tm.TopicModel("lda", n_topics=n_topics)
+model.fit(doc_term_matrix)
+######
+
+
+
+
+compenents = model.model.components_
+
+"""
+components_ : array, [n_components, n_features]
+
+Variational parameters for topic word distribution.
+
+Since the complete conditional for topic word distribution is a Dirichlet,
+components_[i, j] can be viewed as pseudocount that represents
+the number of times word j was assigned to topic i.
+
+It can also be viewed as distribution over the words for each topic after normalization:
+model.components_ / model.components_.sum(axis=1)[:, np.newaxis].
+
+"""
+
+test_doc = corpus_test[0]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+end = time.time()
+print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+
+ # frage wieviele tickets pro topic?
+
+ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/de_tickets.csv", delimiter=";")
+
+cat_dict = {}
+cat2id_dict = {}
+for line in ticket_gen:
+    tick_id = line[0]
+    cat = normalize(line[3])
+
+    cat2id_dict[cat] = tick_id
+
+    if cat not in cat_dict.keys():
+        cat_dict[cat] = 1
+    else:
+        cat_dict[cat] += 1
+
+import operator
+
+sorted_dict = sorted(cat_dict.items(), key=operator.itemgetter(1))
+
+for k, v in sorted_dict:
+    if k == "sd":
+        print(cat2id_dict[k])
+    print(k, v)
+
+print(len(sorted_dict))
+
+
+
+
+
+
+
+
 kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
 
 ticket2kb_dict = {}
@@ -118,14 +291,7 @@ for k,v in kb2keywords_dict.items(): #str,list
 
 import operator
 
-"""
-sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
-for k,v in sorted_dict:
-    print(k,v)
-
-print(len(sorted_dict))
-"""
 
 
@@ -152,6 +318,7 @@ for kb_entry in kb2keywords_gen:
     else:
         count_dict[entry_] += 1
 
+import operator
 
 sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
@@ -159,20 +326,7 @@ sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
 #    print(k,v)
 
 #print(len(sorted_dict))
-
-
-
-
-
-
-
-
-
-
-
-
-end = time.time()
-print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))
+"""
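
For reference, the components_ docstring quoted in test.py above is scikit-learn's LatentDirichletAllocation documentation; textacy.tm.TopicModel("lda") wraps that estimator, so model.model.components_ is the same array. A rough equivalent of the added train-and-inspect block in plain scikit-learn (toy documents; n_topics mirrors the value set above, everything else is illustrative):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ["drucker toner papier stau",
        "unicard verlust sperrung antrag",
        "vpn zugang passwort konto"]

vectorizer = CountVectorizer()                      # raw term counts, i.e. "tf" weighting
doc_term_matrix = vectorizer.fit_transform(docs)

model = LatentDirichletAllocation(n_components=3, random_state=0)
model.fit(doc_term_matrix)

# components_[i, j]: pseudocount of word j in topic i (see the docstring above)
terms = vectorizer.get_feature_names_out()
for topic_idx, weights in enumerate(model.components_):
    top = weights.argsort()[::-1][:3]
    print(topic_idx, [terms[i] for i in top])
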
diff --git a/topicModeling.py b/topicModeling.py
index 642eaf8..e3347df 100644
--- a/topicModeling.py
+++ b/topicModeling.py
@@ -5,6 +5,7 @@ import draw
 import draw1
 import time
 import numpy as np
+import operator
 import csv
 import sys
@@ -80,9 +81,8 @@ def textacyTopicModeling(corpus,
 
     doc_topic_matrix = model.transform(doc_term_matrix)
 
-
-    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
-        logprint('{0}: {1}'.format(topic_idx, " ".join(top_terms)))
+    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words, weights=True):
+        logprint('{0}: {1}'.format(topic_idx, str(top_terms)))
 
     for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
         logprint(topic_idx)
@@ -96,7 +96,7 @@ def textacyTopicModeling(corpus,
 
     grams_label = "uni" if ngrams == 1 else "bi"
 
-    draw1.termite_plot(model,doc_term_matrix, id2term,
+    draw1.termite_plot(model,doc_term_matrix, vectorizer.id_to_term,
                        n_terms=n_terms,
                        sort_terms_by=sort_terms_by,
@@ -117,8 +117,6 @@ def jgibbsLLDA(labeldict,line_gen,path2save_results, top_topic_words=7):
 
     LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
 
-
-
     textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)
@@ -241,24 +239,31 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
 
     logprint("start Category-LLDA:")
 
-    # build dictionary of ticketcategories
     labelist = []
 
     for doc in corpus:
-        labelist.append(normalize(doc.metadata["categoryName"]))
+
+        category = normalize(doc.metadata["categoryName"])
+        labelist.append(category)
+
     labelist = list(set(labelist))
-    print("len(labelist): {}".format(len(labelist)))
+    #print("len(labelist): {}".format(len(labelist)))
 
     labeldict = {k: v for v, k in enumerate(labelist)}
-
+    labeldict.update({'DEFAULT' : len(labeldict)})
 
     def gen_cat_lines(textacyCorpus, labeldict):
        """ generates [topic1, topic2....] tok1 tok2 tok3 out of corpi"""
 
        for doc in textacyCorpus:
-            yield "[" + str(labeldict.get(doc.metadata["categoryName"], len(labeldict))) + "] " + doc.text
+            label = labeldict.get(normalize(doc.metadata["categoryName"]), labeldict['DEFAULT'])
+
+            # frage nur die x häufigsten labels benutzen, rest raus?
+
+
+            yield "[ " + str(label) + " ] " + doc.text
 
     line_gen = gen_cat_lines(corpus, labeldict)
@@ -274,6 +279,7 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
 
     logprint("\n\n\nTime Elapsed Category-LLDA :{0} min\n\n".format((end - start) / 60))
 
+@deprecated
 def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=False):
     """ticket_ID -> KB_ID -> keywords / subject -> llda"""
 
@@ -420,7 +426,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
 
 
-    # kb2keywords_dict / kb2subj_dict {str : [str]}
+    # kb2keywords_dict / kb2subjects_dict --> {str : [str]}
     kb2keywords_dict = {}
     kb2subjects_dict = {}
@@ -458,7 +464,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
 
 
-    # ticket2kbs_dict
+    # ticket2kbs_dict --> {str : [str]}
     ticket2kbs_dict = {}
     kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
     next(kb2ticket_gen, None)  # skip first line"TicketNumber";"ArticleID"
@@ -479,8 +485,8 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
 
 
-    # ticket2keywords
-    ticket2keywords_dict = {}  # {str:[str]}
+    # ticket2keywords --> {str:[str]}
+    ticket2keywords_dict = {}
 
     for ticket_id, kb_ids in ticket2kbs_dict.items():
@@ -496,8 +502,8 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
 
 
-    # ticket2subjects
-    ticket2subjects_dict = {}  # {str:[str]}
+    # ticket2subjects --> {str:[str]}
+    ticket2subjects_dict = {}
 
     for ticket_id, kb_ids in ticket2kbs_dict.items():
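
The jgibbsLLDA_KB_v2 hunks above chain several {str: [str]} mappings: KB article -> keywords/subjects and ticket -> KB articles are composed into ticket -> keywords and ticket -> subjects. A small sketch of that composition with made-up toy data (all IDs and values here are invented for illustration, not taken from the M42 export):

# toy stand-ins for the dicts built from the export CSVs
kb2keywords_dict = {"KBA1": ["drucker"], "KBA2": ["unicard", "verlust"]}
ticket2kbs_dict = {"INC1": ["KBA1"], "INC2": ["KBA1", "KBA2"]}

# ticket2keywords --> {str: [str]}: union of the keywords of all linked KB entries
ticket2keywords_dict = {
    ticket: sorted({kw for kb in kbs for kw in kb2keywords_dict.get(kb, ["DEFAULT"])})
    for ticket, kbs in ticket2kbs_dict.items()
}
print(ticket2keywords_dict)
# -> {'INC1': ['drucker'], 'INC2': ['drucker', 'unicard', 'verlust']}
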
@@ -513,13 +519,12 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
 
     # kb2keywords_dict {'KBA10230': ['DEFAULT'], 'KBA10129': ['DEFAULT'], 'KBA10287': ['sd_ansys_informationen'], } len = 260
-    #kb2subjects_dict {'KBA10230': ['unicard nochmal beantragen'], 'KBA10129': ['sd_entsperrung unicard nach verlust/wiederfinden'], } len = 260
-    # ticket2kbs_dict {'INC44526': ['KBA10056'], 'INC67205': ['KBA10056'], } len = 4832
+    # kb2subjects_dict {'KBA10230': ['unicard nochmal beantragen'], 'KBA10129': ['sd_entsperrung unicard nach verlust/wiederfinden'], } len = 260
+    # ticket2kbs_dict {'INC44526': ['KBA10056'], 'INC67205': ['KBA10056'], } len = 4832
     # ticket2keywords_dict {'INC44526': ['DEFAULT'], 'INC67205': ['DEFAULT'], 'INC71863': ['DEFAULT'], 'INC44392': ['asknet'] } len=4832
-    #ticket2subjects_dioct {'INC44526': ['sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)'], len=4832
+    # ticket2subjects_dict {'INC44526': ['sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)'], len=4832
 
-    # frage wieviele tickets pro topic?
     count_dict = {}
     for v in ticket2kbs_dict.values():
         for kb in v:
@@ -527,18 +532,17 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
             count_dict[kb] +=1
         else:
             count_dict[kb] = 1
-    import operator
 
     sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
 
-    print("kb_entrys used: {}".format(len(sorted_dict)))
+
     for k,v in sorted_dict:
-        print(k,kb2subjects_dict[k],v) #todo das selbe mit keywords
+        subs = kb2subjects_dict[k]
+        keys = kb2keywords_dict[k]
+        print(subs, keys , v) # frage wieviele tickets pro topic?
+
+    print("kb_entrys used: {}".format(len(sorted_dict))) # frage wie viele kb_entry's insg genutzt?: 155
 
-    #todo hier weiter
-
-
-    # todo frage wie viele kb_entry's insg genutzt?
 
     labelist = ticket2keywords_dict.values()
     labelist = flatten(labelist)
@@ -564,6 +568,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
 
             yield "[ " + label + "] " + doc.text
 
+
     keys_line_gen = gen_key_lines(corpus, labeldict, ticket2keywords_dict)
 
     path2save_keys_results = path2save_results + "_kb_keys_llda_{}".format("top" + str(top_topic_words))
@@ -574,28 +579,13 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
 
 
-    """
-    def gen_subj_lines(textacyCorpus, labeldict, ticket2subjects_dict):
-        for doc in corpus:
-
-            ticket_number = doc.metadata["TicketNumber"]
-
-            keywords = ticket2subjects_dict.get(ticket_number, ['DEFAULT'])
-
-            if keywords != ['DEFAULT']:
-
-                label = ""
-                for kw in keywords:
-                    label = label + str(labeldict.get(normalize(str(kw)), len(labeldict))) + " "
-
-                yield "[ " + label + "] " + doc.text
-    """
 
     labelist = ticket2subjects_dict.values()
     labelist = flatten(labelist)
     labelist = list(set(labelist))
 
     labeldict = {k: v for v, k in enumerate(labelist)}
+    labeldict.update({'DEFAULT' : len(labeldict)})
 
     subj_line_gen = gen_key_lines(corpus, labeldict, ticket2subjects_dict)
@@ -616,19 +606,13 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
 
 def main( algorithm="llda"):
-
-
     logprint("Topic Modeling: {0}".format(datetime.now()))
-
-
     corpus_de_path = FILEPATH + config.get("de_corpus", "path")
     corpus_en_path = FILEPATH + config.get("en_corpus", "path")
-
-
+    preCorpus_name = "de" + "_pre_ticket_old"
     preCorpus_name = "de" + "_pre_ticket"
+
     resultspath = FILEPATH + "results/pre"
-
-
     # load corpus
     de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
     logprint("Corpus loaded: {0}".format(de_corpus.lang))
@@ -643,15 +627,15 @@ def main( algorithm="llda"):
 
         jgibbsLLDA_KB_v2(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)
 
-        kb_keywords = False
-        #jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
-
-        kb_keywords = True
-        #jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
-
-        """
+        kb_keywords = False
+        jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
+
+        kb_keywords = True
+        jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
+
+
         top_topic_words = 10
         path2save_results = resultspath + "_{}_{}".format(algorithm,"top"+str(top_topic_words))
         jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
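
Context for the gen_cat_lines / gen_key_lines changes: the lines written to tickets.gz follow the Labeled-LDA input convention of one document per line, numeric label ids in square brackets followed by the tokens, and the added labeldict.update({'DEFAULT': len(labeldict)}) gives documents with an unknown category or keyword an explicit fallback id instead of an out-of-range one. A minimal sketch of that line format (toy labels and texts, not the real corpus):

labelist = ["neuanschluss", "unicard", "drucker"]
labeldict = {label: idx for idx, label in enumerate(labelist)}
labeldict.update({"DEFAULT": len(labeldict)})   # fallback id for unseen labels

docs = [("unicard", "unicard verloren sperrung beantragen"),
        ("telefon", "telefon umzug neuanschluss antrag")]   # second label is unseen

def gen_lines(docs, labeldict):
    # "[ <label id> ] tok tok tok" -- one training line per document
    for label, text in docs:
        yield "[ " + str(labeldict.get(label, labeldict["DEFAULT"])) + " ] " + text

for line in gen_lines(docs, labeldict):
    print(line)
# [ 1 ] unicard verloren sperrung beantragen
# [ 3 ] telefon umzug neuanschluss antrag
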