diff --git a/config.ini b/config.ini
index 453ac97..98ebd26 100644
--- a/config.ini
+++ b/config.ini
@@ -41,8 +41,8 @@ filename=topicModelTickets.log
 
 [de_corpus]
 
-#input=M42-Export/Tickets_small.csv
-input=M42-Export/de_tickets.csv
+input=M42-Export/Tickets_small.csv
+#input=M42-Export/de_tickets.csv
 
 path=corpi/
 
@@ -64,7 +64,7 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI
 
 #ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC
 
-custom_words=geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
+custom_words=eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
 
diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz
index dfc54ee..27f7097 100644
Binary files a/java_LabledLDA/models/tickets/.others.gz and b/java_LabledLDA/models/tickets/.others.gz differ
diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz
index 5be480f..4a5b35b 100644
Binary files a/java_LabledLDA/models/tickets/.tassign.gz and b/java_LabledLDA/models/tickets/.tassign.gz differ
diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz
index 21cecf7..ce505a0 100644
Binary files a/java_LabledLDA/models/tickets/.theta.gz and b/java_LabledLDA/models/tickets/.theta.gz differ
diff --git a/java_LabledLDA/models/tickets/.twords.gz b/java_LabledLDA/models/tickets/.twords.gz
index 7dfe763..9673a6e 100644
Binary files a/java_LabledLDA/models/tickets/.twords.gz and b/java_LabledLDA/models/tickets/.twords.gz differ
diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz
index 0792c8d..4b38701 100644
Binary files a/java_LabledLDA/models/tickets/.wordmap.gz and b/java_LabledLDA/models/tickets/.wordmap.gz differ
diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz
index 4a252a2..54fbfc5 100644
Binary files a/java_LabledLDA/models/tickets/tickets.gz and b/java_LabledLDA/models/tickets/tickets.gz differ
diff --git a/main.py b/main.py
index fe673c0..abbfb66 100644
--- a/main.py
+++ b/main.py
@@ -2,18 +2,21 @@
 import matplotlib
 matplotlib.use('Agg')
 
 import time
+import init
+
 import corporization
 import preprocessing
 import topicModeling
 import cleaning
-from miscellaneous import *
+from miscellaneous import *
 
 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
 start = time.time()
 
-import init
-init.main()
+
+
+#init.main()
 logprint("")
 
 corporization.main()
@@ -25,14 +28,32 @@
logprint("") preprocessing.main() # ~5h logprint("") -#topicModeling.main(use_raw=False,algorithm="llda") + +""" +topicModeling.main(use_raw=False,algorithm="lsa") logprint("") -#topicModeling.main(use_raw=True) topicModeling.main(use_raw=False,algorithm="lda") logprint("") + +topicModeling.main(use_raw=False,algorithm="nmf") +logprint("") + + +topicModeling.main(use_raw=False,algorithm="llda") +logprint("") +""" + + + + + + + + + logprint("") end = time.time() diff --git a/miscellaneous.py b/miscellaneous.py index d528665..59c09e3 100644 --- a/miscellaneous.py +++ b/miscellaneous.py @@ -154,18 +154,6 @@ def printRandomDoc(textacyCorpus): print() -def corpus2Text(corpus): - for doc in corpus: - yield doc.text - -def corpus2Meta(corpus): - for doc in corpus: - yield doc.metadata - -def saveplaincorpustext(corpus,path): - textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path ) - - def save_corpus(corpus, corpus_path, corpus_name): """ @@ -175,42 +163,22 @@ def save_corpus(corpus, corpus_path, corpus_name): :param corpus_name: str (should content the language like "_de_") """ - """ - # save stringstore - stringstore_path = corpus_path + corpus_name + '_strings.json' - with open(stringstore_path, "w") as file: - parser.vocab.strings.dump(file) - - #todo save vocab? - """ - # save parser parser = corpus.spacy_lang parserpath = corpus_path + str(parser.lang) + '_parser' parser.save_to_directory(parserpath) - ## - - # save content - contentpath = corpus_path + corpus_name + "_content.bin" - textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath) - - #save plain content + # save plain content + meta plainpath = corpus_path + corpus_name + "_content.json" - textacy.fileio.write_json_lines(({"index" : doc.corpus_index, "content" : doc.text} for doc in corpus), plainpath) - - # save meta - metapath = corpus_path + corpus_name + "_meta.json" - #meta_gen = (doc.metadata.update({"index": doc.corpus_index}) for doc in corpus) - meta_gen = gen_meta(corpus) - textacy.fileio.write_json_lines(meta_gen, metapath) + textacy.fileio.write_json_lines(gen_dicts(corpus), plainpath) -def gen_meta(corpus): + +def gen_dicts(corpus): for doc in corpus: - meta = doc.metadata - meta.update({"index": doc.corpus_index}) - yield meta + dict = {"index" : doc.corpus_index, "content" : doc.text} + dict.update(doc.metadata) + yield dict @@ -233,7 +201,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"): # load parser parser = spacy.load(lang) - stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json' with open(stringstorepath) as file: parser.vocab.strings.load(file) @@ -244,46 +211,35 @@ def load_corpus(corpus_path, corpus_name, lang="de"): #load corpus corpus = textacy.Corpus(parser) - - contentpath = corpus_path + corpus_name + "_content.bin" plainpath = corpus_path + corpus_name + "_content.json" - metapath = corpus_path + corpus_name + "_meta.json" - """ - try: - spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath) - metadata_stream = textacy.fileio.read_json_lines(metapath) - - for spacy_doc, metadata in zip(spacy_docs, metadata_stream): - corpus.add_doc( - textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata)) - except: - """ - # neu init!! 
-    #corpus = textacy.Corpus(parser)
    plain_stream = textacy.fileio.read_json_lines(plainpath) # yields {int : str}
-    metadata_stream = textacy.fileio.read_json_lines(metapath)
-
-    for plain, metadata in zip(plain_stream, metadata_stream):
-        if plain["index"] == metadata["index"]:
-            corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata))
-        else:
-            raise IndexError
+    for plain in plain_stream:
+        meta = {}
+        for key,value in plain.items():
+            if key != "content" and key != "index":
+                meta[key] = value
+        corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))
 
    return corpus, corpus.spacy_lang
 
+"""
+def corpus2Text(corpus):
+    for doc in corpus:
+        yield doc.text
+
+def corpus2Meta(corpus):
+    for doc in corpus:
+        yield doc.metadata
+
+def saveplaincorpustext(corpus,path):
+    textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )
 
 def save_corpusV2(corpus, corpus_path, corpus_name):
-    """
-    saves a textacy-corpus including spacy-parser
-    :param corpus: textacy-Corpus
-    :param corpus_path: str
-    :param corpus_name: str (should content the language like "_de_")
-    """
+
    # save parser
    parser = corpus.spacy_lang
@@ -302,13 +258,7 @@ def save_corpusV2(corpus, corpus_path, corpus_name):
            file.write(json.dumps(doc.metadata))
 
 
 def load_corpusV2(corpus_path, corpus_name, lang="de"):
-    """
-    Load textacy-Corpus including spacy-parser out from file
-    :param corpus_path: str
-    :param corpus_name: str (should content the language like "_de_")
-    :param lang: str (language code) ir spacy.Language
-    :return: texracy.Corpus, spacy.language
-    """
+
    # check for language
    if "de_" in corpus_name:
@@ -359,5 +309,5 @@ def yield_fromdir(path,spacy_vocab=None,type=".pkl"):
 
    else:
        for filename in filelist:
            yield load_obj(path+filename)
-
+"""
diff --git a/topicModeling.py b/topicModeling.py
index 7b16ae0..170943d 100644
--- a/topicModeling.py
+++ b/topicModeling.py
@@ -9,7 +9,7 @@
 import sys
 import json
 import os.path
 import subprocess
-from textacy import Vectorizer
+from textacy import Vectorizer, viz
 from miscellaneous import *
 import textacy
@@ -163,7 +163,8 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
    jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/"
 
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
-    dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
+    #dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
+    dict_path = FILEPATH + "results/labeldict.txt"
 
    # printlog(str("LABELDICT: {0}".format(labeldict)))
    #logprint(str("LABELDICT-length: {0}".format(len(labeldict))))
@@ -243,6 +244,30 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
    textacy.fileio.write_file_lines(result, path2save_results)
    #####################################################################################################################
+
+    #todo llda termite plot
+    """
+    topic_inds=[] #: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
+
+    # get topic and term labels
+    # : ('topic 0', 'topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5', 'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10', 'topic 11', 'topic 12', 'topic 13', 'topic 14')
+    topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)
+
+    # : ('hardware', 'raum', 'adresse', 'gebaeude', 'tu', 'uni', 'ticket', 'email', 'account', 'nummer', 'mail', 'outlook', 'karte', 'drucker', 'server', 'service', 'antwort', 'verbindung', 'herzliche', 'einrichten', 'vergessen', 'wenden', 'ews', 'anhang', 'form', 'konto', 'nachricht', 'unterstuetzung', 'passwort', 'unicard', 'semester', 'system', 'aenderung', 'rueckmeldung', 'meldung', 'zugreifen', 'login', 'adressat', 'sender', 'kurs', 'made', 'mittwoch', 'note', 'our', 'korrespondenz', 'unbeschadet', 'boss', 'unterrichten', 'telefax', 'zugang', 'probleme', 'zugriff', 'mitarbeiterin', 'internet', 'daten', 'anmeldung', 'aendern', 'unterschrift', 'loeschen', 'anmelden', 'datei', 'message', 'laptop', 'benoetigt', 'link', 'montag', 'programm', 'ordner', 'personal', 'rechner', 'veranstaltung', 'august', 'lizenz', 'anschluss', 'mitarbeiter', 'erwuenscht', 'umzug', 'pc', 'uniaccount', 'amt', 'fax', 'it', 'institut', 'nutzer', 'bild', 'type', 'prof', 'verantwortlicher', 'bemerkung', 'antragsteller', 'element', 'hahn', 'eintrag', 'telefonbuch', 'ansprechpartner', 'universitaet', 'physik', 'abteilung', 'fakultaet', 'software', 'dezernat', 'einrichtung', 'telefon', 'lehrstuhl', 'buero')
+    term_labels = tuple(id2term[term_ind] for term_ind in term_inds)
+
+    # get topic-term weights to size dots
+    #[[ 0.02721858 -0.03898025  0.00047936 ...,  0.05862538 -0.07742336  0.04761928]
+    # [ 0.14977875 -0.24192522 -0.00620335 ..., -0.0497216   0.08269951 -0.05715901]
+    # [ 0.04977951  0.02296709  0.01214562 ...,  0.11444371 -0.15212482  0.21481788]
+    # ...,
+    # [
+    term_topic_weights = np.array([self.model.components_[topic_ind][term_inds]
+                                   for topic_ind in topic_inds]).T
+
+    viz.draw_termite_plot(
+        term_topic_weights, topic_labels, term_labels, save=path2save_results)
+    """
 
    logprint("")
 
    end = time.time()
@@ -275,9 +300,9 @@ def main(use_raw=False, algorithm="llda"):
    # idea https://en.wikipedia.org/wiki/Noisy_text_analytics
    # idea https://gate.ac.uk/family/
 
-    # todo gescheites tf(-idf) maß finden
-    # todo topics zusammenfassen
-    # frage wieviele tickets pro topic?
+    # todo llda: merge similar topics
+    # idea: train lda so that the term <-> topic assignment does not get too weak, while keeping as many topics as possible
+    # question (lda): how many tickets per topic?
""" ngrams = 1 @@ -300,25 +325,25 @@ def main(use_raw=False, algorithm="llda"): if algorithm == "llda": top_topic_words = 5 add_default_topic = False - path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "") + path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "") jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, add_default_topic=add_default_topic) top_topic_words = 5 add_default_topic = True - path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "") + path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "") jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, add_default_topic=add_default_topic) top_topic_words = 10 add_default_topic = False - path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "") + path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "") jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, add_default_topic=add_default_topic) top_topic_words = 10 add_default_topic = True - path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "") + path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "") jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, add_default_topic=add_default_topic) @@ -339,15 +364,70 @@ def main(use_raw=False, algorithm="llda"): labeldict = {k: v for v, k in enumerate(labelist)} - n_topics = 15 textacyTopicModeling(ngrams = 1, min_df = 1, - max_df = 0.8, + max_df = 0.9, topicModel = algorithm, - n_topics =n_topics, + n_topics =15, corpus=de_corpus) + + textacyTopicModeling(ngrams=1, + min_df=1, + max_df=0.9, + topicModel=algorithm, + n_topics=20, + corpus=de_corpus) + + textacyTopicModeling(ngrams=1, + min_df=1, + max_df=0.9, + topicModel=algorithm, + n_topics=25, + corpus=de_corpus) + + + textacyTopicModeling(ngrams=1, + min_df=1, + max_df=0.9, + topicModel=algorithm, + n_topics=30, + corpus=de_corpus) + + + + textacyTopicModeling(ngrams=(1, 2), + min_df=1, + max_df=0.9, + topicModel=algorithm, + n_topics=15, + corpus=de_corpus) + + textacyTopicModeling(ngrams = (1,2), + min_df = 1, + max_df = 0.9, + topicModel = algorithm, + n_topics =20, + corpus=de_corpus) + + textacyTopicModeling(ngrams = (1,2), + min_df = 1, + max_df = 0.9, + topicModel = algorithm, + n_topics =25, + corpus=de_corpus) + + + textacyTopicModeling(ngrams = (1,2), + min_df = 1, + max_df = 0.9, + topicModel = algorithm, + n_topics =30, + corpus=de_corpus) + + + """ textacyTopicModeling(ngrams = (1,2), min_df = 1,