diff --git a/M42-Export/Tickets_small.csv b/M42-Export/Tickets_small.csv index 7936a66..520a4a8 100644 --- a/M42-Export/Tickets_small.csv +++ b/M42-Export/Tickets_small.csv @@ -1,92 +1,4 @@ "TicketNumber";"Subject";"CreatedDate";"categoryName";"Impact";"Urgency";"BenutzerID";"VerantwortlicherID";"EigentuemerID";"Description";"Solution" -"INC20357";"schulungstest";"21.07.2015 08:19:34";"ZHB";"2 - Mittel (Abt./Bereich)";"B - Normal";"aa8315f5-52c3-e411-80c7-0050569c58f5";"";"aa8315f5-52c3-e411-80c7-0050569c58f5";"kevin arbeite gefälligst :)";"" -"INC40481";"Telephone Contract";"13.08.2015 14:18:57";"Neuanschluss";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Neuanschluss -Antragsteller: -Melanie Hinrichs -melanie.hinrichs@tu-dortmund.de -  -  -  -Terminvorschlag unbestimmt -"TicketNumber";"Subject";"CreatedDate";"categoryName";"Impact";"Urgency";"BenutzerID";"VerantwortlicherID";"EigentuemerID";"Description";"Solution" -"INC20357";"schulungstest";"21.07.2015 08:19:34";"ZHB";"2 - Mittel (Abt./Bereich)";"B - Normal";"aa8315f5-52c3-e411-80c7-0050569c58f5";"";"aa8315f5-52c3-e411-80c7-0050569c58f5";"kevin arbeite gefälligst :)";"" -"INC40481";"Telephone Contract";"13.08.2015 14:18:57";"Neuanschluss";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Neuanschluss -Antragsteller: -Melanie Hinrichs -melanie.hinrichs@tu-dortmund.de -  -  -  -Terminvorschlag unbestimmt -Einrichtung Dezernat 3 -Abteilung Abteilung 2 -PSP Element L-11-10000-100-302300 -UniAccount myvowest(Westerdorf, Yvonne) -Gebäude Pavillon 8 -Raum ID 031 (63292) -Telefondose keine vorhanden -Telefonnr. - -Eintrag Telefonbuch -E-Mail melanie.hinrichs@tu-dortmund.de -Voicemail Nicht erwünscht -Ansprechpartner Melanie Hinrichs -Tel. Ansprechpartner 5848 -Verantwortlicher Nutzer - -Type Amt -Bemerkung: -Es wird ein Telefon benötigt,ein Telefon mit 6 Speicherpl.f.die Gruppenfunktion ist ausreichend. Die Möbel werden am 10.06.2015 aufgestellt.Weder Netzwerkdose noch Telefondose vorhanden. Dez.6 hat Vorbereitungen getroffen.";"Frau Hinrichs überdenkt die Situation und macht dann neue Anträge. -Dieses Ticket wird geschlossen" -"INC40483";"Telephone Contract";"13.08.2015 14:22:06";"Neuanschluss";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Neuanschluss -Antragsteller: -Anja Kulmsee -anja.kulmsee@tu-dortmund.de -  -  -  -Terminvorschlag 03.08.2015 -Einrichtung Fk06 Dekanat -Abteilung Bereich Studium und Lehre -PSP Element L-11-10000-100-060011 -UniAccount manjkulm(Kulmsee, Anja) -Gebäude CT Geschossbau 2 -Raum ID G2-3.22 (64882) -Telefondose -Telefonnr. - -Eintrag Telefonbuch -E-Mail anja.kulmsee@tu-dortmund.de -Voicemail Nicht erwünscht -Ansprechpartner Anja Kulmsee -Tel. Ansprechpartner 6179, 7370, 7179 -Verantwortlicher Nutzer - -Type Amt -Bemerkung: -Der Anschluß ist für ein Faxgerät. 
Wenn möglich hätte ich gern die Rufnummer 3033.";"Faxnummer 3166 wurde unter die Telefonnummer 7179 im elektronischen Telefonbuch eingetragen" -"INC40484";"Defekte Netzwerkdose / Frage zu VPN";"13.08.2015 14:25:50";"LAN";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Sehr geehrtes ITMC Service Team, - -seit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen an das Intranet der BMP mit der Dosennummer G1 303/04/12.05 (G1 4 26-1) in Raum G1-426 nicht mehr zu funktionieren. -Ich würde Sie daher bitten diese Mail an den zuständigen Kollegen weiterzuleiten, um die Leitung vielleicht einmal zu Prüfen. - -Des Weiteren hätte ich noch eine Frage bezüglich der Möglichkeit zur Nutzung einer VPN Verbindung aus unserem Intranet heraus zu einem fremden Netzwerk. Dies ist zwar über das WLAN-Netz möglich, jedoch nicht aus unserem Netzwerk heraus. Vielleicht können Sie mir mitteilen an welchen Kollegen ich mich bezüglich dieses Problem wenden kann. - -Bei Rückfragen stehe ich gerne zur Verfügung! - -Beste Grüße, - -Nicolas Rauner - -LS Biomaterialien und Polymerwissenschaften -Fakultät Bio- und Chemieingenieurwesen -TU Dortmund -D-44227 Dortmund - -Tel: + 49-(0)231 / 755 - 3015 -Fax: + 49-(0)231 / 755 - 2480 - -www.ls-bmp.de ";"Hallo Herr Rauner, -die Netzwerkdose weist z. Z. keine Verbindungsprobleme auf. Falls doch welche bestehen, melden Sie sich bitte bei uns. - -Mit freunldichen Grüßen -Aicha Oikrim" "INC40487";"(SSO) Login via Browser mit Zertifikat";"13.08.2015 14:54:57";"Betrieb";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Lieber Support, ich habe gerade versucht mich mit meiner Unicard im Firefox-Browser für das Service-Portal zu authentifizieren. 
Das hat vor einigen Wochen noch tadelos diff --git a/config.ini b/config.ini index 49394be..5e99a06 100644 --- a/config.ini +++ b/config.ini @@ -1,86 +1,91 @@ [thesaurus] -input = deWordNet.xml -pickle_file = thesaurus_dict.pkl +input=deWordNet.xml +pickle_file=thesaurus_dict.pkl [spellchecking] -input = deu_news_2015_1M-sentences.txt -pickle_file = words_dict.pkl +input=deu_news_2015_1M-sentences.txt +pickle_file=words_dict.pkl [lemmatization] -input = lemmas.txt -pickle_file = lemma_dict.pkl +input=lemmas.txt +pickle_file=lemma_dict.pkl [nouns] -input1 = nomen.txt -input2 = nomen2.txt -pickle_file = nouns_list.pkl +input1=nomen.txt +input2=nomen2.txt +pickle_file=nouns_list.pkl [firstnames] -input = firstnames.txt -pickle_file = firstnames_list.pkl +input=firstnames.txt +pickle_file=firstnames_list.pkl [de_stopwords] -input1 = de_stopwords_1.txt -input2 = de_stopwords_2.txt -input3 = de_stopwords_3.txt -pickle_file = stopwords_list.pkl +input1=de_stopwords_1.txt +input2=de_stopwords_2.txt +input3=de_stopwords_3.txt +pickle_file=de_stopwords_list.pkl + +[en_stopwords] + +pickle_file=en_stopwords_list.pkl [logging] -level = INFO -filename = topicModelTickets.log +level=INFO +filename=topicModelTickets.log [de_corpus] -#input = M42-Export/Tickets_med.csv -#input = M42-Export/Tickets_small.csv -#input = M42-Export/Tickets_mini.csv -input = M42-Export/de_tickets.csv +#input=M42-Export/Tickets_med.csv +#input=M42-Export/Tickets_small.csv +#input=M42-Export/Tickets_mini.csv +input=M42-Export/de_tickets.csv -path = corpi/ +path=corpi/ [en_corpus] -input = M42-Export/en_tickets.csv +input=M42-Export/en_tickets.csv -path = corpi/ +path=corpi/ [tickets] -content_collumn_name = Description -metaliste = TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID,VerantwortlicherID,EigentuemerID,Solution +content_collumn_name=Description +metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID,VerantwortlicherID,EigentuemerID,Solution [preprocessing] -ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC +ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC -custom_words = grüßen,fragen,damen,probleme,herren,dank - -#lemmatize = True +custom_words=geehrt,dame,herr,hilfe,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,institut,universitaet,name,gruss,id,erfolg,mail,folge,nummer,team,fakultaet,email,absender,tu,versenden,vorname,message,service,strasse,prozess,portal,raum,personal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,kollege,pruefen,hoffen -[topic modeling] +#lemmatize=True -ngrams = (1,2) -min_df = 0 -max_df = 1.0 -no_below = 20 -no_above = 0.5 +[topicmodeling] -topicModel = lda +ngrams=(1,2) -top_topic_words = 5 +min_df=0 +max_df=1.0 +no_below=20 +no_above=0.5 -top_document_labels_per_topic = 2 +topicModel=lda + +top_topic_words=5 + +top_document_labels_per_topic=2 diff --git a/corporization.py b/corporization.py index 50cba8e..64e4c47 100644 --- a/corporization.py +++ b/corporization.py @@ -7,6 +7,7 @@ import time from datetime import datetime import re import textacy +from textacy.preprocess import normalize_whitespace from scipy import * import os @@ -93,10 +94,8 @@ metaliste = [ ] """ - content_collumn_name = config.get("tickets","content_collumn_name") -metaliste = config.get("tickets","metaliste").split(",") - +metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(","))) 
path2de_csv = FILEPATH + config.get("de_corpus","input") corpus_de_path = FILEPATH + config.get("de_corpus", "path") @@ -121,7 +120,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la raw_corpus = textacy.Corpus(lang) ## add files to textacy-corpi, - printlog("Add texts to {0}_textacy-corpi".format(lang)) + #printlog("Add texts to {0}_textacy-corpi".format(lang)) raw_corpus.add_texts( ticketcsv_to_textStream(path2_csv, content_collumn_name), @@ -140,6 +139,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la # save corpus raw_name = lang + "_raw_ticket" save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name) + printlog("Done") def main(): @@ -148,7 +148,7 @@ def main(): ticketcsv2Corpus(path2de_csv,corpus_de_path,content_collumn_name,metaliste,lang="de") - ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en") + #ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en") end = time.time() diff --git a/init.py b/init.py index 596190f..71c28b2 100644 --- a/init.py +++ b/init.py @@ -264,7 +264,9 @@ path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file") stop1 = FILEPATH + config.get("de_stopwords","input1") stop2 = FILEPATH + config.get("de_stopwords","input2") stop3 = FILEPATH + config.get("de_stopwords","input3") -path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file") +path2stopwordlist_de = FILEPATH + config.get("de_stopwords","pickle_file") + +path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file") @@ -293,8 +295,9 @@ def main(): printlog("Build and save stoppwortliste") - de_stop_words = create_stopword_lists(stop1, stop2, stop3) - save_obj(de_stop_words, path2stopwordlist) + de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3) + save_obj(de_stop_words, path2stopwordlist_de) + save_obj(en_stop_words, path2stopwordlist_en) diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz index 6502fb1..48c3c1b 100644 Binary files a/java_LabledLDA/models/tickets/.others.gz and b/java_LabledLDA/models/tickets/.others.gz differ diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz index c7df8fa..f815b2d 100644 Binary files a/java_LabledLDA/models/tickets/.tassign.gz and b/java_LabledLDA/models/tickets/.tassign.gz differ diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz index d75dd2e..8d1f466 100644 Binary files a/java_LabledLDA/models/tickets/.theta.gz and b/java_LabledLDA/models/tickets/.theta.gz differ diff --git a/java_LabledLDA/models/tickets/.twords.gz b/java_LabledLDA/models/tickets/.twords.gz index bf49506..df07efe 100644 Binary files a/java_LabledLDA/models/tickets/.twords.gz and b/java_LabledLDA/models/tickets/.twords.gz differ diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz index a19d863..6051dd9 100644 Binary files a/java_LabledLDA/models/tickets/.wordmap.gz and b/java_LabledLDA/models/tickets/.wordmap.gz differ diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz index 062a01e..0516f9a 100644 Binary files a/java_LabledLDA/models/tickets/tickets.gz and b/java_LabledLDA/models/tickets/tickets.gz differ diff --git a/main.py b/main.py index faa9c8e..0cdd6ca 100644 --- a/main.py +++ b/main.py @@ -4,6 +4,7 @@ import time import init import corporization 
import preprocessing +import topicModeling from miscellaneous import * @@ -19,5 +20,10 @@ printlog("") preprocessing.main() printlog("") +topicModeling.main() +printlog("") + end = time.time() printlog("Total Time Elapsed: {0} min".format((end - start) / 60)) + + diff --git a/miscellaneous.py b/miscellaneous.py index debe414..d1a3fa6 100644 --- a/miscellaneous.py +++ b/miscellaneous.py @@ -131,8 +131,8 @@ def printRandomDoc(textacyCorpus): else: printlog("len(textacyCorpus) = %i" % len(textacyCorpus)) randIndex = int((len(textacyCorpus) - 1) * random.random()) - printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text, - textacyCorpus[randIndex].metadata)) + printlog("Index: {0} \n Text: {1} \n Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text, + textacyCorpus[randIndex].metadata['categoryName'])) print() diff --git a/preprocessing.py b/preprocessing.py index c7f0d65..26e755e 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -24,18 +24,30 @@ with open(config_ini) as f: config.read_file(f) +global REGEX_SPECIALCHAR +global REGEX_TOPLVL - -REGEX_SPECIALCHAR = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|?]' +REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|]' REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?' + + +global THESAURUS +global WORDS +global LEMMAS +global NOUNS +global VORNAMEN +global DE_STOP_WORDS +global EN_STOP_WORDS + THESAURUS = {} -WORDS = {} -LEMMAS = {} -NOUNS = [] -VORNAMEN= [] -de_stop_words=[] +WORDS= {} +LEMMAS= {} +NOUNS= {} +VORNAMEN= {} +DE_STOP_WORDS= {} +EN_STOP_WORDS= {} ############# filter tokens @@ -210,6 +222,10 @@ def stringcleaning(stringstream): yield string + + + + def filterTokens(tokens, funclist): # in:tokenlist, funclist # out: tokenlist @@ -218,9 +234,75 @@ def filterTokens(tokens, funclist): return tokens +def processContentstream2(textstream, parser, token_filterlist=None): + #pre parse + textstream = preparse(textstream) + pipe = parser.pipe(textstream) + for doc in pipe: + + tokens = [tok for tok in doc] + + # in parse + if token_filterlist is not None: + tokens = filterTokens(tokens, token_filterlist) + + # post parse + tokens = [postparse(tok) for tok in tokens] #todo informationsverlust! 
+ + yield " ".join(tokens) + +def preparse(stringstream): + + for string in stringstream: + # fixUnicode + string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC') + + # seperate_words_on_regex: + string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string)) + + #normalize whitespace + string = textacy.preprocess.normalize_whitespace(string) + + # replaceRockDots + string = re.sub(r'[ß]', "ss", string) + string = re.sub(r'[ö]', "oe", string) + string = re.sub(r'[ü]', "ue", string) + string = re.sub(r'[ä]', "ae", string) + + # cut_after + # todo addressen enfernen --> postal.parser idee zu metadaten hinzufügen + words = ["gruss", "grusse","gruesse","gruessen","grusses"] + + for gr in words: + if gr in string: + string = string.rpartition(gr)[0] + break + + yield string + +def postparse(toktext): + """ + :param toktext: spacy.token + :return: string + """ + toktext = toktext.lower_ + + # remove_words_containing_topLVL + toktext = toktext if not re.search(REGEX_TOPLVL, toktext) else "" + + # lemmatize + toktext = lemmatizeWord(toktext) + + # synonyme normalisieren + toktext = getFirstSynonym(toktext) + + # autocorrect + toktext = autocorrectWord(toktext) + + return toktext def corpus2Text(corpus): for doc in corpus: @@ -303,52 +385,16 @@ path2nouns_list = FILEPATH + config.get("nouns","pickle_file") path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file") -path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file") - +path2DEstopwordlist = FILEPATH + config.get("de_stopwords", "pickle_file") +path2ENstopwordlist = FILEPATH + config.get("en_stopwords", "pickle_file") corpus_de_path = FILEPATH + config.get("de_corpus", "path") corpus_en_path = FILEPATH + config.get("en_corpus", "path") -custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus", - "hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb", "helfen", - "versuchen", "unbestimmt", "woche", "tadelos", "klappen", "mittlerweile", "bekommen", - "erreichbar", "gruss", "auffahren", "vorgang", "hinweis", "institut", "universitaet", - "name", "gruss", "id", "erfolg", "mail","folge", - "nummer", "team", "fakultaet", "email", "absender", "tu", "versenden", "vorname", "message", - "service", "strasse", "prozess", "portal", "raum", "personal", "moeglichkeit", "fremd", "wende", - "rueckfrage", "stehen", "verfuegung", - "funktionieren", "kollege", "pruefen", "hoffen" - ] -filter_tokens = [ - # removeENT(["PERSON"]), - - keepNouns(), - - remove_words_containing_Numbers(), - - removePOS(["PUNCT", "SPACE", "NUM"]), - - #removeWords(de_stop_words + custom_words), - removeWords(de_stop_words), - - remove_long_words(), - remove_short_words(), - remove_first_names() - - -] -#todo filtertokens haut alle raus -filter_tokens = None - -clean_in_meta = { - "Solution": [removePOS(["SPACE"])], - "Subject": [removePOS(["SPACE", "PUNCT"])], - "categoryName": [removePOS(["SPACE", "PUNCT"])] -} def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10): @@ -365,7 +411,7 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print ## process and add files to textacy-corpi, corpus.add_texts( - processContentstream(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser), + processContentstream2(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser), processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser) ) @@ -392,14 +438,39 @@ def main(): THESAURUS = 
load_obj(path2thesaurus_dict) WORDS = load_obj(path2wordsdict) LEMMAS = load_obj(path2lemmadict) - DE_STOP_WORDS = load_obj(path2stopwordlist) + DE_STOP_WORDS = load_obj(path2DEstopwordlist) + EN_STOP_WORDS = load_obj(path2ENstopwordlist) NOUNS = load_obj(path2nouns_list) VORNAMEN = load_obj(path2firstnameslist) + filter_tokens = [ + # removeENT(["PERSON"]), + + keepNouns(NOUNS), + + remove_words_containing_Numbers(), + + removePOS(["PUNCT", "SPACE", "NUM"]), + + # removeWords(de_stop_words + custom_words), + removeWords(DE_STOP_WORDS), + + remove_long_words(), + remove_short_words(), + remove_first_names() + + ] + + + clean_in_meta = { + "Solution": [removePOS(["SPACE"])], + "Subject": [removePOS(["SPACE", "PUNCT"])], + "categoryName": [removePOS(["SPACE", "PUNCT"])] + } + preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" ) - preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" ) - + #preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" ) end = time.time() printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60)) diff --git a/testra.py b/testra.py index 843d548..d1fc357 100644 --- a/testra.py +++ b/testra.py @@ -8,6 +8,8 @@ import json #import textacy from functools import reduce +import textacy + start = time.time() import enchant @@ -54,8 +56,12 @@ corpi.add_texts( print(corpi) """ +jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/" - +LLDA_filepath = "{0}labeldict.txt".format(jgibbsLLDA_root) +laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_hilfe': 18, 'sap': 7, 'pos': 23, 'verwaltung': 4, 'lan': 1} +with open(LLDA_filepath, 'w') as file: + file.write(json.dumps(laveldict)) """ def load_corpus(corpus_path, corpus_name, lang="de"): from pathlib import Path @@ -85,20 +91,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"): textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata)) return corpus """ -import os -a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt" -b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt" -d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt" - -c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt" - - - - -scriptpath = os.path.dirname(os.path.realpath(__file__)) - - - """ diff --git a/topicModeling.py b/topicModeling.py index 75dbe12..cd35eac 100644 --- a/topicModeling.py +++ b/topicModeling.py @@ -1,82 +1,39 @@ # -*- coding: utf-8 -*- - -from datetime import datetime - -print(datetime.now()) - -import time - -import enchant - -start = time.time() from datetime import datetime import time -import logging -from stop_words import get_stop_words -#import words as words -from nltk.corpus import stopwords as nltk_stopwords -from collections import Counter import csv -import re -import xml.etree.ElementTree as ET -import spacy -import textacy -from scipy import * import sys -csv.field_size_limit(sys.maxsize) -import pickle -import configparser as ConfigParser -from miscellaneous import * - - - -import time - - - - -from datetime import datetime -import logging -from nltk.corpus import stopwords -import csv -import functools -import re -import xml.etree.ElementTree as ET -import spacy -import textacy -from scipy import * -import sys -csv.field_size_limit(sys.maxsize) - -import logging - -import csv -import functools +import json import os.path -import re import subprocess -import time -import 
xml.etree.ElementTree as ET -import sys -import spacy +from textacy import Vectorizer + +from miscellaneous import * import textacy from scipy import * -from textacy import Vectorizer -import warnings -import configparser as ConfigParser -import sys -import hunspell -from postal.parser import parse_address + +import os csv.field_size_limit(sys.maxsize) +FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" + +# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &" + + +# load config +config_ini = FILEPATH + "config.ini" + +config = ConfigParser.ConfigParser() +with open(config_ini) as f: + config.read_file(f) -def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True): +def printvecotorization(de_corpus,ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True): printlog(str("ngrams: {0}".format(ngrams))) printlog(str("min_df: {0}".format(min_df))) printlog(str("max_df: {0}".format(max_df))) @@ -94,47 +51,7 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en printlog("doc_term_matrix: {0}".format(doc_term_matrix)) printlog("id2term: {0}".format(id2term)) -corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/" -corpus_name = "de_corpus" - -# load corpi -de_corpus = load_corpus(corpus_name=corpus_name,corpus_path=corpus_path) - - - -# todo gescheites tf(-idf) maß finden -ngrams = 1 -min_df = 1 -max_df = 1.0 -weighting = 'tf' -# weighting ='tfidf' -named_entities = False - -""" -printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting) -printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting) -printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting) - -printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting) -printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting) -printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting) -""" - - -# build citionary of ticketcategories -labelist = [] - -for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): - labelist.append(texdoc.metadata["categoryName"]) - -LABELDICT = {k: v for v, k in enumerate(labelist)} - -printlog(str("LABELDICT: {0}".format(LABELDICT))) - - - -def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(LABELDICT), named_entities=False, - corpus=de_corpus): +def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda',named_entities=False): printlog( "############################################ Topic Modeling {0} #############################################".format( topicModel)) @@ -198,132 +115,174 @@ def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len( printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel)) -# no_below = 20 -# no_above = 0.5 +def jgibbsLLDA(de_corpus, top_topic_words): + ##################### LLDA Topic Modeling via JGibbsLabledLDA ############################################## + + start = time.time() + + def label2ID(label, labeldict): + return labeldict.get(label, len(labeldict)) + + def generate_labled_lines(textacyCorpus,labeldict): + for doc in textacyCorpus: + # generate [topic1, topic2....] 
tok1 tok2 tok3 out of corpi + yield "[" + str(label2ID(doc.metadata["categoryName"],labeldict)) + "] " + doc.text + + # build citionary of ticketcategories + labelist = [] + + for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): + labelist.append(texdoc.metadata["categoryName"]) + + labeldict = {k: v for v, k in enumerate(labelist)} + + n_topics = len(labeldict) + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic + + jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/" + + LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) + dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root) -# n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic - - - -""" -topicModeling(ngrams = 1, - min_df = 1, - max_df = 1.0, - topicModel = 'lda', - n_topics = len(LABELDICT), - corpi=de_corpus) - -topicModeling(ngrams = 1, - min_df = 0.1, - max_df = 0.6, - topicModel = 'lda', - n_topics = len(LABELDICT), - corpi=de_corpus) - -topicModeling(ngrams = (1,2), - min_df = 1, - max_df = 1.0, - topicModel = 'lda', - n_topics = len(LABELDICT), - corpi=de_corpus) - -topicModeling(ngrams = (1,2), - min_df = 0.1, - max_df = 0.6, - topicModel = 'lda', - n_topics = len(LABELDICT), - corpi=de_corpus) - -topicModeling(ngrams = (1,2), - min_df = 0.2, - max_df = 0.8, - topicModel = 'lda', - n_topics = 20, - corpi=de_corpus) - - - - - - - -""" - -##################### LLDA Topic Modeling via JGibbsLabledLDA ############################################## - - -top_topic_words = 15 - -print("\n\n") -start = time.time() - -n_topics = len(LABELDICT) # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic - -# build citionary of ticketcategories -labelist = [] - -for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): - labelist.append(texdoc.metadata["categoryName"]) - -LABELDICT = {k: v for v, k in enumerate(labelist)} -print(LABELDICT) - - -def label2ID(label, labeldict=LABELDICT): - return labeldict.get(label, len(labeldict)) - - -def generate_labled_lines(textacyCorpus): - for doc in textacyCorpus: - # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi - yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text - - -jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/" -LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) - -# create file -textacy.fileio.write_file_lines(generate_labled_lines(de_corpus), filepath=LLDA_filepath) - -# todfo ticket drucken -# wait for file to exist -while not os.path.exists(LLDA_filepath): - time.sleep(1) - -print("\n\n") -printlog("start LLDA:") -# run JGibsslda file -FNULL = open(os.devnull, 'w') # supress output -subprocess.call(["java", - "-cp", - "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root), - "jgibblda.LDA", - "-est", - "-dir", "{0}models/tickets".format(jgibbsLLDA_root), - "-dfile", "tickets.gz", - "-twords", str(top_topic_words), - "-ntopics", str(n_topics)], stdout=FNULL) - -# ANMERKUNG: Dateien sind versteckt. 
zu finden in models/ - -# twords -subprocess.call(["gzip", - "-dc", - "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]) -##################################################################################################################### -print() -print() - -end = time.time() -printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60)) + #printlog(str("LABELDICT: {0}".format(labeldict))) + printlog(str("LABELDICT-length: {0}".format(len(labeldict)))) + with open(dict_path, 'w') as file: + file.write(json.dumps(labeldict)) + + #for line in generate_labled_lines(de_corpus,labeldict): + # print(line) + + # create file + textacy.fileio.write_file_lines(generate_labled_lines(de_corpus,labeldict), filepath=LLDA_filepath) + + # wait for file to exist + while not os.path.exists(LLDA_filepath): + time.sleep(1) + """ + printlog("") + printlog("start LLDA:") + # run JGibsslda file + FNULL = open(os.devnull, 'w') # supress output + subprocess.call(["java", + "-cp", + "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format( + jgibbsLLDA_root), + "jgibblda.LDA", + "-est", + "-dir", "{0}models/tickets".format(jgibbsLLDA_root), + "-dfile", "tickets.gz", + "-twords", str(top_topic_words), + "-ntopics", str(n_topics)], stdout=FNULL) + + # ANMERKUNG: Dateien sind versteckt. zu finden in models/ + + # twords + subprocess.call(["gzip", + "-dc", + "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]) + ##################################################################################################################### + printlog("") + """ + end = time.time() + printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60)) +def main(): + printlog("Topic Modeling: {0}".format(datetime.now())) + corpus_de_path = FILEPATH + config.get("de_corpus", "path") + corpus_en_path = FILEPATH + config.get("en_corpus", "path") + preCorpus_name = "de" + "_pre_ticket" + + #load raw corpus and create new one + de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path) + printlog("Corpus loaded: {0}".format(de_corpus.lang)) + + #idee http://bigartm.org/ + #idee http://wiki.languagetool.org/tips-and-tricks + + # todo gescheites tf(-idf) maß finden + ngrams = 1 + min_df = 1 + max_df = 1.0 + weighting = 'tf' + # weighting ='tfidf' + named_entities = False + + + """ + printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting) + printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting) + printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting) + + printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting) + printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting) + printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting) + """ + + + jgibbsLLDA(de_corpus,15) + + # no_below = 20 + # no_above = 0.5 + + + # n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic + + + + """ + topicModeling(ngrams = 1, + min_df = 1, + max_df = 1.0, + topicModel = 'lda', + n_topics = len(LABELDICT), + corpi=de_corpus) + + topicModeling(ngrams = 1, + min_df = 0.1, + max_df = 0.6, + topicModel = 'lda', + n_topics = len(LABELDICT), + corpi=de_corpus) + + topicModeling(ngrams = (1,2), + min_df = 1, + max_df = 1.0, + topicModel = 'lda', + n_topics = len(LABELDICT), + corpi=de_corpus) + + topicModeling(ngrams = (1,2), + min_df = 0.1, + max_df = 0.6, + topicModel = 'lda', + 
n_topics = len(LABELDICT), + corpi=de_corpus) + + topicModeling(ngrams = (1,2), + min_df = 0.2, + max_df = 0.8, + topicModel = 'lda', + n_topics = 20, + corpi=de_corpus) + + + + + + + + """ + + + +if __name__ == "__main__": + main()
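
The refactored jgibbsLLDA() writes two artifacts for the JGibbsLabledLDA run: models/tickets/labeldict.txt, a JSON mapping from categoryName to a numeric label id, and the training file in which every document line is prefixed with that id. Below is a minimal standalone sketch of that line format, not part of the patch; the sample documents and plain-text output paths are hypothetical stand-ins for the textacy corpus and the .gz files used above.

    import json

    # hypothetical (categoryName, preprocessed text) pairs standing in for de_corpus
    docs = [
        ("lan", "netzwerkdose defekt raum pruefen"),
        ("neuanschluss", "telefon neuanschluss fax antrag"),
        ("lan", "vpn verbindung intranet"),
    ]

    # ids in order of first appearance, as the labelist/labeldict loop builds them
    labeldict = {}
    for category, _ in docs:
        labeldict.setdefault(category, len(labeldict))

    # unseen categories fall back to len(labeldict), hence one extra default topic
    n_topics = len(labeldict) + 1

    with open("labeldict.txt", "w") as f:    # stands in for models/tickets/labeldict.txt
        f.write(json.dumps(labeldict))

    with open("tickets.txt", "w") as f:      # stands in for models/tickets/tickets.gz
        for category, text in docs:
            f.write("[{0}] {1}\n".format(labeldict.get(category, len(labeldict)), text))

n_topics is the value the (currently commented-out) subprocess call passes to jgibblda.LDA via -ntopics, alongside -twords for top_topic_words.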
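
config.ini now groups the modeling parameters under [topicmodeling] (ngrams=(1,2), min_df, max_df, no_below, no_above, topicModel, top_topic_words, top_document_labels_per_topic), while main() in topicModeling.py still hardcodes ngrams = 1 and passes a literal 15 to jgibbsLLDA(). If those keys are meant to be consumed, note that ConfigParser returns plain strings, so the tuple-valued ngrams needs explicit parsing; a sketch under that assumption, reusing the section and key names from the diff:

    from ast import literal_eval
    from configparser import ConfigParser

    config = ConfigParser()
    with open("config.ini") as f:
        config.read_file(f)

    # ConfigParser yields strings; convert the tuple- and number-valued keys
    ngrams = literal_eval(config.get("topicmodeling", "ngrams"))        # -> (1, 2)
    min_df = config.getint("topicmodeling", "min_df")
    max_df = config.getfloat("topicmodeling", "max_df")
    top_topic_words = config.getint("topicmodeling", "top_topic_words")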
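
preprocessing.py now initialises THESAURUS, WORDS, LEMMAS, NOUNS, VORNAMEN, DE_STOP_WORDS and EN_STOP_WORDS as module-level tables and loads them in main(). The module-level global statements added in the patch have no effect on their own; the assignments in main() only rebind the module-level names that the token filters read if main() itself declares them global. A minimal sketch of that pattern with hypothetical data:

    LEMMAS = {}

    def lemmatizeWord(word):
        # falls back to the surface form when no lemma is known
        return LEMMAS.get(word, word)

    def main():
        global LEMMAS                       # without this, LEMMAS below is a local
        LEMMAS = {"gruessen": "gruss"}      # stands in for load_obj(path2lemmadict)
        print(lemmatizeWord("gruessen"))    # -> gruss

    if __name__ == "__main__":
        main()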