diff --git a/cleaning.py b/cleaning.py
index ad7b60c..dc5258d 100644
--- a/cleaning.py
+++ b/cleaning.py
@@ -49,15 +49,20 @@ def clean(stringstream):#, NOUNS):
     #string = textacy.preprocess.unidecode(string)
 
     # separate_words_on_regex:
-    string = " ".join(re.compile(r'[`\=~%^&*()_+\[\]{};\'"|]').split(string))
+    string = " ".join(re.compile(r'[`\=~%^&*()_+\[\]{};\'"|]').split(string)) #todo: maybe keep this? also: for header/footer consider English-language detection, address parsing and salutation/closing detection
 
     #normalize whitespace
     string = textacy.preprocess.normalize_whitespace(string)
+
+    #todo textacy.preprocess.remove_accents(text, method=u'unicode')[source]
+
+
+
     #remove linebreaks
-    string = re.sub(r'[\n]', " ", string)
+    string = re.sub(r'[\n]', " ", string) #todo: can/should this go? line breaks (paragraphs) may carry meaning
 
-    string = replaceRockDots(string)
+    string = replaceRockDots(string) #todo: belongs in normalize
 
     """
     # bug: correcting capitalization via the noun list does not work properly, since words inside a sentence get changed as well.
@@ -91,13 +96,70 @@ def clean(stringstream):#, NOUNS):
 
         yield string
 
+
+def processDictstream_v2(dictstream, keys_to_clean):
+    for dic in dictstream:
+
+        result = {k: re.sub(r'[.!?]', "", normalize_str(v).lower()) if k in keys_to_clean else v for k, v in dic.items()}
+        yield result
+
+
+
+
+
+
+def processDictstream(dictstream, funcdict, parser):
+    """
+
+    :param dictstream: dict-gen
+    :param funcdict:
+            clean_in_meta = {
+            "Solution":funclist,
+            ...
+            }
+
+    :param parser: spacy-parser
+    :return: dict-gen
+    """
+    for dic in dictstream:
+        result = {}
+        for key, value in dic.items():
+
+            if key in funcdict:
+
+                doc = parser(value)
+                tokens = [tok for tok in doc]
+                funclist = funcdict[key]
+
+                tokens = filterTokens(tokens, funclist)
+
+                result[key] = " ".join([tok.lower_ for tok in tokens])
+
+
+            else:
+                result[key] = value
+        yield result
+
+def filterTokens(tokens, funclist):
+    # in: tokenlist, funclist
+    # out: tokenlist
+    for f in funclist:
+        tokens = list(filter(f, tokens))
+
+    for tok in tokens:
+        if tok.pos_ == "NOUN":
+            x = 0  # note: no effect, presumably a leftover debugging hook
+
+    return tokens
+
+
 ##################################################################################################
 
 corpus_de_path = FILEPATH + config.get("de_corpus", "path")
 
 
-def cleanCorpus(corpus):
+def cleanCorpus(corpus,clean_in_meta):
     logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))
 
     """
@@ -122,9 +184,12 @@ def cleanCorpus(corpus):
 
     # Actually clean the corpus
     cleaned_corpus = textacy.Corpus(parser)
+
     cleaned_corpus.add_texts(
         clean(corpus2Text(raw_corpus)),
-        corpus2Meta(raw_corpus)
+        #processDictstream(corpus2Meta(cleaned_corpus), clean_in_meta, parser=parser)
+
+        processDictstream_v2(corpus2Meta(raw_corpus),clean_in_meta)
     )
 
 
@@ -143,14 +208,26 @@ def cleanCorpus(corpus):
 
     return cleaned_corpus
 
 
+def removePOS(pos_list):
+    return lambda tok: tok.pos_ not in pos_list
+
+
 def main(corpus):
     start = time.time()
 
+    clean_in_meta = {
+        "Solution": [removePOS(["SPACE"])],
+        "Subject": [removePOS(["SPACE", "PUNCT"])],
+        "categoryName": [removePOS(["SPACE", "PUNCT"])]
+    }
 
-    cleaned_corpus = cleanCorpus(corpus)
+    clean_in_meta = ["Subject", "categoryName" ]
+
+
+    cleaned_corpus = cleanCorpus(corpus, clean_in_meta)
diff --git a/config.ini b/config.ini
index 837b39a..5deec1f 100644
--- a/config.ini
+++ b/config.ini
@@ -67,7 +67,7 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI
 autocorrect = false #true
-custom_words=aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderung,test,erwuenscht,antragsteller,bemerkung,tu,uni,prof,bezeichnung,gramm,type,eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok,januar,februar,maerz,april,mai,juni,juli,august,september,oktober,november,dezember +custom_words=problem,without,aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderung,test,erwuenscht,antragsteller,bemerkung,tu,uni,prof,bezeichnung,gramm,type,eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok,januar,februar,maerz,april,mai,juni,juli,august,september,oktober,november,dezember diff --git a/corpi/corpi_vor_besprechung/labeled.txt b/corpi/corpi_vor_besprechung/labeled.txt new file mode 100644 index 0000000..ea4d4ec --- /dev/null +++ b/corpi/corpi_vor_besprechung/labeled.txt @@ -0,0 +1,24 @@ +[neuanschluss] telefon dezernat uniaccount pavillon telefonbuch mail amt telefon benoetigt telefon speicherpl.f.die ausreichend moebel aufgestellt.weder +[neuanschluss] telefon dekanat bereich studium lehre uniaccount ct g2 telefonbuch mail amt anschluss faxgeraet rufnummer +[lan] service netzwerkanschluss kollegen intranet mail kollegen leitung vpn verbindung intranet netzwerk wlan netz netzwerk mitteilen wenden +[betrieb] support unicard browser service wochen +[elektronisches telefonbuch] telefon umzug astrid.gramm@tu-dortmund.de dezernat uniaccount dezernat telefonbuch mail +[verwaltung] laptop klaerung dezernat organisationsentwicklung mail +[umzug] telefon umzug uniaccount chemie telefonbuch mail anschluss fakultaet berufung telefonanlage gesondert b. 
+[umzug] telefon umzug uniaccount chemie telefonbuch mail anschluss fax anschluss +[umzug] telefon umzug uniaccount chemie telefonbuch mail anschluss hochschullehrer fakultaet telefonanlage anschlusses +[lsf] service semester semester stelle studium wahlmodulen experimentelle webseite erstellen vision stufen kommilitonen informatik fakultaet erfinden sichtweise daten boss schaffen aspekt studienplan uebersicht aktuelle ansicht semester modul stufe veranstaltungsinformationen studiengang fakultaet dozent semester turnus orientierung anhand semester automatisiert erstellen datenbank dozent turnus uhrzeit beschreibung boss programmierschnittstelle datenabfrage login benutzername passwort einzelne benutzer erlaubnis liste boss nummer format xml pdf gespraech klaeren +[sap] mitarbeiter schulung berechtigung budget kennung ort schulung +[gastaufenthalt] damen pruefung email adresse honorarprofessor ing vorlesungen bereich satellitenkommunikation fakultaet elektrotechnik informationstechnik mitarbeiter lehrstuhl hochfrequenztechnik lehrstuhl email adresse sinnvolle kommunikation hilfsmittel ews sinne email adresse +[sap] schulung dezernat zuhoeren berechtigung budget lage account +[fk raumplanung 09] pc modus +[sap] kolleginnen kollegen schulung anfaenger verwendung feld dezentral zugreifen uebersicht alternative budget berechtigung transaktionen fb60 dezernat sekretariaten kuerze fk05 statistik einsatz korrektur kurze rueckmeldung freischaltung einrichtungen +[fiona] mitarbeiter fachgebiet regionalsoziologie fakultaet raumplanung fachgebiet alte homepage homepage erscheinungsbild aktuell kenne programm umstellung einstiegsschulung vornehmen besprechen taeglich buero erreichen bescheid weber gb iii raumplanung waehlen mithilfe +[fk 12] hi zugang fk12-adresse aendern +[uniaccount] meldung zugangsdaten passwort rechtzeitig zugang problemlos account +[elektronisches telefonbuch] telefon umzug lehrstuhl uniaccount physik telefonbuch mail nr mitnehmen +[abmeldung] telefon abmeldung uniaccount telefonbuch mail besitzer nr +[telefon] telefon geraet display defekt telefon wenden -5886 +[neuanschluss] telefon leitung uniaccount telefonbuch mail amt telefon auszubildende sekretariat azubi sekretariat +[uni mail] kenntnisnahme loesung alte passwort aendern erklaert passwort buero server absturz problemlos passwort unabhaengig telefonats service geloest erstmal rueckmeldung vorgehensweise kollegen geloest service antrag dienstreise passwort alte passwort mail dienstreisen antrag passwort system unding offenbar it sachverhalt systemausfall wochen reibungslos +[uni mail] webmailer text einfuegen \ No newline at end of file diff --git a/corporization.py b/corporization.py index 607e2b2..97ed909 100644 --- a/corporization.py +++ b/corporization.py @@ -60,8 +60,8 @@ def ticket_csv_to_DictStream(path2csv,content_collumn_name): for i, lst in enumerate(stream): if i == 0: for j, col in enumerate(lst): - if "icketNumb" in col: - col = "TicketNumber" + if "icketNumb" in col: #korrigieren der .csv todo wenn hier sowieso hardgecodet werden muss, dann gleich auch config.ini raus? 
+ col = "TicketNumber" metalist.append(str(col)) metaindices.append(j) metadata_template = dict( @@ -89,7 +89,7 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path") -def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0): +def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0): #todo bla das kann hier die main sein """ Use textacy to create a Corpus out of the ITMC-Ticket.csv @@ -105,17 +105,19 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printra path_csv_split = path2_csv.split("/") filename = path_csv_split[len(path_csv_split) - 1] - logprint("Corporization of {0} at {1}".format(filename, datetime.now())) + logprint("Corporization of {0}".format(filename))#, datetime.now())) raw_corpus = textacy.Corpus(lang) - ## add files to textacy-corpi, + ## add files to textacy-corpi, todo bla hier cleanen, dict nich vergessn raw_corpus.add_texts( - ticketcsv_to_textStream(path2_csv, content_collumn_name), + ticketcsv_to_textStream(path2_csv, content_collumn_name), ticket_csv_to_DictStream(path2_csv,content_collumn_name) ) + + # leere docs aus corpi kicken raw_corpus.remove(lambda doc: len(doc) == 0) @@ -147,4 +149,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/init.py b/init.py index 79c7bc2..2bfddcb 100644 --- a/init.py +++ b/init.py @@ -281,7 +281,7 @@ def build_words_for_spellchecking(path2words): def main(): start = time.time() - logprint("Init: {0}".format(datetime.now())) + logprint("Init")#: {0}".format(datetime.now())) ressources_path = FILEPATH + "ressources/" diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz index aad2762..cd6c7da 100644 Binary files a/java_LabledLDA/models/tickets/.others.gz and b/java_LabledLDA/models/tickets/.others.gz differ diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz index 3e1418c..926d375 100644 Binary files a/java_LabledLDA/models/tickets/.tassign.gz and b/java_LabledLDA/models/tickets/.tassign.gz differ diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz index ec06bac..441848d 100644 Binary files a/java_LabledLDA/models/tickets/.theta.gz and b/java_LabledLDA/models/tickets/.theta.gz differ diff --git a/java_LabledLDA/models/tickets/.twords.gz b/java_LabledLDA/models/tickets/.twords.gz index 1ab725d..279be75 100644 Binary files a/java_LabledLDA/models/tickets/.twords.gz and b/java_LabledLDA/models/tickets/.twords.gz differ diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz index c24683a..8b901a8 100644 Binary files a/java_LabledLDA/models/tickets/.wordmap.gz and b/java_LabledLDA/models/tickets/.wordmap.gz differ diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz index 83cc2ed..dac36e5 100644 Binary files a/java_LabledLDA/models/tickets/tickets.gz and b/java_LabledLDA/models/tickets/tickets.gz differ diff --git a/main.py b/main.py index 22d2d7c..e0e9550 100644 --- a/main.py +++ b/main.py @@ -27,13 +27,13 @@ start = time.time() # idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics # frage welche mitarbeiter bearbeiteten welche Topics? 
idee topics mit mitarbeiternummern erstzen # idee word vorher mit semantischen netz abgleichen: wenn zu weit entfernt, dann ignore - +# idee lda2vec # todo modelle testen -# todo ticket2kbkeys, subj, cats in init.py -logprint("main.py started at {}".format(datetime.now())) + +logprint("main.py started") init.main() @@ -45,9 +45,15 @@ logprint("") cleaned_corpus = cleaning.main(raw_corpus) logprint("") -pre_corpus = preprocessing.main(cleaned_corpus) +doc_term_matrix, id2term_dict = preprocessing.main(cleaned_corpus) logprint("") +topicModeling.textacyTopicModeling_v2(doc_term_matrix, id2term_dict) + + + + + """ ticket_number = "INC40484" raw="" @@ -89,11 +95,11 @@ logprint("") logprint("") """ -topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="llda") +#topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="llda") logprint("") -topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="lda") +#topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="lda") logprint("") diff --git a/miscellaneous.py b/miscellaneous.py index f4617fe..12188b4 100644 --- a/miscellaneous.py +++ b/miscellaneous.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from datetime import datetime import configparser as ConfigParser import csv import functools @@ -47,6 +48,7 @@ logging.basicConfig(filename=filename, level=level) def logprint(string, level="INFO"): """log and prints""" + string = "{}\t".format(datetime.now()) + str(string) print(string) if level == "INFO": logging.info(string) @@ -145,14 +147,14 @@ def sort_dictionary(dict): return sorted(dict.items(), key=operator.itemgetter(1)) -def normalize(string): - # replaceRockDots - string = re.sub(r'[ß]', "ss", string.lower()) - string = re.sub(r'[ö]', "oe", string) - string = re.sub(r'[ü]', "ue", string) - string = re.sub(r'[ä]', "ae", string) - string = textacy.preprocess.normalize_whitespace(string) - return string +def normalize_str(string): + """ + replaceRockDots + textacy.preprocess.normalize_whitespace + :param string: str + :return: str + """ + return textacy.preprocess.normalize_whitespace(replaceRockDots(string)) def deprecated(func): @@ -200,14 +202,18 @@ def corpus2Meta(corpus): for doc in corpus: yield doc.metadata -def savelabledCorpiLines(corpus,filepath): - +def savelabledCorpiLines_cat(corpus, filepath): textacy.fileio.write_file_lines(gen_labledLines(corpus), filepath=filepath) -def gen_labledLines(corpus): + + +def gen_labledLines(corpus, label ="categoryName"): for doc in corpus: # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi - yield "[" + doc.metadata["categoryName"] + "] " + doc.text + yield "[" + doc.metadata[label] + "] " + doc.text + + + def save_corpus(corpus, corpus_path, corpus_name): @@ -235,6 +241,8 @@ def gen_dicts(corpus): dict.update(doc.metadata) yield dict + + def multisub(subs, subject): #https://stackoverflow.com/questions/764360/a-list-of-string-replacements-in-python "Simultaneously perform all substitutions on the subject string." 
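The helpers added to miscellaneous.py above (normalize_str, gen_labledLines, savelabledCorpiLines_cat) all revolve around one plain-text exchange format that the Labeled-LDA steps later consume: one document per line, a label in square brackets, then the space-separated tokens (see corpi/corpi_vor_besprechung/labeled.txt earlier in this diff). A minimal, self-contained sketch of that format, using plain dicts in place of textacy docs; the function names and sample data here are illustrative, not taken from the repository:

import re


def normalize_str_sketch(s):
    # Mirrors miscellaneous.normalize_str: replaceRockDots plus whitespace normalization.
    s = s.lower()
    for src, dst in (("ß", "ss"), ("ö", "oe"), ("ü", "ue"), ("ä", "ae")):
        s = s.replace(src, dst)
    return re.sub(r"\s+", " ", s).strip()


def gen_labeled_lines_sketch(docs, label="categoryName"):
    # Mirrors gen_labledLines: yields one "[label] tok1 tok2 ..." line per document.
    for doc in docs:
        yield "[" + doc["metadata"][label] + "] " + doc["text"]


if __name__ == "__main__":
    docs = [
        {"metadata": {"categoryName": normalize_str_sketch("Neuanschluss")},
         "text": "telefon dezernat uniaccount telefonbuch mail"},
        {"metadata": {"categoryName": normalize_str_sketch("LAN")},
         "text": "netzwerkanschluss intranet vpn verbindung"},
    ]
    for line in gen_labeled_lines_sketch(docs):
        print(line)
    # [neuanschluss] telefon dezernat uniaccount telefonbuch mail
    # [lan] netzwerkanschluss intranet vpn verbindung

In the repository itself these lines are written by textacy.fileio.write_file_lines(gen_labledLines(corpus), filepath=...) inside savelabledCorpiLines_cat and read back via load_corp() before being handed to the LLDA class, so any change to the bracket prefix has to be mirrored on both the writing and the parsing side.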
diff --git a/preprocessing.py b/preprocessing.py index 6f7e0c2..5eaec20 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -1,15 +1,25 @@ # -*- coding: utf-8 -*- - +import matplotlib +matplotlib.use('Agg') from datetime import datetime import csv import sys + +from textacy import Vectorizer + +import draw1 from miscellaneous import * from datetime import datetime import time import textacy from scipy import * - +from scipy.stats import threshold +import draw +from spacy.tokens.token import Token as SpacyToken +from spacy.tokens.span import Span as SpacySpan +from topicModeling import jgibbsLLDAv2 import os +from ressources.iir.lda.llda import * csv.field_size_limit(sys.maxsize) FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" @@ -23,21 +33,29 @@ ressources_path = FILEPATH + "ressources/" config = ConfigParser.ConfigParser() with open(config_ini) as f: config.read_file(f) +""" +def init_glabal_vars(): + global THESAURUS, WORDS, LEMMAS, NOUNS, VORNAMEN, DE_STOP_WORDS, EN_STOP_WORDS, WHITELIST, FOOTER_FLAG, CURRENT_TICKET - - +THESAURUS = {} +WORDS= {} +LEMMAS= {} +NOUNS= {} +VORNAMEN= {} +DE_STOP_WORDS= {} +EN_STOP_WORDS= {} +WHITELIST= {} +CURRENT_TICKET = "" + + REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|]' #+r',.' REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?' +""" -global THESAURUS -global WORDS -global LEMMAS -global NOUNS -global VORNAMEN -global DE_STOP_WORDS -global EN_STOP_WORDS + +""" THESAURUS = {} WORDS= {} LEMMAS= {} @@ -122,11 +140,11 @@ def remove_first_names(): ############# strings - +""" def remove_addresses(string): pass # todo remove_addresses idee postal.parser und zu metadaten hinzufügen -def lemmatizeWord(word,lemma_dict=LEMMAS,n=5): +def lemmatizeWord(word,lemma_dict,n=5): for i in range(n): try: word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower() @@ -135,7 +153,7 @@ def lemmatizeWord(word,lemma_dict=LEMMAS,n=5): return word -def getFirstSynonym(word, thesaurus=THESAURUS, n=3): +def getFirstSynonym(word, thesaurus, n=3): for i in range(n): @@ -158,59 +176,7 @@ def getFirstSynonym(word, thesaurus=THESAURUS, n=3): print("THESAURUSFEHLER BEI: {}".format(word)) return word - -########################## Spellchecking ########################################## -# http://norvig.com/spell-correct.html -# http://wortschatz.uni-leipzig.de/en/download - -import re - - -def words(text): return re.findall(r'\w+', text.lower()) - -def P(word, N=sum(WORDS.values())): - "Probability of `word`." - return WORDS[word] / N - - -def correction(word): - "Most probable spelling correction for word." - return max(candidates(word), key=P) - - -def candidates(word): - "Generate possible spelling corrections for word." - return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word]) - - -def known(words): - "The subset of `words` that appear in the dictionary of WORDS." - return set(w for w in words if w in WORDS) - - -def edits1(word): - "All edits that are one edit away from `word`." - letters = 'abcdefghijklmnopqrstuvwxyz' - splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] - deletes = [L + R[1:] for L, R in splits if R] - transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] - replaces = [L + c + R[1:] for L, R in splits if R for c in letters] - inserts = [L + c + R for L, R in splits for c in letters] - return set(deletes + transposes + replaces + inserts) - - -def edits2(word): - "All edits that are two edits away from `word`." 
- return (e2 for e1 in edits1(word) for e2 in edits1(e1)) - - -def autocorrectWord(word): - try: - return correction(word) - except: - return word - - +""" ############# stringcleaning @@ -249,10 +215,7 @@ def preparse(stringstream): yield string def postparse(toktext): - """ - :param toktext: spacy.token - :return: string - """ + toktext = toktext.lower_ # remove_words_containing_topLVL @@ -271,18 +234,7 @@ def postparse(toktext): def processDictstream(dictstream, funcdict, parser): - """ - - :param dictstream: dict-gen - :param funcdict: - clean_in_meta = { - "Solution":funclist, - ... - } - - :param parser: spacy-parser - :return: dict-gen - """ + for dic in dictstream: result = {} for key, value in dic.items(): @@ -302,42 +254,39 @@ def processDictstream(dictstream, funcdict, parser): result[key] = value yield result - +""" ################################################################################################## -path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file") - -path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file") - -path2lemmadict = ressources_path + config.get("lemmatization","pickle_file") - -path2firstnameslist = ressources_path + config.get("firstnames","pickle_file") - - -path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file") - -path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file") - -custom_words = get_list_from_config("preprocessing", "custom_words") - - - -corpus_de_path = FILEPATH + config.get("de_corpus", "path") -de_plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.txt" - -corpus_en_path = FILEPATH + config.get("en_corpus", "path") - - +""" def extract_from_corpus(corpus): - """ - Extract from each doc from a corpus a string containing disired token_texts +""" +""" + Extract from each doc from a corpus a string containing token_texts :param corpus: textacy.Corpus :return: string-gen - """ +""" +""" + + weighting = 'tf' #'tfidf' + ngrams = 1 + min_df = 1 + max_df = 0.9 + + + ###### vectorize corpi + + vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) + + terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus) + doc_term_matrix = vectorizer.fit_transform(terms_list) + id2term = vectorizer.id_to_term #__getattribute__("id_to_term") + + + # WHITELIST erstellen. 
Enthält zumindest die evtuellen Topics @@ -370,13 +319,13 @@ def extract_from_corpus(corpus): for tok in doc: - """ - if tok.lower_ =="boss": - ents_boss.append(tok.ent_type_) +""" +#if tok.lower_ =="boss": +# ents_boss.append(tok.ent_type_) - if tok.lower_ =="sap": - ents_sap.append(tok.ent_type_) - """ +#if tok.lower_ =="sap": +# ents_sap.append(tok.ent_type_) +""" # wenn in whitelist, direkt übernehmen if tok.lower_ in WHITELIST: @@ -402,7 +351,7 @@ def extract_from_corpus(corpus): - if tok.pos_ in ["NOUN"] \ + if tok.pos_ in ["NOUN","PROPN"] \ or tok.ent_type_ in ["NORP","FACILITY","ORG","PRODUCT","WORK_OF_ART","LOC"]: #or tok.dep_ == "ROOT": # or tok.lower_ in NOUNS \ #,"PERSON"] \ @@ -411,12 +360,14 @@ def extract_from_corpus(corpus): toktext = lemmatized_word - # hauptsynonym bilden idee zwar das Huaptsyn bilden und zählen aber die originalen wörter in den llda algo geben - """ - first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS) - if first_synonym is not None or first_synonym != '': - toktext = first_synonym if len(first_synonym.split()) == 1 else toktext - """ + use_thesaurus = False + if use_thesaurus: + # hauptsynonym bilden idee zwar das Huaptsyn bilden und zählen aber die originalen wörter in den llda algo geben + #fehler ergibt nonsens frage werden die gesamtzahl an termen signifikant reduziert? + first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS) + if first_synonym is not None or first_synonym != '': + toktext = first_synonym if len(first_synonym.split()) == 1 else toktext + result.append(toktext) @@ -425,66 +376,657 @@ def extract_from_corpus(corpus): yield " ".join(result) + #return doc_term_matrix + + """ - print(list(set(ents_sap))) - ['', 'ORG', 'PERSON', 'LOC'] - print(list(set(ents_boss))) - ['', 'ORG', 'PERSON', 'LOC'] + +#print(list(set(ents_sap))) +# ['', 'ORG', 'PERSON', 'LOC'] + +#print(list(set(ents_boss))) +# ['', 'ORG', 'PERSON', 'LOC'] + + + + + +# LOAD FROM CONFIG + +path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file") +path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file") +path2lemmadict = ressources_path + config.get("lemmatization","pickle_file") +path2firstnameslist = ressources_path + config.get("firstnames","pickle_file") +path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file") +path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file") +custom_words = get_list_from_config("preprocessing", "custom_words") + +corpus_de_path = FILEPATH + config.get("de_corpus", "path") +de_plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.txt" + +corpus_en_path = FILEPATH + config.get("en_corpus", "path") + + + + + + + +def norma(token_or_span): + + if isinstance(token_or_span, SpacyToken): + return normalize_token(token_or_span) + + elif isinstance(token_or_span, SpacySpan): + result= ' '.join(normalize_token(subtok) for subtok in token_or_span) + return textacy.preprocess.normalize_whitespace(result) + + else: + msg = 'Input must be a spacy Token or Span, not {}.'.format(type(token_or_span)) + raise TypeError(msg) + + +def normalize_token(tok): + global CURRENT_TICKET, FOOTER_FLAG + + + + # check if CURRENT_TICKET is assigned + try: + CURRENT_TICKET + FOOTER_FLAG + except NameError: + CURRENT_TICKET = tok.doc + FOOTER_FLAG = False + + + + #aktuell verarbeitendes Doc merken. 
wenn neues Ticket, dann Footer_flag auf False setzen + if tok.doc != CURRENT_TICKET: + FOOTER_FLAG = False + CURRENT_TICKET = tok.doc + + # wenn in whitelist, direkt übernehmen + if tok.lower_ in WHITELIST: + return tok.lower_ + + + # ignore header, urls , emails, stop, vornamen, blacklisted + lemmatized_word = lemmatizeWord(tok.text,lemma_dict=LEMMAS) + if lemmatized_word.lower() in ["sehr", "geehrt", "herr" ,"herrn", "herren", "dame" , "damen", "liebe","lieben", "hallo", "guten", "tag","ehre","hi"] \ + or tok.like_url \ + or tok.like_email \ + or tok.is_stop \ + or tok.is_punct \ + or tok.lower_ in DE_STOP_WORDS \ + or tok.lower_ in VORNAMEN\ + or tok.lower_ in BLACKLIST: + return "" #todo bla das alles hier kommt zwischen to_terms_list und fitransform + + + + # cut after footer + + if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]: # fehler schneidet bei zB INC40506 das meiste weg + FOOTER_FLAG = True + + + + if ( tok.pos_ in ["NOUN","PROPN"] or tok.ent_type_ in ["NORP","FACILITY","ORG","PRODUCT","WORK_OF_ART","LOC"] ) and not FOOTER_FLAG: + + #or tok.dep_ == "ROOT": + # or tok.lower_ in NOUNS \ #,"PERSON"] \ + toktext = tok.lower_ + + + toktext = lemmatized_word + + use_thesaurus = False + if use_thesaurus: + # hauptsynonym bilden idee zwar das Huaptsyn bilden und zählen aber die originalen wörter in den llda algo geben + #fehler ergibt nonsens frage werden die gesamtzahl an termen signifikant reduziert? + first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS) + if first_synonym is not None or first_synonym != '': + toktext = first_synonym if len(first_synonym.split()) == 1 else toktext + + + + return toktext + else: + return "" + + + + +def preprocessCorpus(cleaned_corpus): + logprint("Preprocess {0}_corpus at {1}".format(cleaned_corpus.lang, datetime.now())) + global THESAURUS, WORDS, LEMMAS, NOUNS, VORNAMEN, DE_STOP_WORDS, EN_STOP_WORDS, WHITELIST, BLACKLIST + + + + + + + weighting = 'tf' #'tfidf' + ngrams = (1,2) + min_df = 1 + max_df = 0.3 + min_label_freq = 1 + + + + + + + + WHITELIST = ["boss", "sap", "firefox"] # todo autogenerierung von whitelist + kb_cats = ['eldorado', 'cws_confluence', 'wsus', 'mail groupware', 'd.3 dms', 'serviceportal', 'softwarelizenzen', + 'sophos', 'webserver', 'sap', 'ftp server', 'dhcp', 'tonerboerse', 'mailalias', 'arbeitsplatzsupport', + 'mediendienste', 'mailverteiler', 'uni mail', 'basis app', 'videoschnitt', 'DEFAULT', 'verwaltung', + 'matrix42_hilfe', 'hoersaaluebertragung', 'redmine', 'uniflow', 'keine rueckantwort', 'pools', 'leitung', + 'netze', 'konteneinsicht', 'kennwort aenderung', 'datanet', 'neuanschluss', 'semesterticket', 'asknet', + 'veranstaltungen', 'housing', 'fk 16', 'fiona', 'betrieb', 'vorlagenerstellung', + 'studierendensekretariat', 'pvp', 'mobilfunkvertraege', 'ausleihe', 'web', 'spam phishing', 'sap urlaub', + 'evaexam', 'vorlesungsaufzeichnung', 'firewall betreuung', 'ub basis it', 'virtuelle desktops citrix', + 'fk15', 'virtuelle server', 'lizenzserver', 'elektronisches telefonbuch', 'joomla itmc website', + 'weiterentwicklung', 'serversupport', 'wlan', 'kurse', 'technik', 'raumkalender', 'backup tsm', + 'haustechnik', 'voicemail box', 'facility', 'unicard ausgabe', 'mdm mobile device management', + 'entwicklung', 'webgestaltung', 'unicard sperrung', 'forensic', 'basis applikationen', + 'overhead projektor', 'plagiatserkennung', 'uniaccount zugangsdaten', 'zentrale webserver', 'webmailer', + 'fk12 webauftritt', 'plotter', 'campus management', 'ub_stoerungen', 'rundmail', 
'telefon', + 'raumbuchung', 'fk12 migration', 'dienstreise', 'hardware', 'it sicherheit sic', 'hochleistungsrechnen', + 'unicard', 'sos', 'benutzerverwaltung_probleme', 'confluence', 'vpn', 'zhb', 'campus app', + 'itmc_aufgaben', 'sicherheit', 'schulungsraum verwaltung', 'unicard produktion', 'schulung', 'video', + 'dokoll support', 'sd', 'servicedesk', 'v2 campus app feedback', 'lido', 'app feedback', + 'ibz raumbuchung', 'hcm stammdaten', 'itmc_stoerungen', 'boss service desk', 'exchange nutzung', + 'office', 'rektorat -buero', 'bestellung', 'moodle', 'fk raumplanung 09', 'aenderung', 'neuausstattung', + 'benutzerverwaltung', 'rechnerraeume', 'designentwicklung', 'fk 12', 'werkstoffe lehrstuhl bauwesen', + 'server storage', 'beantragung', 'visitenkartenproduktion', 'gastaufenthalt', 'telefonkonferenzen', + 'raumbuchungssysteme', 'fk14_test', 'e mail dienste', 'grafik', 'ews', 'itmc schulungsraeume', 'tsm', + 'softwareverteilung', 'beamer', 'lizenzmanagement', 'fileserver einrichtung', + 'redmine projektverwaltung', 'service desk itmc', 'pruefungsmanagement', + 'prozess- und projektmanagement', 'formulare antraege', 'namensaenderung', 'verkauf', 'software', + 'itmc medienraeume ef50', 'zugangsdaten', 'medientechnik', 'lan', 'veeam', 'unicard redaktionsteam', + 'changes', 'service portal', 'limesurvey', 'dns', 'dokoll pvp', 'uhren', 'nrw ticket', 'itmc_als', + 'linux bs', 'werkvertraege', 'blogs wikis foren', 'test', 'abmeldung', 'desktop & basisdienste', + 'telefonzentrale', 'siport zugangskontrolle', 'antrag auf rechnungserstellung', 'verschiedene aufgaben', + 'kundenserver', 'medienraeume ef50', 'videokonferenzen', 'benutzungsverwaltung', + 'mailverteiler exchange', 'lsf', 'telefonabrechnung', 'werkstaette', 'uniaccount', 'outlook_einrichtung', + 'itmc webauftritt', 'zertifikate server dfn', 'allgemein', 'umzug', 'service portal redaktion', 'pos', + 'beschaffung', 'boss', 'hacker angriff', 'software entwicklung', 'cd dvd produktion', 'sam spider', + 'viren', 'kursplanung', 'itmc pools', 'kms', 'e learning'] + kb_keys = ['zugriff_onlinedienste_rueckmeldung', 'uniaccount', 'freischaltung', 'asknet', 'eduroam', + 'donnerstagsmail namensaenderung', 'asiexception', 'lsf', 'kundenantwort', 'chip', 'unitymedia', + 'citavi', 'fehler', 'windows beziehen', 'wlan', 'ipv6', 'freischaltung verzoegert', 'betrag', + '"defekte karte"', 'risse', 'laden', 'sap portal anderer modus', 'goeke', + 'informationen des itmc zum einsatz', 'transport wurde durchgefuehrt.', 'wi-fi', 'unicard_auszahlung', + 'ausleihe', 'unimail', 'uni-account', 'unicard', 'beantragung', 'nrw-ticket', 'printservice', 'dms', + 'ip6', 'transport und beschreibung zum transportauftrag !', 'wlan passwort', + 'dokumentenmanagementsystem', 'webmailer', 'vpn', 'repository', 'unicard', 'projekte', 'eingeschrieben', + 'unicard abholung oeffnungszeiten', 'd3', 'beantragung', 'app tu-dortmund feedback', 'semester ticket', + 'redmine', 'git', 'geldkarte', 'outlook_exchange', 'spam standardmeldung phishing', + 'automatische aktualisierung der selbst angelegten kontakte in outlook', '"beschaedigte unicard"', + 'elektronische telefonbuch', 'boss', 'wwrite', 'DEFAULT', 'anyconnect', 'wifi'] + kb_subjs = ['sd_office 365 plus support', 'citavi_lizenzschluessel_nicht bekommen', 'uni card', + 'sd_office 356 plus bestellung', 'sd_gastaufenthalter', + 'sd_outlook kontakte automatische aktualisierung', 'benutzer zum redmine hinzufuegen', + 'sd_matlab lizenzdatei pc-pools', 'sd_tu-app feedback standard', 'vpn_ipsec_stoerung', + 'vpn verbindung fuer 
unitymedia kunden', 'ub_prod_abholung_ abholfristen_benachrichtigungen', + 'einrichtung des eduroam netzwerks', 'sd_webmailer_threadanzeige und weiterleitung', + 'sd_wlan passwort setzen', 'ub_prod_namenskorrektur_student', 'sd_unimail imap_pop3', + 'sd_outlook_in_exchange_einbinden', 'sd_keine rueckantwort kunde', 'sd_asknet_und_dreamspark', + 'sd_heirat_namensaenderung_student', 'bd_unicard_nicht_eingeschrieben', 'wlan', + 'sd_telefonbuch_prof_eintragung', 'change produktiv nehmen chn00146 - transport e01k909284', + 'ungueltiges ticket siehe journal', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', + 'd.3 client installation', 'unicard_restbetrag_auszahlung', 'cm_asiexception', 'sd_origin_workaround', + 'sd_vpn_aktualisierung', 'problem mit der beantragung von der unicard', + 'sd_unicard fehlerhafte geldbuchung', 'sd_login tu portals english', 'sd_gmx_web.de', + 'studierendenausweis', 'sd_citavi', 'sd_fk9 test', 'sd_webmailer_thread-anzeige', + 'bd_unicard_geldkarte_laden', 'ub_unicard_unicard mit vollmacht abholen', + 'sd_stellenausschreibung schwarzes brett', 'freischaltung uniaccount', + 'sd_asknet_mitarbeiter_softwarebestellung', 'how to setup eduroam', 'sd_citavi bestellung', + 'unicard vergessen abzuholen und nicht mehr da', 'sd_unimail zu exchange', 'sd_diensthandy beschaffung', + 'sd_sap konteneinsicht antrag', 'sd_unicard_defekt', 'sd_webmailer einrichtung weiterleitung', + 'sd_kurs-angebote anmeldung', 'm42_dokumentationen_zu_neuen_ous', 'sd_sap_initialkennwort', + 'sd_sap_freischaltung ohne passwortaenderung', 'sd_telefonbuch-eintrag_aenderung', 'sd_pruefungsamt', + 'sd_phishing', 'apps_dms-passwort d.3', 'sd_goeke drucker', 'sd_sap_dienstreise', + 'unicard nochmal beantragen', 'sd_outlook anmeldung gestoert', 'sd_citavi_support', 'DEFAULT', + 'sd_geraeteausleihe', 'sd_account_abmelden', 'sd_uniaccount freischaltung verzoegert englisch', + 'ub_beschaedigte unicard', 'sd_gleitzeitanlage_dez3_stoerung', 'transportdurchfuehung', + 'sd_sap_initialkennwort_englisch', 'sd_antwort_phishingmail', 'sd_namensaenderung mitarbeiter', + 're: elektroarbeiten fuer leitsystem 2. und 3. 
obergeschoss', 'lsf freischaltung als mitarbeiter/in', + 'ub_unicard_spaetere abholung moeglich?', 'sd_antrag funktionale mailadresse', 'sd_apple-on-campus', + 'sd_office365_asknet', 'sd_sophos download', 'sd_freischaltung uniaccount verzoegert', + 'ub_unicard_zusendung der karte moeglich?', 'ohne betreff', 'sd_immatrikulationsbescheinigung_portal', + 'sd_studisek_buchung_semesterbeitrag', 'sd_studisek_englisch', + 'probleme mit der namensaenderung/ neue unicard', 'sd_telefonbuch, neues system', + 'fehlender eintrag im elektronischen telefonbuch', 'sd_boss_notenverbuchung', + 'sd_laufzeit unimail account', 'sd_semesterticket', 'sd_kontakt_asknet', 'windows 10', + 'sd_login_tu_portale', 'ub_geldchip-problem bei uc', 'sd_zugriff_onlinedienste_rueckmeldung', + 'sd_wlan-gastkonto', 'sd_tu_app_keine internetverbindung', 'sd_uniaccount_ehemalige_passwortaenderung', + 'sd_verlust/antrag unicard', 'sd_sap_konteneinsicht_ workaround', 'apps_redmine_repository', + 'sd_itmc kurse anmeldebestaetigung', 'sd_mail_als_anhang', 'bd_unicard_chip_defekt', + 'probleme mit unicard', 'ub_unicard_abholungszeiten', 'sd_falsche_personendaten', + 'sd_uniaccount_ehemalige_studierende', 'sd_vpn anleitungen', 'sd_kurs-angebote itmc', 'sd_studisek', + 'sd_login tu portale', 'sd_immatrikulationsbescheigung_druckfehler', 'ub_drucker kopierer', + 'sd_vpn_temporaerer fehler ub', 'sd_spss_online_bestellung', 'sd_dreamspark', + 'sd_unicard_gesperrte unicard entsperre', 'sd_boss-bescheinigung', 'bd_goeke_allgemein', + 'sd_uniaccount_passwortaenderung', 'sd_namensaenderung_englisch', 'sd_email_namensaenderung', + 'bd_unicard_freigabe_beantragung', 'spam ohne tu bezug', 'sd_internationaloffice', + 'sd_tu-app feedback_englisch', 'cm_lsf-boss_freischaltung', 'sd-e-mail_adresse_funktional_beantragen', + 'sd_vpn_webvpn', 'sd_vpn_probleme_mit_unitymedia', 'sd_plotauftrag_zv', 'sd_beantragung_unicard', + 'sd_antworten_korrekt', 'ub_prod_neue unicard bei beschaedigung', + 'sd_telefonantrag_aenderung_neuantrag', 'sd_wlan passwort englisch', 'sd_aktivierung uniaccount', + 'sd_spam e-mail bekannt meldung', 'sd_wlan_beratung', 'ub_namensaenderung', + 'sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)', 'sd_unicard_abholung', + 'sd_uniaccount_dauer freischaltung', 'sd_uniaccount activation englisch', 'sd_unicard_max_laufzeit', + 'sd_unicard_workaround_bestellung', 'sd_sap_firefox_esr', 'sap portal "im anderen modus geoeffnet"', + 'sd_origin nur noch eine seriennummer', 'sd_login_unibib ub-it'] + + BLACKLIST = get_list_from_config("preprocessing", "custom_words") + WHITELIST = WHITELIST + kb_cats + kb_keys + kb_subjs + DE_STOP_WORDS = load_obj(path2DEstopwordlist) + VORNAMEN = load_obj(path2firstnameslist) + + LEMMAS = load_obj(path2lemmadict) + THESAURUS = load_obj(path2thesaurus_dict) + + + + + ###### vectorize corpus + logprint("vectorize corpus") + + vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) #fehler norma() kickt tokens raus, also bezieht sich min/max_df nicht mehr auf cleaned + + terms_list_gen = (doc.to_terms_list(ngrams=ngrams, as_strings=True, normalize=norma) for doc in cleaned_corpus) + + doc_term_matrix = vectorizer.fit_transform(terms_list_gen) + id2term = vectorizer.id_to_term + term2id = vectorizer.vocabulary + + logprint("corpus vectorized") + + + + + + + # write labeled_lines.txt + line_gen = gen_lines(doc_term_matrix,term2id,cleaned_corpus) + lines_txt = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/labled_lines.txt" + 
textacy.fileio.write_file_lines(line_gen, + filepath=lines_txt) + logprint("labled_lines.txt written") + + + + + + + #### ticket2label_cat + + # get all categrories + """ + labelist_cat = [] + for doc in cleaned_corpus: + category = normalize_str(doc.metadata["categoryName"]) + labelist_cat.append(category) + """ + labelist_cat = [ normalize_str(doc.metadata["categoryName"]) for doc in cleaned_corpus] + + + #create label_dict + labeldict_cat = create_labeldict(labelist_cat, min_label_freq = 1, add_default_label=True) + + # map tickets to labels + """ + ticket2label_cat = {} + for doc in cleaned_corpus: + ticketID = doc.metadata["TicketNumber"] + category_name_ = doc.metadata["categoryName"] + label_num = labeldict_cat.get(category_name_,labeldict_cat['DEFAULT']) + + ticket2label_cat.update({ticketID : label_num}) + """ + ticket2label_cat = { doc.metadata["TicketNumber"] : labeldict_cat.get(doc.metadata["categoryName"],labeldict_cat['DEFAULT']) for doc in cleaned_corpus } + + ################################################################################################## + + kb2keywords_dict, kb2subjects_dict, ticket2kbs_dict, ticket2keywords_dict, ticket2subjects_dict = create_ticket2sth_dicts() + + + #### ticket2label_keys + ticket2label_keys = create_ticket2label_dict(ticket2keywords_dict,cleaned_corpus) + + """ + labelist_keys = ticket2keywords_dict.values() + labelist_keys = flatten(labelist_keys) + + + labeldict_keys = create_labeldict(labelist_keys, min_label_freq = 1, add_default_label=True) + + ticket2label_keys = {} + for doc in cleaned_corpus: + ticketID = doc.metadata["TicketNumber"] + + + keywords = ticket2keywords_dict.get(ticketID, ['DEFAULT']) + + + label = "" + for kw in keywords: + label = label + str(labeldict_keys.get(normalize_str(str(kw)), labeldict_keys['DEFAULT'])) + " " + + + ticket2label_keys.update({ticketID : label}) + + """ + ################################################################################################## + + + + #### ticket2label_subjs + ticket2label_subjs = create_ticket2label_dict(ticket2subjects_dict,cleaned_corpus) + + """ + labelist_subjs = ticket2subjects_dict.values() + labelist_subjs = flatten(labelist_subjs) + + + labeldict_subjs = create_labeldict(labelist_subjs, min_label_freq = 1, add_default_label=True) + + ticket2label_subjs = {} + for doc in cleaned_corpus: + ticketID = doc.metadata["TicketNumber"] + + + keywords = ticket2subjects_dict.get(ticketID, ['DEFAULT']) + + + label = "" + for kw in keywords: + label = label + str(labeldict_subjs.get(normalize_str(str(kw)), labeldict_subjs['DEFAULT'])) + " " + + ticket2label_subjs.update({ticketID : label}) + """ + + + #### ticket2label_kb + ticket2label_kb = create_ticket2label_dict(ticket2kbs_dict,cleaned_corpus) + + """ + labelist_kbs = ticket2kbs_dict.values() + labelist_kbs = flatten(labelist_kbs) + + labeldict_kbs = create_labeldict(labelist_kbs, min_label_freq = 1, add_default_label=True) + + ticket2label_kb = {} + for doc in cleaned_corpus: + ticketID = doc.metadata["TicketNumber"] + + + keywords = ticket2kbs_dict.get(ticketID, ['DEFAULT']) + + + label = "" + for kw in keywords: + label = label + str(labeldict_kbs.get(normalize_str(str(kw)), labeldict_kbs['DEFAULT'])) + " " + + ticket2label_kb.update({ticketID : label}) + """ + + def relabele_lines(file,ticket2label_dict): + + line_gen = textacy.fileio.read_file_lines(file) + + for line in line_gen: + label = re.findall(r'\[(.*?)\]', line) + + new_label = "[ " + for lbl in label: + new_label = new_label + 
str(ticket2label_dict.get(str(lbl), "")).strip() + " " + + new_label = new_label + "] " + result = new_label + str(line.rpartition("]")[2]) + + # new_label = str([ticket2label_dict.get(str(lbl),"") for lbl in label]) + + # result = "[ " + new_label + " ] " + line.rpartition("]")[2] + # print(result) + + yield result + + lines_sub = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/labled_lines_sub.txt" + generator = relabele_lines(lines_txt, ticket2subjects_dict) + textacy.fileio.write_file_lines(generator, lines_sub) + + + labelset, corpus, labels = load_corp(lines_sub) + + K = 20 #Number of topics, ist egal, wird in implementierung nicht verwertet + alpha = 0.001 + beta = 0.001 + number_of_iterations = 10 + + + llda = LLDA(K, alpha, beta) + llda.set_corpus(labelset, corpus, labels) + + for i in range(number_of_iterations): + llda.inference() + + phi = llda.phi() + # print(llda.vocas) + + # for v, voca in enumerate(llda.vocas): + # print ','.join([voca]+[str(x) for x in llda.n_z_t[:,v]]) + # print(','.join([voca] + [str(x) for x in phi[:, v]])) + + + + + ################# termite plot ###################################### + topic_labels = list(labelset) + term_labels = list(llda.vocas) + + term_topic_weights = phi.transpose() + + threshmin = 0.05 + + + thresholded = threshold(term_topic_weights, threshmin=threshmin) + + draw.draw_termite(thresholded, topic_labels, term_labels, save="test.png") + #jgibbsLLDAv2("corpi/labled_lines.txt",ticket2kbs_dict,cleaned_corpus,"results") + + + + + return doc_term_matrix, id2term + + + +def create_ticket2label_dict(ticket2chunk_dict,corpus): + """ + Creates a dictionary to map a TicketNumber to a label + :param ticket2chunk_dict: e.g. { TicketNumber : KB_entries } + :return: {TicketNumber : label } + """ + + labelist = ticket2chunk_dict.values() + labelist = flatten(labelist) + + labeldict = create_labeldict(labelist, min_label_freq=1, add_default_label=True) + + ticket2label = {} + for doc in corpus: + ticketID = doc.metadata["TicketNumber"] + + keywords = ticket2chunk_dict.get(ticketID, ['DEFAULT']) + + label = "" + for kw in keywords: + label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " " + + ticket2label.update({ticketID: label}) + + return ticket2label + + + +def create_labeldict(labelist, min_label_freq=1, add_default_label=True): + + # nur die x häufigsten labels benutzen + labelist = [l for l in labelist if labelist.count(l) >= min_label_freq] + + in_labelist_ = {k: labelist.count(k) for k in labelist} # { label1 : 3 , label2 : 5, label3 : 1 } + labelist = sort_dictionary(in_labelist_) # [ (label3, 1), (label1, 3), (label2, 5) ] + labelist.reverse() # [ (label2, 5), (label1, 3), (label3, 1) ] + labeldict = {elem[0]: i for i, elem in enumerate(labelist)} # { label2 : 0, label1 : 1 , label3 : 2 } + if add_default_label: + if 'DEFAULT' not in labeldict.keys(): + labeldict.update({'DEFAULT': len(labelist)}) # { label2 : 0, label1 : 1 , label3 : 2 , DEFAULT : 3 } + return labeldict + + + +def create_ticket2sth_dicts(): + """ + Return: {str : [str] } + + kb2keywords_dict {'KBA10230': ['DEFAULT'], 'KBA10129': ['DEFAULT'], 'KBA10287': ['sd_ansys_informationen'], } len = 260 + kb2subjects_dict {'KBA10230': ['unicard nochmal beantragen'], 'KBA10129': ['sd_entsperrung unicard nach verlust/wiederfinden'], } len = 260 + ticket2kbs_dict {'INC44526': ['KBA10056'], 'INC67205': ['KBA10056'], } len = 4832 + ticket2keywords_dict {'INC44526': ['DEFAULT'], 'INC67205': ['DEFAULT'], 'INC71863': ['DEFAULT'], 
'INC44392': ['asknet'] } len=4832 + ticket2subjects_dict {'INC44526': ['sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)'], len=4832 + + """ + + # kb2keywords_dict / kb2subjects_dict --> {str : [str]} + + kb2keywords_dict = {} + kb2subjects_dict = {} + + kb_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";") + next(kb_gen, None) # skip first line "ArticleID";"Subject";"Keywords";... + + for line in kb_gen: + + kb_id = line[0] + + subject = normalize_str(line[1]) + + keywords = [normalize_str(x) for x in str(line[2]).split(",")] + + if kb_id not in kb2keywords_dict.keys(): + kb2keywords_dict[kb_id] = keywords if keywords != [''] else ["DEFAULT"] + else: + kb2keywords_dict[kb_id] = kb2keywords_dict[kb_id] + keywords + + if kb_id not in kb2subjects_dict.keys(): + kb2subjects_dict[kb_id] = [normalize_str(subject) if subject != [''] else "DEFAULT"] + else: + kb2subjects_dict[kb_id].append(normalize_str(subject)) + + # ticket2kbs_dict --> {str : [str]} + ticket2kbs_dict = {} + kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";") + next(kb2ticket_gen, None) # skip first line"TicketNumber";"ArticleID" + + for line in kb2ticket_gen: + ticket_id = line[0] + kb_id = line[1] + + if ticket_id not in ticket2kbs_dict.keys(): + ticket2kbs_dict[ticket_id] = [kb_id] + else: + ticket2kbs_dict[ticket_id].append(kb_id) + + # ticket2keywords --> {str:[str]} + ticket2keywords_dict = {} + + for ticket_id, kb_ids in ticket2kbs_dict.items(): + + if ticket_id not in ticket2keywords_dict.keys(): + ticket2keywords_dict[ticket_id] = [] + + for kb_id in kb_ids: + ticket2keywords_dict[ticket_id].append(kb2keywords_dict[kb_id]) + + ticket2keywords_dict[ticket_id] = flatten(ticket2keywords_dict[ticket_id]) + + # ticket2subjects --> {str:[str]} + ticket2subjects_dict = {} + + for ticket_id, kb_ids in ticket2kbs_dict.items(): + + if ticket_id not in ticket2subjects_dict.keys(): + ticket2subjects_dict[ticket_id] = [] + + for kb_id in kb_ids: + ticket2subjects_dict[ticket_id].append(kb2subjects_dict[kb_id]) + + ticket2subjects_dict[ticket_id] = flatten(ticket2subjects_dict[ticket_id]) + + + """ + count_dict = {} + for v in ticket2kbs_dict.values(): + for kb in v: + if kb in count_dict.keys(): + count_dict[kb] +=1 + else: + count_dict[kb] = 1 + + sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1)) + + for k,v in sorted_dict: + subs = kb2subjects_dict[k] + keys = kb2keywords_dict[k] + print(subs, keys , v) # frage wieviele tickets pro topic? 
+ + print("kb_entrys used: {}".format(len(sorted_dict))) # frage wie viele kb_entry's insg genutzt?: 155 """ + return kb2keywords_dict, kb2subjects_dict, ticket2kbs_dict, ticket2keywords_dict, ticket2subjects_dict + #labelist = ticket2keywords_dict.values() -def preprocessCorpus(corpus, clean_in_meta): + #labelist = flatten(labelist) + #labelist = list(set(labelist)) + #labeldict = {k: v for v, k in enumerate(labelist)} - logprint("Preprocess {0}_corpus at {1}".format(corpus.lang, datetime.now())) + ############################################################################################## - preCorpus_name = corpus.lang + "_pre" +def gen_lines(doc_term_matrix,term2id,corpus,label="TicketNumber"): - clean_corpus = corpus + for i,doc in enumerate(corpus): + line = "[" + doc.metadata[label] + "]" - parser = corpus.spacy_lang + for term, id_ in term2id.items(): + if doc_term_matrix[i, id_] != 0: + term = term if len(term.split()) == 1 else "_".join(term.split()) - pre_corpus = textacy.Corpus(parser) + line = line + " " + term + if len(line) != 0: + yield line + else: + continue - ## process and add files to textacy-corpi, - pre_corpus.add_texts( - #processContentstream(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser), - extract_from_corpus(clean_corpus), - processDictstream(corpus2Meta(clean_corpus), clean_in_meta,parser=parser) - ) - - # idee labeled_lines.txt enthählt bigramme mit unterstrich - # todo preCorpus weg. llda bekommt labaled_lines.txt und lda doctermamtrix - - - # leere docs aus corpi kicken - pre_corpus.remove(lambda doc: len(doc) == 0) - - - #save corpus - save_corpus(corpus=pre_corpus, corpus_path=corpus_de_path, corpus_name=preCorpus_name) - - - #save corpus as labled, plain text - savelabledCorpiLines(pre_corpus, de_plainpath) - - labled_lines ="" - return pre_corpus - - -def main(corpus): +def main(cleaned_corpus): start = time.time() @@ -505,38 +1047,42 @@ def main(corpus): #todo STELLSCHRAUBE remove_short_words() ] - """ + clean_in_meta = { "Solution": [removePOS(["SPACE"])], "Subject": [removePOS(["SPACE", "PUNCT"])], "categoryName": [removePOS(["SPACE", "PUNCT"])] } + """ - - pre_corpus = preprocessCorpus(corpus, clean_in_meta) - - - - - #for i in range(5): - # printRandomDoc(pre_corpus) + doc_term_matrix, id2term_dict = preprocessCorpus(cleaned_corpus) end = time.time() logprint("Time Elapsed Preprocessing:{0} min".format((end - start) / 60)) - return pre_corpus + return doc_term_matrix, id2term_dict + + + + + + if __name__ == "__main__": - corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/",corpus_name="de_clean") + logprint("Load Corpus...") + corpus_name = "de_clean_small" # _small + cleaned_corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/", corpus_name=corpus_name) - main(corpus) + logprint("... 
Done") + + main(cleaned_corpus) diff --git a/test.py b/test.py index 0ebd69b..e342b37 100644 --- a/test.py +++ b/test.py @@ -14,17 +14,220 @@ from scipy import * import json import draw """ +import matplotlib +matplotlib.use('Agg') import os import time + +from textacy import Vectorizer +from itertools import tee start = time.time() from gensim.models import Doc2Vec - +from datetime import datetime import textacy FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" from miscellaneous import * - +from ressources.iir.lda.llda import * +import numpy as np import re +import draw +# http://universaldependencies.org/u/pos/ + +#corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/", corpus_name="de_clean") + +# array of zeros and ones interleaved +lrg = np.arange(2).reshape((2,-1)).repeat(1000000,-1).flatten() + +flt = lrg[lrg==0] + +flt = np.array(filter(lambda x:x==0, lrg)) + +lines_txt = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/corpi_vor_besprechung/labeled.txt" + +labelset, corpus, labels = load_corp(lines_txt) + +llda = LLDA(20, 0.001, 0.001) +llda.set_corpus(labelset, corpus, labels) + +for i in range(10): + llda.inference() + +phi = llda.phi() +#print(llda.vocas) + +#for v, voca in enumerate(llda.vocas): + # print ','.join([voca]+[str(x) for x in llda.n_z_t[:,v]]) + #print(','.join([voca] + [str(x) for x in phi[:, v]])) + + + + +################# termite plot ##################################################################### +topic_labels = list(labelset) +term_labels = list(llda.vocas) + + +term_topic_weights = phi.transpose() + +threshmin = 0.005 + + + +from scipy.stats import threshold + +thresholded = threshold(term_topic_weights, threshmin=threshmin) + + + + +draw.draw_termite(thresholded, topic_labels, term_labels, save="test.png") + + + +exit() + + + + + +KBA10184_text = "wenn Sie Ihr UniAccount Passwort ändern möchten, k�nnen Sie dies im Service Portal unter folgendem Link durchführen: https://service.tu-dortmund.de/uniaccount-passwort" + + +corpus = textacy.Corpus("de") +preprocess_replace_urls = textacy.preprocess.replace_urls(KBA10184_text,replace_with="URL") +print(preprocess_replace_urls) + +preprocess_replace_urls = textacy.preprocess.transliterate_unicode(KBA10184_text) +print(preprocess_replace_urls) +#corpus.add_text(preprocess_replace_urls) + +to_corr = "Sehr geehrtes ITMC Service Team, seit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen " \ + "An das Intranet der BMP Mit der Dosennummer G1 303 04 12.05 G1 4 26-1 in Raum G1-426 nicht Mehr Zu funktionieren. " \ + "Ich Wuerde Sie daher bitten diese Mail An den zustaendigen Kollegen weiterzuleiten," \ + " Um Die Leitung Vielleicht Einmal Zu Pruefen. Des Weiteren haette Ich noch Eine Frage " \ + "bezueglich der Moeglichkeit zur Nutzung Einer VPN Verbindung Aus unserem Intranet" \ + " heraus Zu einem fremden Netzwerk. Dies ist zwar Ueber das WLAN-Netz moeglich, jedoch nicht Aus unserem Netzwerk heraus." + +to_corr = "Wichtiger Hinweis: Die Information in dieser E-Mail ist vertraulich. Sie ist ausschließlich für den Adressaten bestimmt. Sollten Sie nicht der für diese E-Mail bestimmte Adressat sein, unterrichten Sie bitte den Absender und vernichten Sie diese Mail. Vielen Dank. 
Unbeschadet der Korrespondenz per E-Mail, sind unsere Erklärungen ausschließlich final rechtsverbindlich, wenn sie in herkömmlicher Schriftform (mit eigenhändiger Unterschrift) oder durch Übermittlung eines solchen Schriftstücks per Telefax erfolgen." +corpus.add_text(to_corr) + + + + + + + +for doc in corpus: + for sent in doc.sents: + for tok in sent: + print(tok.text, tok.pos_, tok.dep_) + + +corpus.add_text("dfsaf fdsa fdsa",metadata={ "x" : "test"}) +corpus.add_text("iuzizi gbjh iuzit",metadata={ "x" : "testo"}) + + + +vectorizer = Vectorizer(weighting="tf", min_df=1, max_df=1.0) + +# create tuples of term_list and metadata for each doc +terms_meta_gen, terms_meta_gen_copy = tee(((doc.to_terms_list(ngrams=(1,2), as_strings=True), doc.metadata) for doc in corpus)) +terms_list_gen, terms_list_gen_copy = tee((term_meta[0] for term_meta in terms_meta_gen)) + +doc_term_matrix = vectorizer.fit_transform(terms_list_gen) +id2term = vectorizer.id_to_term +term2id = vectorizer.vocabulary + +for k,v in term2id.items(): + print(k,doc_term_matrix[0,v]) + + + + + +#{0: 'dfsaf', 1: 'fdsa', 2: 'dfsaf fdsa', 3: 'fdsa fdsa', 4: 'iuzizi', 5: 'gbjh', 6: 'iuzit', 7: 'iuzizi gbjh', 8: 'gbjh iuzit'} + +#line_gen = ( doc.metadata["x"] + " ".join([term for term in]) + +def gen_lines(docterm,term2id,corpus,label): + + for i,doc in enumerate(corpus): + line = "[" + doc.metadata[label] + "]" + + for term, id_ in term2id.items(): + if doc_term_matrix[i, id_] != 0: + term = term if len(term.split()) == 1 else "_".join(term.split()) + + line = line + " " + term + + yield line + +for line in gen_lines(doc_term_matrix,term2id,corpus,"x"): + print(line) + + + #doc.to_terms_list(ngrams=2, as_strings=True) + + # "" \ + # "" \ + # "" ".join( + #[term if term in id2term.values() else "" for term in terms_meta[0]]) for terms_meta in terms_meta_gen_copy) + + +label = "x" + +#for line in line_gen: +# print(line) + +#terms_meta_gen = ( (doc.to_terms_list(ngrams=2, as_strings=True),doc.metadata) for doc in corpus) + +for x in terms_meta_gen: + print(x) + +#terms_list_gen = (term_meta[0] for term_meta in terms_meta_gen) + +for x in terms_list_gen: + print(x) + + + + + + +for doc in corpus: + for term in doc.to_terms_list(ngrams=2, as_strings=True): + print(type(term)) + + + + + + + + + + +for doc in corpus: + for span in textacy.extract.ngrams(doc,2, + filter_stops=True, filter_punct=True, + filter_nums=False, include_pos=None, + exclude_pos=None, min_freq=1): + print(span.text) + + + + + + + + + + + + + + diff --git a/topicModeling.py b/topicModeling.py index a909838..03acad8 100644 --- a/topicModeling.py +++ b/topicModeling.py @@ -66,14 +66,13 @@ def textacyTopicModeling(corpus, - ###### vectorize corpi vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus) doc_term_matrix = vectorizer.fit_transform(terms_list) - id2term = vectorizer.__getattribute__("id_to_term") + #id2term = vectorizer.__getattribute__("id_to_term") @@ -113,8 +112,264 @@ def textacyTopicModeling(corpus, logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel)) +def textacyTopicModeling_v2(doc_term_matrix, id_to_term, + n_topics = 15, top_topic_words = 3, + topicModel='lda'): + n_terms = int(n_topics * top_topic_words) + sort_terms_by = 'seriation' # 'seriation', 'weight', 'index', 'alphabetical' + rank_terms_by = 'corpus' # 'corpus', 'topic' + + + + logprint("#### Topic Modeling 
{0}".format(topicModel)) + logprint(str("n_topics: {0}".format(n_topics))) + logprint("\n") + + start = time.time() + + # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix + weighting = ('tf' if topicModel == 'lda' else 'tfidf') + + + ####### Initialize and train a topic model + + model = textacy.tm.TopicModel(topicModel, n_topics=n_topics) + + model.fit(doc_term_matrix) + + doc_topic_matrix = model.transform(doc_term_matrix) + + for topic_idx, top_terms in model.top_topic_terms(id_to_term, top_n=top_topic_words, weights=True): + logprint('{0}: {1}'.format(topic_idx, str(top_terms))) + + + + + + ####################### termite plot ################################################################### + + + draw1.termite_plot(model,doc_term_matrix, id_to_term, + + n_terms=n_terms, + sort_terms_by=sort_terms_by, + rank_terms_by=rank_terms_by + '_weight', + + save=FILEPATH + "results/{}.png".format(topicModel)) + + end = time.time() + logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel)) + + + +def create_ticket2label_dict(ticket2chunk_dict,corpus): + """ + Creates a dictionary to map a TicketNumber to a label + :param ticket2chunk_dict: e.g. { TicketNumber : KB_entries } + :return: {TicketNumber : label } + """ + + labelist = ticket2chunk_dict.values() + labelist = flatten(labelist) + + labeldict = create_labeldict(labelist, min_label_freq=1, add_default_label=True) + + ticket2label = {} + for doc in corpus: + ticketID = doc.metadata["TicketNumber"] + + keywords = ticket2chunk_dict.get(ticketID, ['DEFAULT']) + + label = "" + for kw in keywords: + label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " " + + ticket2label.update({ticketID: label}) + + return ticket2label + + + +def create_labeldict(labelist, min_label_freq=1, add_default_label=True): + + # nur die x häufigsten labels benutzen + labelist = [l for l in labelist if labelist.count(l) >= min_label_freq] + + in_labelist_ = {k: labelist.count(k) for k in labelist} # { label1 : 3 , label2 : 5, label3 : 1 } + labelist = sort_dictionary(in_labelist_) # [ (label3, 1), (label1, 3), (label2, 5) ] + labelist.reverse() # [ (label2, 5), (label1, 3), (label3, 1) ] + labeldict = {elem[0]: i for i, elem in enumerate(labelist)} # { label2 : 0, label1 : 1 , label3 : 2 } + if add_default_label: + if 'DEFAULT' not in labeldict.keys(): + labeldict.update({'DEFAULT': len(labelist)}) # { label2 : 0, label1 : 1 , label3 : 2 , DEFAULT : 3 } + return labeldict + + + + +#todo +def jgibbsLLDAv2(labeled_lines_path,ticket2kbs_dict, cleaned_corpus, path2save_results, top_topic_words=7): + + ticket2label_dict = create_ticket2label_dict(ticket2kbs_dict, cleaned_corpus) + + # reduce ticket2label_dict + labeldict = {} + label_list = list(set(ticket2label_dict.values())) + lbl_dict = {elem : i for i,elem in enumerate(label_list)} + + labeldict = {k : lbl_dict[v] for k,v in ticket2label_dict.items()} + labeldict.update({"DEFAULT" : len(labeldict)}) + + + + def gen_lines_from_labeled_lines(input,ticket2label_dict): + + line_gen = textacy.fileio.read_file_lines(input) + + for line in line_gen: + label = re.findall(r'\[(.*?)\]',line) + + new_label = "[ " + for lbl in label: + new_label = new_label + str(ticket2label_dict.get(str(lbl),"")).strip() + " " + + new_label = new_label + "] " + result = new_label + str(line.rpartition("]")[2]) + +# new_label = str([ticket2label_dict.get(str(lbl),"") for lbl in label]) + +# result = "[ " 
+ new_label + " ] " + line.rpartition("]")[2] + #print(result) + + yield result + + + labeldict_rev = {v: k for k, v in labeldict.items()} + + #line_gen = gen_lines_from_labeled_lines(labeled_lines_path,ticket2label_dict) + line_gen = gen_lines_from_labeled_lines(labeled_lines_path,labeldict) + + + jgibbsLLDA_root = FILEPATH + "java_LabledLDA/" + LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) + + + textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath) + + + + + # wait for file to exist + while not os.path.exists(LLDA_filepath): + time.sleep(1) + + + + # run JGibbsLLDA file + + n_topics = len(labeldict) #+1 #default-topic + + FNULL = open(os.devnull, 'w') # supress output + cmd_jgibbs_java = ["java", "-cp", + "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format( + jgibbsLLDA_root), + "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile", + "tickets.gz", + "-twords", str(top_topic_words), "-ntopics", str(n_topics)] + subprocess.call(cmd_jgibbs_java, stdout=FNULL) + + + # ANMERKUNG: Dateien sind versteckt. zu finden in models/ + cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)] + output = subprocess.check_output(cmd_gzip).decode("utf-8") + + + topic_regex = re.compile(r'Topic [0-9]*') + + ##################################### + # todo save results in file aufgrund von results + result = [] + + for line in output.splitlines(): + findall = topic_regex.findall(line) + if len(findall) != 0: + try: + index = int(findall[0].split()[1]) + result.append("Topic {} {}:".format(index, str(ticket2kbs_dict[labeldict_rev[index]]))) + + except: + result.append(line) + + else: + result.append(line) + + textacy.fileio.write_file_lines(result, path2save_results+".txt") + ##################################### + + results = [] + res_dict = {} + count =0 + for line in output.splitlines(): + + findall = topic_regex.findall(line) + + if len(findall) != 0: + + if len(res_dict) != 0: + results.append(res_dict) #vorheriges an die liste ran (ist ja dann fertig) + + index = int(findall[0].split()[1]) + + res_dict = {index : str(labeldict_rev[index]) } + + else: + splitted = line.split() + res_dict[splitted[0]] = float(splitted[1]) + + if len(res_dict) != 0: + results.append(res_dict) # letzes an die liste ran + + + # every term in the resulsts to a list + + terms=[] + for res in results: + for key,value in res.items(): + if not isinstance(key, int) and not key in terms: + terms.append(key) + + term2id = {t:i for i,t in enumerate(terms)} #and to dict + + ################# termite plot ##################################################################### + topic_labels = list(range(len(labeldict))) + term_labels = list(range(len(term2id))) #tuple([key for key in term2id.keys()]) + + + term_topic_weights = np.zeros((len(term2id),len(topic_labels))) + + for i,res in enumerate(results): + + for key,value in res.items(): + + if not isinstance(key, int): + term_topic_weights[term2id[key]][i] = value + term_labels[term2id[key]] = key + else: + topic_labels[i] = labeldict_rev[key] + + draw.draw_termite( + term_topic_weights, topic_labels, term_labels, save=path2save_results+".png") + + draw.draw_termite( + term_topic_weights, topic_labels, term_labels, save=path2save_results+"_spaced.png",pow_x=0.78,pow_y=0.87) + + # save labeldict + labeldict_path = path2save_results + "_labeldict.json" + with open(labeldict_path, 'w') as file: + file.write(json.dumps(labeldict)) + @@ -259,15 +514,21 @@ def 
jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7): logprint("start Category-LLDA:") + + # labeldict ############################################################################################ + + # build dictionary of ticketcategories labelist = [] for doc in corpus: - category = normalize(doc.metadata["categoryName"]) + category = normalize_str(doc.metadata["categoryName"]) labelist.append(category) - # frage nur die x häufigsten labels benutzen, rest raus? - labelist = [l for l in labelist if labelist.count(l) > 50 ] + x = 50 # frage nur die x häufigsten labels benutzen, rest raus? + + + labelist = [l for l in labelist if labelist.count(l) > x ] in_labelist_ = {k: labelist.count(k) for k in labelist} labelist = sort_dictionary(in_labelist_) @@ -290,13 +551,13 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7): labeldict.update({'DEFAULT': len(labelist)}) - + ############################################################################################## def gen_cat_lines(textacyCorpus, labeldict): """ generates [topic1, topic2....] tok1 tok2 tok3 out of corpi""" for doc in textacyCorpus: - label = labeldict.get(normalize(doc.metadata["categoryName"]), labeldict['DEFAULT']) + label = labeldict.get(normalize_str(doc.metadata["categoryName"]), labeldict['DEFAULT']) if label is not 'DEFAULT': @@ -324,6 +585,11 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa logprint("") logprint("start {}-LLDA:".format("Keyword" if kb_keywords else "Subject")) + + + # labeldict ############################################################################################ + + # ticket2kb_dict kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";") @@ -358,7 +624,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa subject = line[1] keywords = line[2] - keywords_list = [normalize(x) for x in str(keywords).split(",")] + keywords_list = [normalize_str(x) for x in str(keywords).split(",")] if kb_id not in kb2keywords_dict.keys(): kb2keywords_dict[kb_id] = [] @@ -406,7 +672,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa keywords = kb2keywords_dict.get(kb_id, None) if keywords and kb_id: - used_keywords.append(list(map(normalize,keywords))) + used_keywords.append(list(map(normalize_str, keywords))) @@ -418,6 +684,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa labeldict = {k: v for v, k in enumerate(labelist)} + ############################################################################################## def gen_KB_lines(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict): @@ -433,7 +700,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa label = "" for kw in keywords: - label = label + str(labeldict.get(normalize(str(kw)), len(labeldict))) + " " + label = label + str(labeldict.get(normalize_str(str(kw)), len(labeldict))) + " " yield "[ " + label + "] " + doc.text @@ -451,7 +718,6 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa logprint("\n\n\nTime Elapsed {1}-LLDA :{0} min\n\n".format((end - start) / 60,"Keyword" if kb_keywords else "Subject")) - def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7): start = time.time() @@ -461,6 +727,10 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7): + # labeldict 
############################################################################################ + + + # kb2keywords_dict / kb2subjects_dict --> {str : [str]} @@ -476,9 +746,9 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7): kb_id = line[0] - subject = normalize(line[1]) + subject = normalize_str(line[1]) - keywords = [normalize(x) for x in str(line[2]).split(",")] + keywords = [normalize_str(x) for x in str(line[2]).split(",")] if kb_id not in kb2keywords_dict.keys(): @@ -488,9 +758,9 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7): if kb_id not in kb2subjects_dict.keys(): - kb2subjects_dict[kb_id] = [normalize(subject) if subject != [''] else "DEFAULT"] + kb2subjects_dict[kb_id] = [normalize_str(subject) if subject != [''] else "DEFAULT"] else: - kb2subjects_dict[kb_id].append(normalize(subject)) + kb2subjects_dict[kb_id].append(normalize_str(subject)) @@ -586,8 +856,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7): labelist = list(set(labelist)) labeldict = {k: v for v, k in enumerate(labelist)} - - + ############################################################################################## def gen_key_lines(textacyCorpus, labeldict, ticket2keywords_dict): @@ -601,7 +870,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7): label = "" for kw in keywords: - label = label + str(labeldict.get(normalize(str(kw)), labeldict['DEFAULT'])) + " " + label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " " yield "[ " + label + "] " + doc.text @@ -642,10 +911,9 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7): def load_from_labled_lines(path): path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/pre_labled_lines_wo_lemma_061217.txt" - #idee plan + #idee # clean laden, pre laden - # unigramme und num/wort-bigramme doc-term # frage wie geht llda mit bigrammen um? idee bigramme mit _ verbinden - # nimm nur ngrams wo midn. ein token in pre vorkommt + # unigramme und num/wort-bigramme doc-term # frage wie geht llda mit bigrammen um? idee bigramme mit _ verbinden # nimm nur ngrams wo midn. ein token in pre vorkommt diff --git a/topicModeling_1711_0846.py b/topicModeling_1711_0846.py index fe1971c..c46dd1c 100644 --- a/topicModeling_1711_0846.py +++ b/topicModeling_1711_0846.py @@ -415,7 +415,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=Fals keywords = kb2keywords_dict.get(kb_number, None) if keywords and kb_number: - used_keywords.append(list(map(normalize,keywords))) + used_keywords.append(list(map(normalize_str, keywords))) kb_entries_used = (len(list(set([kb for kb in ticket2kb_dict.values()])))) print("kb_entries_used: {}".format(kb_entries_used)) @@ -447,7 +447,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=Fals label = "" for kw in keywords: - label = label + str(labeldict.get( normalize(str(kw)) , len(labeldict))) + " " + label = label + str(labeldict.get(normalize_str(str(kw)), len(labeldict))) + " " yield "[ " + label + "] " + doc.text
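
The gen_KB_lines / gen_key_lines helpers and create_labeldict in the hunks above all feed JGibbsLLDA the same training format: one line per ticket of the form "[ <label ids> ] tok1 tok2 ...", where the ids come from a frequency-ordered labeldict with a DEFAULT fallback. Below is a minimal, self-contained sketch of that pattern; the plain (text, labels) tuples and the example tickets are stand-ins for the textacy corpus and its metadata, not the project's actual data.

from collections import Counter

def build_labeldict(labels, min_label_freq=1):
    # keep labels that occur at least min_label_freq times,
    # order them by frequency (most frequent first) and map label -> integer id
    counts = Counter(l for l in labels if labels.count(l) >= min_label_freq)
    ordered = [lbl for lbl, _ in counts.most_common()]
    labeldict = {lbl: i for i, lbl in enumerate(ordered)}
    labeldict["DEFAULT"] = len(labeldict)  # catch-all id for unseen labels
    return labeldict

def gen_labeled_lines(docs, labeldict):
    # one line per document: "[ id1 id2 ] tok1 tok2 ..." as expected by jgibblda
    for text, labels in docs:
        ids = " ".join(str(labeldict.get(lbl, labeldict["DEFAULT"])) for lbl in labels)
        yield "[ " + ids + " ] " + text

docs = [
    ("telefon umzug dezernat uniaccount", ["umzug"]),
    ("netzwerkanschluss intranet vpn verbindung", ["lan"]),
    ("laptop klaerung dezernat", ["verwaltung"]),
]
labeldict = build_labeldict([lbl for _, labels in docs for lbl in labels])
for line in gen_labeled_lines(docs, labeldict):
    print(line)

This only illustrates the line layout that ends up in models/tickets/tickets.gz; the real pipeline still writes it via textacy.fileio.write_file_lines before the Java LLDA call.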
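
The result-parsing loop in jgibbsLLDAv2 rebuilds a term-by-topic weight matrix from the decompressed .twords output before handing it to draw.draw_termite. The sketch below shows that step in isolation; sample_output is an invented example of the "Topic N" header plus "term weight" rows that the parsing loop above implies, not real JGibbsLLDA output.

import re
import numpy as np

sample_output = """Topic 0
telefon 0.21
umzug 0.15
Topic 1
vpn 0.18
netzwerk 0.12
telefon 0.03
"""

topic_regex = re.compile(r'Topic [0-9]*')

topics = []           # one {term: weight} dict per topic, in order of appearance
current = None
for line in sample_output.splitlines():
    if topic_regex.match(line):
        current = {}
        topics.append(current)
    elif line.strip() and current is not None:
        term, weight = line.split()
        current[term] = float(weight)

# dense term x topic matrix, rows indexed by a term2id lookup
term2id = {t: i for i, t in enumerate(sorted({t for tp in topics for t in tp}))}
weights = np.zeros((len(term2id), len(topics)))
for topic_idx, term_weights in enumerate(topics):
    for term, w in term_weights.items():
        weights[term2id[term], topic_idx] = w

print(weights)

From here the matrix is thresholded and passed to draw.draw_termite with the topic and term labels, as in the functions above.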
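
One caveat for the termite block in test.py: scipy.stats.threshold, used there to zero out weights below threshmin, was deprecated and has been removed in newer SciPy releases. If that script has to run against current SciPy, the same clipping can be done directly in NumPy; a rough equivalent using the same threshmin value as above, with a made-up weight matrix for illustration:

import numpy as np

term_topic_weights = np.array([[0.001, 0.20],
                               [0.080, 0.003]])
threshmin = 0.005

# set every weight below threshmin to zero, keep the rest unchanged
thresholded = np.where(term_topic_weights < threshmin, 0, term_topic_weights)
print(thresholded)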