diff --git a/cleaning.py b/cleaning.py
index 70f3a0c..ad7b60c 100644
--- a/cleaning.py
+++ b/cleaning.py
@@ -30,6 +30,16 @@ with open(config_ini) as f:
 
 
 def clean(stringstream):#, NOUNS):
+    """
+    fix bad unicode
+    seperate_words_on_regex `\=~%^&*()_+\[\]{};\'"|
+    normalize whitespace
+    remove linebreaks
+    replaceRockDots
+
+    :param stringstream: generator of str
+    :return: generator of str
+    """
 
     #NOUNS = [n.lower() for n in NOUNS]
 
@@ -90,19 +100,22 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")
 
 def cleanCorpus(corpus):
 
     logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))
-
-
+    """
     ressources_path = FILEPATH + "ressources/"
+
+    path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
+
     #NOUNS = load_obj(path2nouns_list)
     #noun_disjunction = '|'.join(NOUNS)
     #nouns_tuples = []
     #for n in NOUNS:
     #    nouns_tuples.append((n.lower(),n))
-
-    cleanCorpus_name = corpus.lang + "_clean"
+    """
+
     # load Corpus
     raw_corpus = corpus
     parser = corpus.spacy_lang
@@ -115,13 +128,14 @@ def cleanCorpus(corpus):
     )
 
 
-    # leere docs aus corpi kicken
+    # kick empty docs out of the corpus
     cleaned_corpus.remove(lambda doc: len(doc) == 0)
 
 
     #save corpus
+    cleanCorpus_name = corpus.lang + "_clean"
     save_corpus(corpus=cleaned_corpus, corpus_path=corpus_de_path, corpus_name=cleanCorpus_name)
 
 
diff --git a/corporization.py b/corporization.py
index 52fe031..607e2b2 100644
--- a/corporization.py
+++ b/corporization.py
@@ -90,7 +90,16 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")
 
 
 def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0):
+    """
+    Use textacy to create a Corpus out of the ITMC-Ticket.csv
+
+    :param path2_csv: str
+    :param corpus_path: str
+    :param content_collumn_name: str, the column which is used as the Doc's text
+    :param lang: str, standard 2-letter language code
+    :param printrandom: int, print n random Documents
+    :return: textacy.Corpus
+    """
 
     # print paths
     path_csv_split = path2_csv.split("/")
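For orientation, the steps listed in the new clean() docstring above amount to a small streaming pipeline over the raw ticket texts. The sketch below is illustrative only: the names ending in _sketch are made up for this note, the bad-unicode repair step is omitted, and it is not the module's actual implementation.

import re

def replaceRockDots_sketch(text):
    # transliterate umlauts and sharp s, as the repository's replaceRockDots does
    for pattern, repl in [(r'[ä]', 'ae'), (r'[Ä]', 'Ae'), (r'[ö]', 'oe'), (r'[Ö]', 'Oe'),
                          (r'[ü]', 'ue'), (r'[Ü]', 'Ue'), (r'[ß]', 'ss')]:
        text = re.sub(pattern, repl, text)
    return text

def clean_sketch(stringstream):
    # mirrors the docstring: drop linebreaks, split on special characters,
    # transliterate umlauts, normalize whitespace
    for doc in stringstream:
        doc = re.sub(r'[\n\r]', " ", doc)                    # remove linebreaks
        doc = re.sub(r'[`=~%^&*()_+\[\]{};\'"|]', " ", doc)  # separate words on special chars
        doc = replaceRockDots_sketch(doc)
        doc = re.sub(r'\s+', " ", doc).strip()               # normalize whitespace
        yield doc

# usage: list(clean_sketch(["Grüße,\nbitte   Kennwort  zurücksetzen"]))
# -> ['Gruesse, bitte Kennwort zuruecksetzen']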
diff --git a/init.py b/init.py
index 8c4a226..79c7bc2 100644
--- a/init.py
+++ b/init.py
@@ -28,20 +28,20 @@ with open(config_ini) as f:
 
 def create_lemma_dict(path2lemmalist):
     """
-    Creates a dict out of a file a la:
+    Creates a dict out of a txt file a la:
 
         l1 w1
         l1 w2
         l2 w1
         l2 w2
 
-    Result will be used as lemma_dict["word"] --> lemma
+    Result will be used as lemma_dict[word] --> lemma
 
     :param path2lemmalist: str
     :return: dictionary
     """
-    lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(
-        textacy.fileio.read_file_lines(path2lemmalist))))
+    file_gen = textacy.fileio.read_file_lines(path2lemmalist)
+    lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(file_gen)))
 
     lemma_dict = {}
 
@@ -63,7 +63,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
     Creates a dict out of the deWordNet
     https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml
 
-    Result will be used as lemma_dict["word"] --> lemma
+    Result will be used as thesaurus[word] --> main_synonym
 
     :param path2wordnet: str
     :param returnall: bool if True, also return word2synsets, synset2Words
@@ -73,6 +73,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
 
     lexroot = lextree.getroot()
 
+    # Build word2synsets
     word2synsets = {}
     template = {"w1": ["s1", "s2"]}
 
@@ -82,7 +83,6 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
 
             lex_dictlist = [subentry.attrib for subentry in elem]
 
             # idee technischer thesaurus
-            # idee hauptsynonmy muss einzelnes wort sein
 
             synlist = []
             string = "WORD"
@@ -96,55 +96,92 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
 
             if 'writtenForm' in lex_dict.keys():
                 string = (lex_dict["writtenForm"])
 
-                if string == "Kennwort":
-                    pass
 
                 # replaceRockDots
                 string = re.sub(r'[ß]', "ss", string)
                 string = re.sub(r'[ö]', "oe", string)
+                string = re.sub(r'[Ö]', "Oe", string)
+                string = re.sub(r'[ü]', "ue", string)
+                string = re.sub(r'[Ü]', "Ue", string)
+                string = re.sub(r'[ä]', "ae", string)
+                string = re.sub(r'[Ä]', "ae", string)
 
                 # alle punkte raus
                 string = re.sub(r'[.]', "", string)
 
                 # alles in klammern raus
-                string = re.sub(r"\((.*)\)", " ", string)
+                if "auptform" in string:
+                    string = re.sub(r"\((.*)\)", " ", string)
+                    string = string + " (hauptform)"  # remember it as a possible Hauptform
+                else:
+                    string = re.sub(r"\((.*)\)", " ", string)
 
                 # längeres leerzeichen normalisieren
                 string = textacy.preprocess.normalize_whitespace(string)
 
-                string = string.lower().strip()
+                string = string.strip()#.lower()
 
-            word2synsets[string] = synlist
+            if string != '':
+                word2synsets[string] = synlist
 
 
+    # Build synset2Words
     synset2Words = {}
     template = {"s1": ["w1","w2"]}
 
     for word,synset in word2synsets.items():
         if word != '':
+
             for syn in synset:
                 if syn not in synset2Words.keys():
                     synset2Words[syn] = [word]
                 else:
                     synset2Words[syn].append(word)
 
-    # nach anzhal der wörter in den strings sortieren
-    for synset in word2synsets.values():
-        synset.sort(key=lambda x: len(x.split()))
 
+    # sorting
+    for words in synset2Words.values():
+        words.sort(key=lambda w: len(w.split()))  # by number of words per string (fewer first)
+
+        for w in words:
+            if "(hauptform)" in w:
+                to_insert = re.sub(r"\((.*)\)", " ", w).strip()
+
+                words.remove(w)
+                words.insert(0, to_insert)  # move the Hauptform to the front if present
 
 
     thesaurus = {}
     thesaurus_template = {"w1" : "mainsyn"}
 
+    # word --> [synset1, synset2, .. ] --> synset1 --> [syn1, syn2, ... ] --> syn1 / mainsyn
-    for word,synset in word2synsets.items():
+    for word,synsets in word2synsets.items():  #word , [synset1, synset2, .. ]
         try:
-            thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym #todo nach (hauptform) suchen?
+            if "Passwort" in word:
+                x=2
+
+            first_synset = synsets[0]  # take the first synset (practical choice)
+
+            syns = synset2Words[first_synset]  # [syn1, syn2, ... ]
+
+            first_syn = syns[0]  # take the first synonym (possibly the Hauptform)
+
+            word = re.sub(r"\((.*)\)", " ", word).strip()  # drop the (hauptform) marker
+
+            thesaurus[word] = first_syn  # assumption: the first synonym is the main synonym
         except:
             pass
 
+
     if returnall:
         return thesaurus, word2synsets, synset2Words
     else:
@@ -237,39 +274,8 @@ def build_words_for_spellchecking(path2words):
 
 ##################################################################################################
 
-# THESAURUS
-ressources_path = FILEPATH + "ressources/"
-path2wordnet = ressources_path + config.get("thesaurus","input")
-path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")
-
-# SPELLCHECKING
-path2words_file = ressources_path + config.get("spellchecking","input")
-path2wordlist = ressources_path + config.get("spellchecking","pickle_file")
-
-
-# LEMMA
-path2lemma_file = ressources_path + config.get("lemmatization","input")
-path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")
-
-# NOMEN
-nouns0 = ressources_path + config.get("nouns","input")
-nouns1 = ressources_path + config.get("nouns","input1")
-nouns2 = ressources_path + config.get("nouns","input2")
-path2nouns_list = ressources_path + config.get("nouns","pickle_file")
-
-
-# VORNAMEN
-firstnames_txt = ressources_path + config.get("firstnames","input")
-path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
-
-# STOPWORDS
-stop1 = ressources_path + config.get("de_stopwords","input1")
-stop2 = ressources_path + config.get("de_stopwords","input2")
-stop3 = ressources_path + config.get("de_stopwords","input3")
-path2stopwordlist_de = ressources_path + config.get("de_stopwords","pickle_file")
-
-path2stopwordlist_en = ressources_path + config.get("en_stopwords","pickle_file")
 
 
@@ -277,44 +283,135 @@ def main():
     start = time.time()
 
     logprint("Init: {0}".format(datetime.now()))
-
-    """"""
-    logprint("create and save lemma_dict")
-    lemma_dict = create_lemma_dict(path2lemma_file)
-    save_obj(lemma_dict, path2lemmadict)
-
-
-    logprint("Build and save Wordlist for Spellchecking")
-    words = build_words_for_spellchecking(path2words_file)
-    save_obj(words, path2wordlist)
+    ressources_path = FILEPATH + "ressources/"
 
 
+    # THESAURUS
     logprint("Build and save Thesaurus")
+
+    path2wordnet = ressources_path + config.get("thesaurus", "input")
     thesaurus = build_thesaurus_dict(path2wordnet)
+
+    path2thesaurus_dict = ressources_path + config.get("thesaurus", "pickle_file")
     save_obj(thesaurus, path2thesaurus_dict)
 
 
+    # LEMMA
+    logprint("create and save lemma_dict")
+
+    path2lemma_file = ressources_path + config.get("lemmatization", "input")
+    lemma_dict = create_lemma_dict(path2lemma_file)
+
+    path2lemmadict = ressources_path + config.get("lemmatization", "pickle_file")
+    save_obj(lemma_dict, path2lemmadict)
 
 
+    # SPELLCHECKING
+    logprint("Build and save Wordlist for Spellchecking")
+
+    path2words_file = ressources_path + config.get("spellchecking", "input")
+    words = build_words_for_spellchecking(path2words_file)
+
+    path2words_counter = ressources_path + config.get("spellchecking", "pickle_file")
+    save_obj(words, path2words_counter)
 
 
+    # STOPWORDS
     logprint("Build and save stoppwortliste")
+
+    stop1 = ressources_path + config.get("de_stopwords", "input1")
+    stop2 = ressources_path + config.get("de_stopwords", "input2")
+    stop3 = ressources_path + config.get("de_stopwords", "input3")
     de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3)
+
+    path2stopwordlist_de = ressources_path + config.get("de_stopwords", "pickle_file")
     save_obj(de_stop_words, path2stopwordlist_de)
+
+    path2stopwordlist_en = ressources_path + config.get("en_stopwords", "pickle_file")
     save_obj(en_stop_words, path2stopwordlist_en)
 
 
+    # NOMEN
     logprint("Build and save nomenliste")
-    #nouns = list_from_files(nouns1,nouns2)
-    nouns = list_from_files(nouns0)
+
+    nouns0 = ressources_path + config.get("nouns", "input")
+    nouns1 = ressources_path + config.get("nouns", "input1")
+    nouns2 = ressources_path + config.get("nouns", "input2")
+    nouns = list_from_files(nouns0,nouns1,nouns2)
+
+    path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
     save_obj(nouns, path2nouns_list)
 
 
+    # VORNAMEN
     logprint("Build and save firstnameslist")
+
+    firstnames_txt = ressources_path + config.get("firstnames", "input")
     vornamen = list_from_files(firstnames_txt)
+
+    path2firstnameslist = ressources_path + config.get("firstnames", "pickle_file")
     save_obj(vornamen, path2firstnameslist)
 
 
     end = time.time()
     logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60))
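To make the new word --> main-synonym chain in build_thesaurus_dict() above easier to follow, here is a toy walk-through of the intermediate data structures; the synset id and the words are invented for this note and do not come from the real deWordNet.

import re

# toy data in the shape build_thesaurus_dict() builds them (invented, not from deWordNet)
word2synsets = {"Kennwort": ["s1"], "Passwort (hauptform)": ["s1"], "Zugangscode": ["s1"]}
synset2Words = {"s1": ["Kennwort", "Passwort (hauptform)", "Zugangscode"]}

# sorting step: fewer-word entries first, then promote a "(hauptform)" entry to the front
for words in synset2Words.values():
    words.sort(key=lambda w: len(w.split()))
    for w in list(words):                       # iterate over a copy while mutating
        if "(hauptform)" in w:
            words.remove(w)
            words.insert(0, re.sub(r"\((.*)\)", " ", w).strip())

# word --> first synset --> its first (main) synonym
thesaurus = {}
for word, synsets in word2synsets.items():
    syns = synset2Words[synsets[0]]
    thesaurus[re.sub(r"\((.*)\)", " ", word).strip()] = syns[0]

print(thesaurus)
# -> {'Kennwort': 'Passwort', 'Passwort': 'Passwort', 'Zugangscode': 'Passwort'}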
config.get("de_stopwords", "pickle_file") save_obj(de_stop_words, path2stopwordlist_de) + + path2stopwordlist_en = ressources_path + config.get("en_stopwords", "pickle_file") save_obj(en_stop_words, path2stopwordlist_en) + + + + + + + + # NOMEN logprint("Build and save nomenliste") - #nouns = list_from_files(nouns1,nouns2) - nouns = list_from_files(nouns0) + + nouns0 = ressources_path + config.get("nouns", "input") + nouns1 = ressources_path + config.get("nouns", "input1") + nouns2 = ressources_path + config.get("nouns", "input2") + nouns = list_from_files(nouns0,nouns1,nouns2) + + path2nouns_list = ressources_path + config.get("nouns", "pickle_file") save_obj(nouns, path2nouns_list) + + + + + + + + # VORNAMEN logprint("Build and save firstnameslist") + + firstnames_txt = ressources_path + config.get("firstnames", "input") vornamen = list_from_files(firstnames_txt) + + path2firstnameslist = ressources_path + config.get("firstnames", "pickle_file") save_obj(vornamen, path2firstnameslist) + + + + + + + + + + end = time.time() logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60)) diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz index 3a831de..aad2762 100644 Binary files a/java_LabledLDA/models/tickets/.others.gz and b/java_LabledLDA/models/tickets/.others.gz differ diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz index 77283d9..3e1418c 100644 Binary files a/java_LabledLDA/models/tickets/.tassign.gz and b/java_LabledLDA/models/tickets/.tassign.gz differ diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz index 05c59f9..ec06bac 100644 Binary files a/java_LabledLDA/models/tickets/.theta.gz and b/java_LabledLDA/models/tickets/.theta.gz differ diff --git a/java_LabledLDA/models/tickets/.twords.gz b/java_LabledLDA/models/tickets/.twords.gz index c702b39..1ab725d 100644 Binary files a/java_LabledLDA/models/tickets/.twords.gz and b/java_LabledLDA/models/tickets/.twords.gz differ diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz index d4cbb40..c24683a 100644 Binary files a/java_LabledLDA/models/tickets/.wordmap.gz and b/java_LabledLDA/models/tickets/.wordmap.gz differ diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz index 3f2afc1..83cc2ed 100644 Binary files a/java_LabledLDA/models/tickets/tickets.gz and b/java_LabledLDA/models/tickets/tickets.gz differ diff --git a/main.py b/main.py index 8b1ec86..22d2d7c 100644 --- a/main.py +++ b/main.py @@ -30,15 +30,13 @@ start = time.time() # todo modelle testen - - +# todo ticket2kbkeys, subj, cats in init.py logprint("main.py started at {}".format(datetime.now())) - -#init.main() +init.main() logprint("") raw_corpus = corporization.main() diff --git a/miscellaneous.py b/miscellaneous.py index e3a159e..f4617fe 100644 --- a/miscellaneous.py +++ b/miscellaneous.py @@ -217,7 +217,6 @@ def save_corpus(corpus, corpus_path, corpus_name): :param corpus_path: str :param corpus_name: str (should content the language like "_de_") """ - #todo pos und ner tagging speichern # save parser parser = corpus.spacy_lang diff --git a/preprocessing.py b/preprocessing.py index 734ae8e..6f7e0c2 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -126,7 +126,7 @@ def remove_first_names(): def remove_addresses(string): pass # todo remove_addresses idee postal.parser und zu metadaten hinzufügen -def 
diff --git a/preprocessing.py b/preprocessing.py
index 734ae8e..6f7e0c2 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -126,7 +126,7 @@ def remove_first_names():
 def remove_addresses(string):
     pass  # todo remove_addresses idee postal.parser und zu metadaten hinzufügen
 
-def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
+def lemmatizeWord(word,lemma_dict=LEMMAS,n=5):
     for i in range(n):
         try:
             word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
@@ -134,26 +134,29 @@ def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
             print(word)
     return word
 
-def getFirstSynonym(word, thesaurus=THESAURUS,n=3):
+
+def getFirstSynonym(word, thesaurus=THESAURUS, n=3):
 
     for i in range(n):
+
         try:
-            word = thesaurus[word.lower()] if word.lower() in thesaurus.keys() else word.lower()
+            if word in thesaurus.keys():
+                return thesaurus[word]
+
+            elif word.title() in thesaurus.keys():
+                return thesaurus[word.title()]
+
+            elif word.lower() in thesaurus.keys():
+                return thesaurus[word.lower()]
+
+            else:
+                return word
+
         except:
-            print(word)
-    return word
-
-
-    """
-    if not isinstance(word, str):
-        return str(word)
-
-    word = word.lower()
-    if word in thesaurus.keys():
-        return thesaurus[word]
-    else:
-        return str(word)
-    """
+            print("THESAURUSFEHLER BEI: {}".format(word))
+            return word
 
 
 ########################## Spellchecking ##########################################
@@ -328,6 +331,15 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path")
 
 
 def extract_from_corpus(corpus):
+    """
+    Extract from each doc of a corpus a string containing the desired token texts
+
+    :param corpus: textacy.Corpus
+    :return: generator of str
+    """
+
+
+    # build the WHITELIST; it contains at least the potential topics
     WHITELIST = ["boss", "sap", "firefox"]  #todo autogenerierung relv. techn. begriffe
 
@@ -337,6 +349,7 @@ def extract_from_corpus(corpus):
     WHITELIST = WHITELIST + kb_cats + kb_keys + kb_subjs
 
+
     THESAURUS = load_obj(path2thesaurus_dict)
     #WORDS = load_obj(path2wordsdict)
     LEMMAS = load_obj(path2lemmadict)
@@ -344,6 +357,9 @@ def extract_from_corpus(corpus):
     #EN_STOP_WORDS = load_obj(path2ENstopwordlist)
     VORNAMEN = load_obj(path2firstnameslist)
 
+    ents_boss = []
+    ents_sap = []
+
     for doc in corpus:
         result = []
 
@@ -353,10 +369,16 @@ def extract_from_corpus(corpus):
 
         for tok in doc:
 
-            if tok.lower_ =="boss" or tok.lower_ =="sap":
-                print(tok.lower_+": "+tok.ent_type_)
+            """
+            if tok.lower_ =="boss":
+                ents_boss.append(tok.ent_type_)
 
+            if tok.lower_ =="sap":
+                ents_sap.append(tok.ent_type_)
+            """
+
+            # if the token is in the whitelist, take it directly
             if tok.lower_ in WHITELIST:
                 result.append(tok.lower_)
 
@@ -372,25 +394,27 @@ def extract_from_corpus(corpus):
                or tok.lower_ in VORNAMEN:
                 continue
 
-            # cut after footer
-            if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]:  # fehler schneidet bei INC40506 das meiste weg
-                break
-
-            # boss/SAP ent_type = 'ORG' oder '' (ein-weimal LOC oder PERSON)
+
+            # cut after the footer
+            if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]:  # bug: for e.g. INC40506 this cuts away most of the text
+                break
 
             if tok.pos_ in ["NOUN"] \
-                    or tok.ent_type_ in ["NORP","FACILITY","ORG","PRODUCT","WORK_OF_ART"]:
+                    or tok.ent_type_ in ["NORP","FACILITY","ORG","PRODUCT","WORK_OF_ART","LOC"]:
                 #or tok.dep_ == "ROOT":
                 # or tok.lower_ in NOUNS \
                 #,"PERSON"] \
 
                 toktext = tok.lower_
 
                 toktext = lemmatized_word
-                """
+
+                # idea: build and count the Hauptsynonym, but feed the original words to the LLDA algorithm
+                """
                 first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS)
-                if first_synonym is not None:
+                if first_synonym is not None or first_synonym != '':
                     toktext = first_synonym if len(first_synonym.split()) == 1 else toktext
                 """
 
@@ -402,6 +426,14 @@ def extract_from_corpus(corpus):
 
         yield " ".join(result)
 
+    """
+    print(list(set(ents_sap)))
+    ['', 'ORG', 'PERSON', 'LOC']
+
+    print(list(set(ents_boss)))
+    ['', 'ORG', 'PERSON', 'LOC']
+    """
+
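The getFirstSynonym() rewrite above changes the lookup order: the thesaurus keys are no longer lower-cased (build_thesaurus_dict now only strips them), so the function tries the word as-is, then Title-case, then lower-case before falling back to the word itself. A usage sketch with a single invented thesaurus entry, not the real pickled thesaurus:

# one invented entry; real entries come from the pickled thesaurus built in init.py
thesaurus = {"Kennwort": "Passwort"}

def getFirstSynonym(word, thesaurus=thesaurus):
    # lookup precedence: exact match, then Title-case, then lower-case, else pass through
    if word in thesaurus:
        return thesaurus[word]
    if word.title() in thesaurus:
        return thesaurus[word.title()]
    if word.lower() in thesaurus:
        return thesaurus[word.lower()]
    return word

print(getFirstSynonym("kennwort"))     # -> 'Passwort' (found via Title-case)
print(getFirstSynonym("lködjsklafd"))  # -> 'lködjsklafd' (unknown words pass through)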
""" + print(list(set(ents_sap))) + ['', 'ORG', 'PERSON', 'LOC'] + + print(list(set(ents_boss))) + ['', 'ORG', 'PERSON', 'LOC'] + """ + @@ -433,6 +465,9 @@ def preprocessCorpus(corpus, clean_in_meta): ) + # idee labeled_lines.txt enthählt bigramme mit unterstrich + # todo preCorpus weg. llda bekommt labaled_lines.txt und lda doctermamtrix + # leere docs aus corpi kicken pre_corpus.remove(lambda doc: len(doc) == 0) diff --git a/ressources/deWordNet.xml b/ressources/deWordNet.xml index 3be16f7..7ccf171 100644 --- a/ressources/deWordNet.xml +++ b/ressources/deWordNet.xml @@ -71439,7 +71439,7 @@ - + @@ -750689,4 +750689,4 @@ - \ No newline at end of file + diff --git a/ressources/lemmas.txt b/ressources/lemmas.txt index f25788c..9f54eb1 100644 --- a/ressources/lemmas.txt +++ b/ressources/lemmas.txt @@ -1,3 +1,5 @@ +kennwort kennworts +kennwort kennwortes a as aachen aachens aal aale @@ -358471,4 +358473,4 @@ zynisch zynischstes zynische zynischen zynischere zynischeren zynischste zynischsten -zyste zysten \ No newline at end of file +zyste zysten diff --git a/test.py b/test.py index 33784a7..0ebd69b 100644 --- a/test.py +++ b/test.py @@ -25,6 +25,74 @@ FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" from miscellaneous import * import re + + + + +ressources_path = FILEPATH + "ressources/" +path2lemmadict = ressources_path + config.get("lemmatization","pickle_file") +LEMMAS = load_obj(path2lemmadict) +path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file") +THESAURUS = load_obj(path2thesaurus_dict) + + +def getFirstSynonym(word, thesaurus=THESAURUS,n=3): + + for i in range(n): + + try: + if word in thesaurus.keys(): + return thesaurus[word] + + elif word.title() in thesaurus.keys(): + return thesaurus[word.title()] + + elif word.lower() in thesaurus.keys(): + return thesaurus[word.lower()] + else: + return word + except: + print("THESAURUSFEHLER BEI: {}".format(word)) + return word + + +def lemmatizeWord(word,lemma_dict=LEMMAS,n=3): + for i in range(n): + try: + word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower() + except: + print(word) + return word + +test = "Kennwort" + +print(getFirstSynonym("kennwort")) +print(getFirstSynonym("Kollern")) +print(getFirstSynonym("lködjsklafd")) + + + + + + + + + + + + + + + + + + + + + + + + dict = {"benutzer zum redmine hinzufuegen": 0, "sd_outlook anmeldung gestoert": 131, "sap portal \"im anderen modus geoeffnet\"": 1, "uni card": 79, "sd_verlust/antrag unicard": 2, "sd_citavi bestellung": 3, "fehlender eintrag im elektronischen telefonbuch": 4, "sd_origin_workaround": 80, "sd_kurs-angebote anmeldung": 141, "ub_drucker kopierer": 82, "sd_itmc kurse anmeldebestaetigung": 66, "sd_unicard_gesperrte unicard entsperre": 6, "ub_unicard_abholungszeiten": 7, "bd_unicard_nicht_eingeschrieben": 8, "ub_prod_namenskorrektur_student": 149, "sd_outlook_in_exchange_einbinden": 84, "sd_tu-app feedback standard": 10, "sd_account_abmelden": 77, "sd_gmx_web.de": 87, "bd_unicard_chip_defekt": 11, "sd_antrag funktionale mailadresse": 88, "sd_login tu portals english": 142, "sd_falsche_personendaten": 90, "sd_vpn_aktualisierung": 12, "ub_namensaenderung": 111, "studierendenausweis": 13, "re: elektroarbeiten fuer leitsystem 2. und 3. 
obergeschoss": 14, "sd_vpn_webvpn": 15, "spam ohne tu bezug": 92, "ungueltiges ticket siehe journal": 123, "sd_heirat_namensaenderung_student": 122, "sd_telefonbuch, neues system": 16, "sd_diensthandy beschaffung": 94, "sd_telefonantrag_aenderung_neuantrag": 95, "sd_uniaccount_ehemalige_passwortaenderung": 17, "sd_plotauftrag_zv": 18, "ohne betreff": 19, "wlan": 97, "sd-e-mail_adresse_funktional_beantragen": 98, "sd_unimail zu exchange": 21, "sd_citavi": 99, "transportdurchfuehung": 100, "ub_prod_neue unicard bei beschaedigung": 101, "sd_wlan passwort englisch": 22, "sd_semesterticket": 103, "sd_pruefungsamt": 104, "sd_uniaccount freischaltung verzoegert englisch": 23, "sd_uniaccount_passwortaenderung": 140, "sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)": 105, "sd_tu-app feedback_englisch": 24, "sd_uniaccount_ehemalige_studierende": 107, "ub_prod_abholung_ abholfristen_benachrichtigungen": 25, "sd_stellenausschreibung schwarzes brett": 26, "sd_tu_app_keine internetverbindung": 27, "sd_uniaccount activation englisch": 28, "sd_aktivierung uniaccount": 108, "ub_unicard_zusendung der karte moeglich?": 132, "einrichtung des eduroam netzwerks": 29, "unicard nochmal beantragen": 30, "sd_webmailer_thread-anzeige": 5, "sd_mail_als_anhang": 31, "m42_dokumentationen_zu_neuen_ous": 32, "vpn verbindung fuer unitymedia kunden": 33, "sd_beantragung_unicard": 20, "sd_unicard_defekt": 83, "sd_asknet_mitarbeiter_softwarebestellung": 34, "sd_spss_online_bestellung": 109, "sd_webmailer einrichtung weiterleitung": 9, "sd_unicard_max_laufzeit": 35, "sd_office 356 plus bestellung": 81, "vpn_ipsec_stoerung": 36, "sd_telefonbuch-eintrag_aenderung": 37, "sd_geraeteausleihe": 125, "bd_unicard_geldkarte_laden": 112, "unicard_restbetrag_auszahlung": 113, "apps_redmine_repository": 114, "windows 10": 78, "sd_antwort_phishingmail": 110, "sd_gastaufenthalter": 38, "sd_matlab lizenzdatei pc-pools": 39, "sd_wlan passwort setzen": 40, "sd_sap_initialkennwort": 41, "sd_sap_konteneinsicht_ workaround": 119, "sd_office365_asknet": 118, "bd_unicard_freigabe_beantragung": 42, "sd_internationaloffice": 43, "sd_kurs-angebote itmc": 153, "sd_asknet_und_dreamspark": 102, "cm_lsf-boss_freischaltung": 116, "sd_unicard fehlerhafte geldbuchung": 44, "sd_office 365 plus support": 45, "citavi_lizenzschluessel_nicht bekommen": 86, "sd_webmailer_threadanzeige und weiterleitung": 121, "sd_boss_notenverbuchung": 47, "sd_namensaenderung_englisch": 48, "sd_sap_freischaltung ohne passwortaenderung": 49, "sd_outlook kontakte automatische aktualisierung": 124, "sd_fk9 test": 50, "sd_sophos download": 51, "apps_dms_d.3 client installation/login d.3 funktioniert nicht": 52, "sd_sap_firefox_esr": 127, "sd_unicard_workaround_bestellung": 128, "sd_vpn anleitungen": 53, "probleme mit unicard": 89, "sd_wlan_beratung": 129, "sd_login_tu_portale": 130, "problem mit der beantragung von der unicard": 150, "sd_unicard_abholung": 54, "ub_beschaedigte unicard": 120, "sd_uniaccount_dauer freischaltung": 96, "sd_freischaltung uniaccount verzoegert": 133, "sd_unimail imap_pop3": 134, "change produktiv nehmen chn00146 - transport e01k909284": 135, "sd_boss-bescheinigung": 55, "sd_studisek_buchung_semesterbeitrag": 56, "sd_studisek": 57, "sd_sap_initialkennwort_englisch": 58, "sd_zugriff_onlinedienste_rueckmeldung": 59, "d.3 client installation": 60, "lsf freischaltung als mitarbeiter/in": 126, "sd_sap_dienstreise": 145, "sd_keine rueckantwort kunde": 136, "apps_dms-passwort d.3": 61, "ub_unicard_unicard mit vollmacht abholen": 137, 
"sd_immatrikulationsbescheinigung_portal": 62, "how to setup eduroam": 46, "sd_spam e-mail bekannt meldung": 63, "sd_laufzeit unimail account": 64, "sd_gleitzeitanlage_dez3_stoerung": 154, "sd_vpn_probleme_mit_unitymedia": 139, "sd_origin nur noch eine seriennummer": 115, "sd_kontakt_asknet": 65, "sd_email_namensaenderung": 67, "ub_geldchip-problem bei uc": 68, "sd_dreamspark": 138, "bd_goeke_allgemein": 143, "sd_phishing": 144, "sd_login_unibib ub-it": 91, "sd_citavi_support": 146, "sd_wlan-gastkonto": 147, "sd_namensaenderung mitarbeiter": 69, "sd_telefonbuch_prof_eintragung": 93, "probleme mit der namensaenderung/ neue unicard": 106, "sd_antworten_korrekt": 70, "freischaltung uniaccount": 71, "DEFAULT": 155, "ub_unicard_spaetere abholung moeglich?": 148, "sd_sap konteneinsicht antrag": 85, "sd_vpn_temporaerer fehler ub": 72, "sd_studisek_englisch": 75, "unicard vergessen abzuholen und nicht mehr da": 73, "sd_immatrikulationsbescheigung_druckfehler": 151, "sd_goeke drucker": 152, "sd_login tu portale": 117, "sd_apple-on-campus": 74, "cm_asiexception": 76} @@ -47,30 +115,20 @@ content ="kenntnisnahme" -ressources_path = FILEPATH + "ressources/" -path2lemmadict = ressources_path + config.get("lemmatization","pickle_file") -LEMMAS = load_obj(path2lemmadict) -path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file") -THESAURUS = load_obj(path2thesaurus_dict) -def getFirstSynonym(word, thesaurus=THESAURUS,n=3): - - for i in range(n): - try: - word = thesaurus[word.lower()] if word.lower() in thesaurus.keys() else word.lower() - except: - print(word) - return word -def lemmatizeWord(word,lemma_dict=LEMMAS,n=3): - for i in range(n): - try: - word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower() - except: - print(word) - return word + + + + + + + + + + parser = spacy.load("de") @@ -86,11 +144,8 @@ print(textacy.extract.acronyms_and_definitions(obj4,known_acro_defs)) print(textacy.similarity.word2vec(obj1, obj2)) print(textacy.similarity.word2vec(obj1, obj3)) -#todo ner von boss sap testen -test = "boss" -print(getFirstSynonym(test)) @@ -162,7 +217,12 @@ cleanCorpus_name = "de" + "_clean_ticket" -# idee test von tagging zwischen cleaned und raw + + + + +# test von tagging zwischen cleaned und raw + parser = spacy.load("de") corpus = textacy.Corpus(parser) @@ -329,7 +389,7 @@ def cleaning(string): -# fehler case sensitive pos tagging idee abgleich mit nomenliste und dann Großschreiben +# fehler case sensitive pos tagging idee abgleich mit nomenliste und dann Großschreiben: geht nicht, zu viele Homonyme von sowaol nomen als auch anders # fehler replaceRockdots verändert pos: bsp für adp aber fuer verb # klammern ändern nix an pos diff --git a/topicModeling.py b/topicModeling.py index bce6d41..a909838 100644 --- a/topicModeling.py +++ b/topicModeling.py @@ -571,14 +571,14 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7): count_dict[kb] = 1 sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1)) - + """ for k,v in sorted_dict: subs = kb2subjects_dict[k] keys = kb2keywords_dict[k] print(subs, keys , v) # frage wieviele tickets pro topic? print("kb_entrys used: {}".format(len(sorted_dict))) # frage wie viele kb_entry's insg genutzt?: 155 - + """ labelist = ticket2keywords_dict.values() @@ -644,7 +644,7 @@ def load_from_labled_lines(path): #idee plan # clean laden, pre laden - # unigramme und num/wort-bigramme doc-term # frage wie geht llda mit bigrammen um? 
diff --git a/topicModeling.py b/topicModeling.py
index bce6d41..a909838 100644
--- a/topicModeling.py
+++ b/topicModeling.py
@@ -571,14 +571,14 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
             count_dict[kb] = 1
 
     sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
-
+    """
     for k,v in sorted_dict:
         subs = kb2subjects_dict[k]
         keys = kb2keywords_dict[k]
         print(subs, keys , v)  # frage wieviele tickets pro topic?
 
     print("kb_entrys used: {}".format(len(sorted_dict)))  # frage wie viele kb_entry's insg genutzt?: 155
-
+    """
 
     labelist = ticket2keywords_dict.values()
 
@@ -644,7 +644,7 @@ def load_from_labled_lines(path):
     #idee plan
     # clean laden, pre laden
-    # unigramme und num/wort-bigramme doc-term  # frage wie geht llda mit bigrammen um? idee notfalls bigramme als geklammerte "wörter"
+    # unigrams and num/word bigrams into the doc-term matrix  # question: how does the LLDA handle bigrams? idea: join bigrams with an underscore
     # nimm nur ngrams wo midn. ein token in pre vorkommt
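The final comment proposes feeding bigrams to the LLDA tool by joining them with an underscore, so each bigram behaves like a single token in the line-based input format. One possible reading of that idea, sketched here; it is not the repository's implementation:

def with_underscore_bigrams(tokens):
    # keep the unigrams and additionally emit word_word bigrams
    bigrams = ["_".join(pair) for pair in zip(tokens, tokens[1:])]
    return tokens + bigrams

print(with_underscore_bigrams(["sap", "initialkennwort", "setzen"]))
# -> ['sap', 'initialkennwort', 'setzen', 'sap_initialkennwort', 'initialkennwort_setzen']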