diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz index 5c5f7d3..f8b9a95 100644 Binary files a/java_LabledLDA/models/tickets/.tassign.gz and b/java_LabledLDA/models/tickets/.tassign.gz differ diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz index 9309100..8db650c 100644 Binary files a/java_LabledLDA/models/tickets/.theta.gz and b/java_LabledLDA/models/tickets/.theta.gz differ diff --git a/java_LabledLDA/models/tickets/.twords.gz b/java_LabledLDA/models/tickets/.twords.gz index 84e04a9..8e81db4 100644 Binary files a/java_LabledLDA/models/tickets/.twords.gz and b/java_LabledLDA/models/tickets/.twords.gz differ diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz index 0e5edec..1ece2cc 100644 Binary files a/java_LabledLDA/models/tickets/tickets.gz and b/java_LabledLDA/models/tickets/tickets.gz differ diff --git a/main.py b/main.py index 343b826..2df1d14 100644 --- a/main.py +++ b/main.py @@ -21,15 +21,12 @@ start = time.time() -# todo: merge llda topics +# idea: merge llda topics # idea: train lda so that the term <-> topic assignment does not become too weak, while keeping as many topics as possible -# question: how many tickets per topic? -# todo: test the models # question: which employees handled which topics? idea: replace topics with employee numbers -# question: if 155 distinct kb entries were used, why are there only 139 topics? # idea: check each word against a semantic net first: if it is too distant, ignore it -# todo FRIDAY: drawing, refactoring +# todo: test the models @@ -59,11 +56,11 @@ logprint("") logprint("") -#topicModeling.main(algorithm="lda") +topicModeling.main(algorithm="llda") logprint("") -topicModeling.main(algorithm="llda") +#topicModeling.main(algorithm="llda") logprint("") diff --git a/miscellaneous.py b/miscellaneous.py index cbdf9aa..902cc4d 100644 --- a/miscellaneous.py +++ b/miscellaneous.py @@ -121,7 +121,7 @@ def list_from_files(*paths): return list(map(textacy.preprocess.normalize_whitespace, liste)) -def debug(): +def breakpoint(): pass def normalize(string): @@ -148,6 +148,9 @@ def deprecated(func): return new_func +def flatten(liste): + return [item for sublist in liste for item in sublist] + def printRandomDoc(textacyCorpus): """ diff --git a/test.py b/test.py index 08b412e..5f431e8 100644 --- a/test.py +++ b/test.py @@ -22,58 +22,6 @@ import draw -# kb2keywords_dict - -kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", - delimiter=";") -next(kb2keywords_gen, None) # skip first -used_kb=[] -for kb in kb2keywords_gen: - used_kb.append(kb[1]) -print("used_kb: {}".format(len(list(set(used_kb))))) - -# of 260 kb entries in total, 155 were used - -#"ArticleID";"Subject";"Keywords";"Solution";"SolutionText";"CreatedOn" -kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", # - delimiter=";") -next(kb2keywords_gen, None) # skip first -cats=[] -subjects=[] -keywords=[] -for kb in kb2keywords_gen: - cats.append(kb[0]) - subjects.append(kb[1]) - keywords.append(kb[2].split(",")) - -cats_lst = list(set(cats)) -print("cats: {}".format(len(cats_lst))) -print(cats_lst[0:20]) - -print(len(subjects)) -subj_lst = list(set(subjects)) # question: does every kb entry really have a distinct subject?
-print("subjects: {}".format(len(subj_lst))) -print(subj_lst[0:20]) - -keywords = [item for sublist in keywords for item in sublist] - -kys_lst = list(set(keywords)) -print("keywords: {}".format(len(kys_lst))) -print(kys_lst[0:20]) - - - - -used_list = ['bd_unicard_nicht_eingeschrieben', 'sd_vpn_temporaerer fehler ub', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_beschaedigte unicard', 'sd_boss_notenverbuchung', 'd.3 client installation', 'sd_keine rueckantwort kunde', 'sd_asknet_und_dreamspark', 'sd_beantragung_unicard', 'sd_gastaufenthalter', 'sd_internationaloffice', 'sd_outlook anmeldung gestoert', 'unicard_restbetrag_auszahlung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'ub_unicard_unicard mit vollmacht abholen', 'sd_namensaenderung mitarbeiter', 'sd_itmc kurse anmeldebestaetigung', 'sd_zugriff_onlinedienste_rueckmeldung', 'benutzer zum redmine hinzufuegen', 'sd_unicard_gesperrte unicard entsperre', 'lsf freischaltung als mitarbeiter/in', 'sd_mail_als_anhang', 'sd-e-mail_adresse_funktional_beantragen', 'sd_goeke drucker', 'sd_unimail imap_pop3', 'sd_origin_workaround', 'sd_matlab lizenzdatei pc-pools', 'sd_outlook kontakte automatische aktualisierung', 'sd_sap konteneinsicht antrag', 'ohne betreff', 'sd_telefonantrag_änderung_neuantrag', 'sd_sophos download', 'sd_geraeteausleihe', 'studierendenausweis', 'sd_citavi', 'sd_laufzeit unimail account', 'sd_login_unibib ub-it', 'sd_tu_app_keine internetverbindung', 'sd_unicard_max_laufzeit', 'ub_unicard_zusendung der karte moeglich?', 'sd_telefonbuch-eintrag_änderung', 'ub_drucker kopierer', 'windows 10', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'sd_tu-app feedback standard', 'sd_spam e-mail bekannt meldung', 'sd_spss_online_bestellung', 'sd_apple-on-campus', 'sd_studisek', 'sd_office 365 plus support', 'sd_sap_initialkennwort_englisch', 'sd_office365_asknet', 're: elektroarbeiten fuer leitsystem 2. und 3. 
obergeschoss', 'sd_login tu portale', 'ungueltiges ticket siehe journal', 'sd_sap_freischaltung ohne passwortaenderung', 'bd_unicard_geldkarte_laden', 'sd_verlust/antrag unicard', 'sd_unimail zu exchange', 'citavi_lizenzschluessel_nicht bekommen', 'sd_plotauftrag_zv', 'sd_citavi_support', 'sd_antworten_korrekt', 'sd_wlan-gastkonto', 'sd_antwort_phishingmail', 'bd_unicard_freigabe_beantragung', 'sd_origin nur noch eine seriennummer', 'cm_asiexception', 'sd_login_tu_portale', 'sd_webmailer_thread-anzeige', 'apps_dms-passwort d.3', 'apps_redmine_repository', 'sd_uniaccount_passwortaenderung', 'sd_phishing', 'sd_sap_firefox_esr', 'vpn verbindung fuer unitymedia kunden', 'sd_kurs-angebote anmeldung', 'sd_unicard fehlerhafte geldbuchung', 'sd_uniaccount_ehemalige_passwortaenderung', 'sd_sap_dienstreise', 'cm_lsf-boss_freischaltung', 'wlan', 'uni card', 'sd_webmailer einrichtung weiterleitung', 'spam ohne tu bezug', 'sd_outlook_in_exchange_einbinden', 'sd_wlan_beratung', 'sd_uniaccount_dauer freischaltung', 'sd_sap_konteneinsicht_ workaround', 'sd_vpn anleitungen', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_unicard_abholung', 'sd_vpn_probleme_mit_unitymedia', 'sd_diensthandy beschaffung', 'sd_unicard_defekt', 'sd_freischaltung uniaccount verzoegert', 'sd_kurs-angebote itmc', 'bd_goeke_allgemein', 'sd_uniaccount_ehemalige_studierende', 'sd_stellenausschreibung schwarzes brett', 'freischaltung uniaccount', 'sd_unicard_workaround_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'ub_geldchip-problem bei uc', 'sd_semesterticket', 'problem mit der beantragung von der unicard', 'sd_citavi bestellung', 'sd_immatrikulationsbescheigung_druckfehler', 'sd_vpn_aktualisierung', 'vpn_ipsec_stoerung', 'sd_dreamspark', 'ub_namensaenderung', 'sd_immatrikulationsbescheinigung_portal', 'ub_prod_neue unicard bei beschaedigung', 'sd_vpn_webvpn', 'sd_telefonbuch_prof_eintragung', 'sd_kontakt_asknet', 'probleme mit unicard', 'sd_office 356 plus bestellung', 'sd_gmx_web.de', 'fehlender eintrag im elektronischen telefonbuch', 'ub_prod_namenskorrektur_student', 'einrichtung des eduroam netzwerks', 'sd_sap_initialkennwort', 'sd_boss-bescheinigung', 'sd_wlan passwort setzen', 'sd_aktivierung uniaccount', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_heirat_namensaenderung_student', 'ub_unicard_spaetere abholung moeglich?', 'unicard nochmal beantragen', 'sd_studisek_buchung_semesterbeitrag', 'sd_pruefungsamt', 'unicard vergessen abzuholen und nicht mehr da', 'sd_antrag funktionale mailadresse', 'sd_email_namensaenderung', 'sd_telefonbuch, neues system', 'sd_account_abmelden', 'ub_unicard_abholungszeiten'] -labellist = ['sd_antworten_korrekt', 'sd_kurs-angebote anmeldung', 'sd_semesterticket', 'apps_dms-passwort d.3', 'freischaltung uniaccount', 'sd_heirat_namensaenderung_student', 'bd_unicard_freigabe_beantragung', 'sd_uniaccount_ehemalige_studierende', 'sd_sap_dienstreise', 'sd_origin_workaround', 'sd_uniaccount_ehemalige_passwortaenderung', 'fehlender eintrag im elektronischen telefonbuch', 'wlan', 'sd_tu-app feedback standard', 'sd_wlan_beratung', 'sd_uniaccount_passwortaenderung', 're: elektroarbeiten fuer leitsystem 2. und 3. 
obergeschoss', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_unicard_spaetere abholung moeglich?', 'sd_citavi_support', 'sd_outlook kontakte automatische aktualisierung', 'sd_origin nur noch eine seriennummer', 'lsf freischaltung als mitarbeiter/in', 'cm_asiexception', 'sd_freischaltung uniaccount verzoegert', 'ub_unicard_zusendung der karte moeglich?', 'sd_login_unibib ub-it', 'uni card', 'sd_outlook anmeldung gestoert', 'd.3 client installation', 'ub_unicard_abholungszeiten', 'sd_antwort_phishingmail', 'sd_matlab lizenzdatei pc-pools', 'sd_sap_initialkennwort', 'sd_sap_freischaltung ohne passwortaenderung', 'sd_spss_online_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'sd_keine rueckantwort kunde', 'sd_unimail imap_pop3', 'sd_beantragung_unicard', 'sd_unicard_gesperrte unicard entsperre', 'sd_internationaloffice', 'unicard nochmal beantragen', 'sd_stellenausschreibung schwarzes brett', 'sd_sophos download', 'cm_lsf-boss_freischaltung', 'sd_verlust/antrag unicard', 'vpn_ipsec_stoerung', 'sd_account_abmelden', 'sd_outlook_in_exchange_einbinden', 'ub_namensaenderung', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'unicard vergessen abzuholen und nicht mehr da', 'apps_redmine_repository', 'einrichtung des eduroam netzwerks', 'sd_unicard_max_laufzeit', 'sd_gmx_web.de', 'sd_unicard fehlerhafte geldbuchung', 'sd_geraeteausleihe', 'spam ohne tu bezug', 'sd_uniaccount_dauer freischaltung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'sd_office 365 plus support', 'sd_unicard_defekt', 'sd_phishing', 'sd_goeke drucker', 'ub_unicard_unicard mit vollmacht abholen', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_pruefungsamt', 'sd_aktivierung uniaccount', 'sd_boss-bescheinigung', 'sd_sap_initialkennwort_englisch', 'bd_unicard_geldkarte_laden', 'sd_telefonbuch-eintrag_änderung', 'vpn verbindung fuer unitymedia kunden', 'sd_studisek', 'sd_antrag funktionale mailadresse', 'sd_asknet_und_dreamspark', 'sd_unicard_workaround_bestellung', 'sd_sap_firefox_esr', 'sd_vpn anleitungen', 'sd_office365_asknet', 'citavi_lizenzschluessel_nicht bekommen', 'sd_sap konteneinsicht antrag', 'sd_spam e-mail bekannt meldung', 'ub_prod_namenskorrektur_student', 'ub_beschaedigte unicard', 'sd_namensaenderung mitarbeiter', 'sd_mail_als_anhang', 'benutzer zum redmine hinzufuegen', 'sd_login_tu_portale', 'sd_email_namensaenderung', 'windows 10', 'ungueltiges ticket siehe journal', 'sd_vpn_temporaerer fehler ub', 'ub_prod_neue unicard bei beschaedigung', 'sd_dreamspark', 'sd_webmailer einrichtung weiterleitung', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_studisek_buchung_semesterbeitrag', 'sd_immatrikulationsbescheinigung_portal', 'sd_vpn_probleme_mit_unitymedia', 'sd-e-mail_adresse_funktional_beantragen', 'sd_diensthandy beschaffung', 'sd_vpn_webvpn', 'sd_laufzeit unimail account', 'sd_citavi', 'problem mit der beantragung von der unicard', 'sd_kurs-angebote itmc', 'sd_telefonbuch, neues system', 'sd_login tu portale', 'sd_wlan passwort setzen', 'sd_zugriff_onlinedienste_rueckmeldung', 'unicard_restbetrag_auszahlung', 'sd_immatrikulationsbescheigung_druckfehler', 'bd_unicard_nicht_eingeschrieben', 'sd_unimail zu exchange', 'sd_wlan-gastkonto', 'probleme mit unicard', 'sd_telefonbuch_prof_eintragung', 'sd_vpn_aktualisierung', 'sd_apple-on-campus', 'bd_goeke_allgemein', 'studierendenausweis', 'ub_drucker kopierer', 'sd_unicard_abholung', 'sd_office 356 plus bestellung', 'ohne betreff', 'sd_tu_app_keine internetverbindung', 'sd_boss_notenverbuchung', 'ub_geldchip-problem bei 
uc', 'sd_itmc kurse anmeldebestaetigung', 'sd_citavi bestellung', 'sd_telefonantrag_änderung_neuantrag', 'sd_sap_konteneinsicht_ workaround', 'sd_kontakt_asknet', 'sd_plotauftrag_zv', 'sd_webmailer_thread-anzeige', 'sd_gastaufenthalter'] - -for l in used_list: - if l not in labellist: - print(l) - -print(len(used_list)) -print(len(labellist)) - # load corpus corpus_de_path = FILEPATH + config.get("de_corpus", "path") preCorpus_name = "de" + "_pre_ticket" @@ -87,17 +35,133 @@ corpus_train = corpus[0:split_index] corpus_test = corpus[split_index:len(corpus)-1] +# question: how many tickets per topic? + +kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";") + +ticket2kb_dict = {} + +for line in kb2ticket_gen: + + ticket_id = line[0] + kb_id = line[1] + + ticket2kb_dict[ticket_id] = kb_id +# {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...} # kb2keywords_dict + +kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";") + +next(kb2keywords_gen, None) # skip first line("ArticleID";"Subject";"Keywords";...) + +kb2keywords_dict = {} + +kb_keywords=False + +for line in kb2keywords_gen: + + kb_id = line[0] + + subject = line[1] + + keywords = line[2] + keywords_list = [normalize(x) for x in str(keywords).split(",")] + + if kb_id not in kb2keywords_dict.keys(): + kb2keywords_dict[kb_id] = [] + + if kb_keywords: + for item in keywords_list: + if item != "": + kb2keywords_dict[kb_id].append(item) + + else: + kb2keywords_dict[kb_id].append(subject) + +# remove all empty items +kb2keywords_dict = {k: v for k, v in kb2keywords_dict.items() if len(v) != 0} + # {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...} - -# darken the lda image -# excerpts + +cat_dict = {} +count_dict={} +keywords_dict={} +for doc in corpus: + + category_name_ = doc.metadata["categoryName"] + if category_name_ not in cat_dict.keys(): + cat_dict[category_name_] = 1 + else: + cat_dict[category_name_] += 1 + + + + try: + x=doc.metadata["TicketNumber"] + + x=ticket2kb_dict[x] + + x=kb2keywords_dict[x] + + except KeyError: + pass # ticket has no kb entry, or the kb entry has no keywords + +for k,v in kb2keywords_dict.items(): #str,list + for elem in v: + + if elem not in count_dict.keys(): + count_dict[elem] = 1 + else: + count_dict[elem] += 1 + + + +import operator +""" +sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1)) + +for k,v in sorted_dict: + print(k,v) + +print(len(sorted_dict)) +""" + + + +kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";") +next(kb2keywords_gen, None) # skip first + +cnt=0 +for kb in kb2keywords_gen: + cnt +=1 +print(str(cnt)) + +# note: the counting loop above exhausted the generator; re-read the csv before iterating again +kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";") +next(kb2keywords_gen, None) # skip first + + +count_dict = {} + +# "ArticleID";"Subject";"Keywords";"Solution";"SolutionText";"CreatedOn" + +for kb_entry in kb2keywords_gen: + + entry_ = kb_entry[1] + + if entry_ not in count_dict.keys(): + count_dict[entry_] = 1 + else: + count_dict[entry_] += 1 + + +sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1)) + +#for k,v in sorted_dict: +# print(k,v) + +#print(len(sorted_dict)) -import numpy as np -matplotlib.use('Agg') -import matplotlib.pyplot as plt @@ -110,6 +174,64 @@ import matplotlib.pyplot as plt end = time.time() print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start)) + + +""" +# kb2keywords_dict +kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", + delimiter=";") +used_kb=[] +for kb in kb2keywords_gen: + used_kb.append(kb[1]) +print("used_kb: 
{}".format(len(list(set(used_kb))))) + + + + +#"ArticleID";"Subject";"Keywords";"Solution";"SolutionText";"CreatedOn" +kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", # + delimiter=";") +next(kb2keywords_gen, None) # skip first +cat_lst=[] +sub_lst=[] +key_lst=[] +for kb in kb2keywords_gen: + cat_lst.append(kb[0]) + sub_lst.append(kb[1]) + key_lst.append(kb[2].split(",")) + +cats_setlist = list(set(cat_lst)) +print("cats: {}".format(len(cats_setlist))) +print(cats_setlist[0:20]) +print() + + +print("sub_lst: {}".format(len(sub_lst))) +sub_setlist = list(set(sub_lst)) #frage: hat wirklich jeder kb_eintrag ein anderesn Betreff? +print("sub_setlist: {}".format(len(sub_setlist))) +#print(sub_setlist[0:20]) +print() + +key_lst = [item for sublist in key_lst for item in sublist] #flatten list +key_setlist = list(set(key_lst)) +print("key_setlist: {}".format(len(key_setlist))) +#print(key_setlist[0:20]) + +print("\n\n\n\n") +""" + +""" +used_list = ['bd_unicard_nicht_eingeschrieben', 'sd_vpn_temporaerer fehler ub', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_beschaedigte unicard', 'sd_boss_notenverbuchung', 'd.3 client installation', 'sd_keine rueckantwort kunde', 'sd_asknet_und_dreamspark', 'sd_beantragung_unicard', 'sd_gastaufenthalter', 'sd_internationaloffice', 'sd_outlook anmeldung gestoert', 'unicard_restbetrag_auszahlung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'ub_unicard_unicard mit vollmacht abholen', 'sd_namensaenderung mitarbeiter', 'sd_itmc kurse anmeldebestaetigung', 'sd_zugriff_onlinedienste_rueckmeldung', 'benutzer zum redmine hinzufuegen', 'sd_unicard_gesperrte unicard entsperre', 'lsf freischaltung als mitarbeiter/in', 'sd_mail_als_anhang', 'sd-e-mail_adresse_funktional_beantragen', 'sd_goeke drucker', 'sd_unimail imap_pop3', 'sd_origin_workaround', 'sd_matlab lizenzdatei pc-pools', 'sd_outlook kontakte automatische aktualisierung', 'sd_sap konteneinsicht antrag', 'ohne betreff', 'sd_telefonantrag_änderung_neuantrag', 'sd_sophos download', 'sd_geraeteausleihe', 'studierendenausweis', 'sd_citavi', 'sd_laufzeit unimail account', 'sd_login_unibib ub-it', 'sd_tu_app_keine internetverbindung', 'sd_unicard_max_laufzeit', 'ub_unicard_zusendung der karte moeglich?', 'sd_telefonbuch-eintrag_änderung', 'ub_drucker kopierer', 'windows 10', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'sd_tu-app feedback standard', 'sd_spam e-mail bekannt meldung', 'sd_spss_online_bestellung', 'sd_apple-on-campus', 'sd_studisek', 'sd_office 365 plus support', 'sd_sap_initialkennwort_englisch', 'sd_office365_asknet', 're: elektroarbeiten fuer leitsystem 2. und 3. 
obergeschoss', 'sd_login tu portale', 'ungueltiges ticket siehe journal', 'sd_sap_freischaltung ohne passwortaenderung', 'bd_unicard_geldkarte_laden', 'sd_verlust/antrag unicard', 'sd_unimail zu exchange', 'citavi_lizenzschluessel_nicht bekommen', 'sd_plotauftrag_zv', 'sd_citavi_support', 'sd_antworten_korrekt', 'sd_wlan-gastkonto', 'sd_antwort_phishingmail', 'bd_unicard_freigabe_beantragung', 'sd_origin nur noch eine seriennummer', 'cm_asiexception', 'sd_login_tu_portale', 'sd_webmailer_thread-anzeige', 'apps_dms-passwort d.3', 'apps_redmine_repository', 'sd_uniaccount_passwortaenderung', 'sd_phishing', 'sd_sap_firefox_esr', 'vpn verbindung fuer unitymedia kunden', 'sd_kurs-angebote anmeldung', 'sd_unicard fehlerhafte geldbuchung', 'sd_uniaccount_ehemalige_passwortaenderung', 'sd_sap_dienstreise', 'cm_lsf-boss_freischaltung', 'wlan', 'uni card', 'sd_webmailer einrichtung weiterleitung', 'spam ohne tu bezug', 'sd_outlook_in_exchange_einbinden', 'sd_wlan_beratung', 'sd_uniaccount_dauer freischaltung', 'sd_sap_konteneinsicht_ workaround', 'sd_vpn anleitungen', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_unicard_abholung', 'sd_vpn_probleme_mit_unitymedia', 'sd_diensthandy beschaffung', 'sd_unicard_defekt', 'sd_freischaltung uniaccount verzoegert', 'sd_kurs-angebote itmc', 'bd_goeke_allgemein', 'sd_uniaccount_ehemalige_studierende', 'sd_stellenausschreibung schwarzes brett', 'freischaltung uniaccount', 'sd_unicard_workaround_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'ub_geldchip-problem bei uc', 'sd_semesterticket', 'problem mit der beantragung von der unicard', 'sd_citavi bestellung', 'sd_immatrikulationsbescheigung_druckfehler', 'sd_vpn_aktualisierung', 'vpn_ipsec_stoerung', 'sd_dreamspark', 'ub_namensaenderung', 'sd_immatrikulationsbescheinigung_portal', 'ub_prod_neue unicard bei beschaedigung', 'sd_vpn_webvpn', 'sd_telefonbuch_prof_eintragung', 'sd_kontakt_asknet', 'probleme mit unicard', 'sd_office 356 plus bestellung', 'sd_gmx_web.de', 'fehlender eintrag im elektronischen telefonbuch', 'ub_prod_namenskorrektur_student', 'einrichtung des eduroam netzwerks', 'sd_sap_initialkennwort', 'sd_boss-bescheinigung', 'sd_wlan passwort setzen', 'sd_aktivierung uniaccount', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_heirat_namensaenderung_student', 'ub_unicard_spaetere abholung moeglich?', 'unicard nochmal beantragen', 'sd_studisek_buchung_semesterbeitrag', 'sd_pruefungsamt', 'unicard vergessen abzuholen und nicht mehr da', 'sd_antrag funktionale mailadresse', 'sd_email_namensaenderung', 'sd_telefonbuch, neues system', 'sd_account_abmelden', 'ub_unicard_abholungszeiten'] +labellist = ['sd_antworten_korrekt', 'sd_kurs-angebote anmeldung', 'sd_semesterticket', 'apps_dms-passwort d.3', 'freischaltung uniaccount', 'sd_heirat_namensaenderung_student', 'bd_unicard_freigabe_beantragung', 'sd_uniaccount_ehemalige_studierende', 'sd_sap_dienstreise', 'sd_origin_workaround', 'sd_uniaccount_ehemalige_passwortaenderung', 'fehlender eintrag im elektronischen telefonbuch', 'wlan', 'sd_tu-app feedback standard', 'sd_wlan_beratung', 'sd_uniaccount_passwortaenderung', 're: elektroarbeiten fuer leitsystem 2. und 3. 
obergeschoss', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_unicard_spaetere abholung moeglich?', 'sd_citavi_support', 'sd_outlook kontakte automatische aktualisierung', 'sd_origin nur noch eine seriennummer', 'lsf freischaltung als mitarbeiter/in', 'cm_asiexception', 'sd_freischaltung uniaccount verzoegert', 'ub_unicard_zusendung der karte moeglich?', 'sd_login_unibib ub-it', 'uni card', 'sd_outlook anmeldung gestoert', 'd.3 client installation', 'ub_unicard_abholungszeiten', 'sd_antwort_phishingmail', 'sd_matlab lizenzdatei pc-pools', 'sd_sap_initialkennwort', 'sd_sap_freischaltung ohne passwortaenderung', 'sd_spss_online_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'sd_keine rueckantwort kunde', 'sd_unimail imap_pop3', 'sd_beantragung_unicard', 'sd_unicard_gesperrte unicard entsperre', 'sd_internationaloffice', 'unicard nochmal beantragen', 'sd_stellenausschreibung schwarzes brett', 'sd_sophos download', 'cm_lsf-boss_freischaltung', 'sd_verlust/antrag unicard', 'vpn_ipsec_stoerung', 'sd_account_abmelden', 'sd_outlook_in_exchange_einbinden', 'ub_namensaenderung', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'unicard vergessen abzuholen und nicht mehr da', 'apps_redmine_repository', 'einrichtung des eduroam netzwerks', 'sd_unicard_max_laufzeit', 'sd_gmx_web.de', 'sd_unicard fehlerhafte geldbuchung', 'sd_geraeteausleihe', 'spam ohne tu bezug', 'sd_uniaccount_dauer freischaltung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'sd_office 365 plus support', 'sd_unicard_defekt', 'sd_phishing', 'sd_goeke drucker', 'ub_unicard_unicard mit vollmacht abholen', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_pruefungsamt', 'sd_aktivierung uniaccount', 'sd_boss-bescheinigung', 'sd_sap_initialkennwort_englisch', 'bd_unicard_geldkarte_laden', 'sd_telefonbuch-eintrag_änderung', 'vpn verbindung fuer unitymedia kunden', 'sd_studisek', 'sd_antrag funktionale mailadresse', 'sd_asknet_und_dreamspark', 'sd_unicard_workaround_bestellung', 'sd_sap_firefox_esr', 'sd_vpn anleitungen', 'sd_office365_asknet', 'citavi_lizenzschluessel_nicht bekommen', 'sd_sap konteneinsicht antrag', 'sd_spam e-mail bekannt meldung', 'ub_prod_namenskorrektur_student', 'ub_beschaedigte unicard', 'sd_namensaenderung mitarbeiter', 'sd_mail_als_anhang', 'benutzer zum redmine hinzufuegen', 'sd_login_tu_portale', 'sd_email_namensaenderung', 'windows 10', 'ungueltiges ticket siehe journal', 'sd_vpn_temporaerer fehler ub', 'ub_prod_neue unicard bei beschaedigung', 'sd_dreamspark', 'sd_webmailer einrichtung weiterleitung', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_studisek_buchung_semesterbeitrag', 'sd_immatrikulationsbescheinigung_portal', 'sd_vpn_probleme_mit_unitymedia', 'sd-e-mail_adresse_funktional_beantragen', 'sd_diensthandy beschaffung', 'sd_vpn_webvpn', 'sd_laufzeit unimail account', 'sd_citavi', 'problem mit der beantragung von der unicard', 'sd_kurs-angebote itmc', 'sd_telefonbuch, neues system', 'sd_login tu portale', 'sd_wlan passwort setzen', 'sd_zugriff_onlinedienste_rueckmeldung', 'unicard_restbetrag_auszahlung', 'sd_immatrikulationsbescheigung_druckfehler', 'bd_unicard_nicht_eingeschrieben', 'sd_unimail zu exchange', 'sd_wlan-gastkonto', 'probleme mit unicard', 'sd_telefonbuch_prof_eintragung', 'sd_vpn_aktualisierung', 'sd_apple-on-campus', 'bd_goeke_allgemein', 'studierendenausweis', 'ub_drucker kopierer', 'sd_unicard_abholung', 'sd_office 356 plus bestellung', 'ohne betreff', 'sd_tu_app_keine internetverbindung', 'sd_boss_notenverbuchung', 'ub_geldchip-problem bei 
uc', 'sd_itmc kurse anmeldebestaetigung', 'sd_citavi bestellung', 'sd_telefonantrag_änderung_neuantrag', 'sd_sap_konteneinsicht_ workaround', 'sd_kontakt_asknet', 'sd_plotauftrag_zv', 'sd_webmailer_thread-anzeige', 'sd_gastaufenthalter'] + +for l in used_list: + if l not in labellist: + print(l) + +print(len(used_list)) +print(len(labellist)) +""" + """ maybe split categories into subcategories diff --git a/topicModeling.py b/topicModeling.py index a8fd351..642eaf8 100644 --- a/topicModeling.py +++ b/topicModeling.py @@ -240,16 +240,20 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7): logprint("") logprint("start Category-LLDA:") + + # build dictionary of ticket categories labelist = [] - for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): - labelist.append(texdoc.metadata["categoryName"]) + for doc in corpus: + labelist.append(normalize(doc.metadata["categoryName"])) labelist = list(set(labelist)) print("len(labelist): {}".format(len(labelist))) labeldict = {k: v for v, k in enumerate(labelist)} + + def gen_cat_lines(textacyCorpus, labeldict): """ generates [topic1, topic2....] tok1 tok2 tok3 out of the corpus""" @@ -404,6 +408,213 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa logprint("\n\n\nTime Elapsed {1}-LLDA :{0} min\n\n".format((end - start) / 60,"Keyword" if kb_keywords else "Subject")) + +def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7): + + start = time.time() + logprint("") + logprint("start LLDA:") + + + + + + + # kb2keywords_dict / kb2subj_dict {str : [str]} + + kb2keywords_dict = {} + kb2subjects_dict = {} + + kb_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";") + next(kb_gen, None) # skip first line "ArticleID";"Subject";"Keywords";... 
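+ # Hypothetical KB_2017-09-13.csv row for illustration (not real export data): + # KBA10091;"Citavi";"citavi, citavi 5";... -> line[0]=kb_id, line[1]=subject, line[2]=comma-separated keywords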
+ + for line in kb_gen: + + kb_id = line[0] + + + subject = normalize(line[1]) + + keywords = [normalize(x) for x in str(line[2]).split(",")] + + + if kb_id not in kb2keywords_dict.keys(): + kb2keywords_dict[kb_id] = keywords if keywords != [''] else ["DEFAULT"] + else: + kb2keywords_dict[kb_id] = kb2keywords_dict[kb_id] + keywords + + + if kb_id not in kb2subjects_dict.keys(): + kb2subjects_dict[kb_id] = [subject if subject != '' else "DEFAULT"] + else: + kb2subjects_dict[kb_id].append(subject) + + + + + + + + + + # ticket2kbs_dict + ticket2kbs_dict = {} + kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";") + next(kb2ticket_gen, None) # skip first line"TicketNumber";"ArticleID" + + for line in kb2ticket_gen: + ticket_id = line[0] + kb_id = line[1] + + if ticket_id not in ticket2kbs_dict.keys(): + ticket2kbs_dict[ticket_id] = [kb_id] + else: + ticket2kbs_dict[ticket_id].append(kb_id) + + + + + + + + + # ticket2keywords + ticket2keywords_dict = {} # {str:[str]} + + for ticket_id, kb_ids in ticket2kbs_dict.items(): + + if ticket_id not in ticket2keywords_dict.keys(): + ticket2keywords_dict[ticket_id] = [] + + for kb_id in kb_ids: + ticket2keywords_dict[ticket_id].append(kb2keywords_dict[kb_id]) + + + ticket2keywords_dict[ticket_id] = flatten(ticket2keywords_dict[ticket_id]) + + + + + # ticket2subjects + ticket2subjects_dict = {} # {str:[str]} + + for ticket_id, kb_ids in ticket2kbs_dict.items(): + + if ticket_id not in ticket2subjects_dict.keys(): + ticket2subjects_dict[ticket_id] = [] + + for kb_id in kb_ids: + ticket2subjects_dict[ticket_id].append(kb2subjects_dict[kb_id]) + + + ticket2subjects_dict[ticket_id] = flatten(ticket2subjects_dict[ticket_id]) + + + + # kb2keywords_dict {'KBA10230': ['DEFAULT'], 'KBA10129': ['DEFAULT'], 'KBA10287': ['sd_ansys_informationen'], } len = 260 + #kb2subjects_dict {'KBA10230': ['unicard nochmal beantragen'], 'KBA10129': ['sd_entsperrung unicard nach verlust/wiederfinden'], } len = 260 + # ticket2kbs_dict {'INC44526': ['KBA10056'], 'INC67205': ['KBA10056'], } len = 4832 + # ticket2keywords_dict {'INC44526': ['DEFAULT'], 'INC67205': ['DEFAULT'], 'INC71863': ['DEFAULT'], 'INC44392': ['asknet'] } len=4832 + # ticket2subjects_dict {'INC44526': ['sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)'], len=4832 + + + # question: how many tickets per topic? + count_dict = {} + for v in ticket2kbs_dict.values(): + for kb in v: + if kb in count_dict.keys(): + count_dict[kb] +=1 + else: + count_dict[kb] = 1 + import operator + + sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1)) + print("kb entries used: {}".format(len(sorted_dict))) + for k,v in sorted_dict: + print(k,kb2subjects_dict[k],v) # todo: do the same with keywords + + + # todo: continue here + + + # todo question: how many kb entries were used in total? 
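+ # Sketch of the mapping built below, using labels that appear in the example dicts above: + # labelist = ['DEFAULT', 'asknet', 'citavi', ...] + # labeldict = {'DEFAULT': 0, 'asknet': 1, 'citavi': 2, ...} maps each label string to the numeric topic id JGibbsLLDA expects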
+ + labelist = ticket2keywords_dict.values() + labelist = flatten(labelist) + labelist = list(set(labelist)) + labeldict = {k: v for v, k in enumerate(labelist)} + + + + + + def gen_key_lines(textacyCorpus, labeldict, ticket2keywords_dict): + for doc in corpus: + + ticket_number = doc.metadata["TicketNumber"] + + keywords = ticket2keywords_dict.get(ticket_number, ['DEFAULT']) + + if keywords != ['DEFAULT']: + + label = "" + for kw in keywords: + label = label + str(labeldict.get(normalize(str(kw)), labeldict['DEFAULT'])) + " " + + yield "[ " + label + "] " + doc.text + + keys_line_gen = gen_key_lines(corpus, labeldict, ticket2keywords_dict) + + path2save_keys_results = path2save_results + "_kb_keys_llda_{}".format("top" + str(top_topic_words)) + + jgibbsLLDA(labeldict, keys_line_gen, path2save_keys_results, top_topic_words=top_topic_words) + + + + + + """ + def gen_subj_lines(textacyCorpus, labeldict, ticket2subjects_dict): + + for doc in corpus: + + ticket_number = doc.metadata["TicketNumber"] + + keywords = ticket2subjects_dict.get(ticket_number, ['DEFAULT']) + + if keywords != ['DEFAULT']: + + label = "" + for kw in keywords: + label = label + str(labeldict.get(normalize(str(kw)), len(labeldict))) + " " + + yield "[ " + label + "] " + doc.text + """ + + labelist = ticket2subjects_dict.values() + labelist = flatten(labelist) + labelist = list(set(labelist)) + labeldict = {k: v for v, k in enumerate(labelist)} + labeldict.update({'DEFAULT' : len(labeldict)}) + + subj_line_gen = gen_key_lines(corpus, labeldict, ticket2subjects_dict) + + path2save_subj_results = path2save_results + "_kb_subj_llda_{}".format("top" + str(top_topic_words)) + + jgibbsLLDA(labeldict, subj_line_gen, path2save_subj_results, top_topic_words=top_topic_words) + + + + + + end = time.time() + logprint("\n\n\nTime Elapsed LLDA :{0} min\n\n".format((end - start) / 60)) + + + + + def main( algorithm="llda"): @@ -427,14 +638,16 @@ def main( algorithm="llda"): if algorithm == "llda": top_topic_words = 5 + jgibbsLLDA_category(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words) + jgibbsLLDA_KB_v2(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words) kb_keywords = False - jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords) + #jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords) kb_keywords = True - jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords) + #jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords) diff --git a/topicModeling_1711_0846.py b/topicModeling_1711_0846.py new file mode 100644 index 0000000..9493640 --- /dev/null +++ b/topicModeling_1711_0846.py @@ -0,0 +1,727 @@ +# -*- coding: utf-8 -*- + +from datetime import datetime +import draw +import draw1 +import time +import numpy as np + +import csv +import sys +import json +import os.path +import subprocess +from textacy import Vectorizer, viz + +from miscellaneous import * +import textacy +from scipy import * + +import os + +csv.field_size_limit(sys.maxsize) +FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" + + +# load config +config_ini = FILEPATH + "config.ini" + +config = ConfigParser.ConfigParser() +with open(config_ini) as f: + config.read_file(f) + + +def label2ID(label, labeldict): + return labeldict.get(label, len(labeldict)) + + +def 
generate_lablelID_lines(textacyCorpus, labeldict): + for doc in textacyCorpus: + # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi + yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text + +""" +def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True): + logprint(str("ngrams: {0}".format(ngrams))) + logprint(str("min_df: {0}".format(min_df))) + logprint(str("max_df: {0}".format(max_df))) + logprint(str("named_entities: {0}".format(named_entities))) + + # printlog("vectorize corpi...") + vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) + + terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus) + doc_term_matrix = vectorizer.fit_transform(terms_list) + id2term = vectorizer.__getattribute__("id_to_term") + + for t in terms_list: + print(t) + logprint("doc_term_matrix: {0}".format(doc_term_matrix)) + logprint("id2term: {0}".format(id2term)) +""" + +def textacyTopicModeling(corpus, + n_topics = 15, top_topic_words = 7, top_document_labels_per_topic = 5, + ngrams = 1, min_df=1, max_df=1.0, + topicModel='lda'): + + + + + n_terms = int(n_topics * top_topic_words) + sort_terms_by = 'seriation' # 'seriation', 'weight', 'index', 'alphabetical' + rank_terms_by = 'corpus' # 'corpus', 'topic' + + + + + logprint( + "############### Topic Modeling {0} ###########################".format( + topicModel)) + logprint(str("ngrams: {0}".format(ngrams))) + logprint(str("min_df: {0}".format(min_df))) + logprint(str("max_df: {0}".format(max_df))) + logprint(str("n_topics: {0}".format(n_topics))) + logprint("\n") + + start = time.time() + + # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix + weighting = ('tf' if topicModel == 'lda' else 'tfidf') + + + + + + + #################### vectorize corpi #################### + + vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) + + terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus) + doc_term_matrix = vectorizer.fit_transform(terms_list) + id2term = vectorizer.__getattribute__("id_to_term") + + # printlog("terms_list: {0}".format(list(terms_list))) + # printlog("doc_term_matrix: {0}".format(doc_term_matrix)) + + + + + + ##################### Initialize and train a topic model ############################################## + + model = textacy.tm.TopicModel(topicModel, n_topics=n_topics) + + model.fit(doc_term_matrix) + + doc_topic_matrix = model.transform(doc_term_matrix) + + + for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words): + logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms))) + + for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic): + logprint(topic_idx) + for j in top_docs: + logprint(corpus[j].metadata['categoryName']) + + + + + ####################### termite plot ################################################################### + + grams_label = "uni" if ngrams == 1 else "bi" + """ + model.termite_plot(doc_term_matrix, id2term, + + n_terms=n_terms, + sort_terms_by=sort_terms_by, + rank_terms_by=rank_terms_by+'_weight', + + + save= FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label,topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by)) + """ + draw1.termite_plot(model,doc_term_matrix, id2term, + + n_terms=n_terms, + 
sort_terms_by=sort_terms_by, + rank_terms_by=rank_terms_by + '_weight', + + save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label, topicModel, n_topics, + n_terms, sort_terms_by, rank_terms_by)) + + end = time.time() + logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel)) + + + +def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7): + start = time.time() + + + + jgibbsLLDA_root = FILEPATH + "java_LabledLDA/" + + LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) + + + + # build dictionary of ticket categories + labelist = [] + for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): + labelist.append(texdoc.metadata["categoryName"]) + + + labeldict = {k: v for v, k in enumerate(labelist)} + reverse_labeldict = {v: k for k, v in labeldict.items()} + + # and save it + labeldict_path = FILEPATH + "results/labeldict.txt" + with open(labeldict_path, 'w') as file: + file.write(json.dumps(labeldict)) + + + n_topics = len(labeldict) #+1 #default-topic + + + + # create file with label_IDs (input for llda) + textacy.fileio.write_file_lines(generate_lablelID_lines(corpus, labeldict), filepath=LLDA_filepath) + + # wait for file to exist + while not os.path.exists(LLDA_filepath): + time.sleep(1) + + logprint("") + logprint("start LLDA:") + + + # run JGibbsLLDA file + + FNULL = open(os.devnull, 'w') # suppress output + cmd_jgibbs_java = ["java", "-cp", + "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format( + jgibbsLLDA_root), + "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile", + "tickets.gz", + "-twords", str(top_topic_words), "-ntopics", str(n_topics)] + subprocess.call(cmd_jgibbs_java, stdout=FNULL) + + + # NOTE: the output files are hidden; they can be found in models/
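+ # Expected layout of the decompressed .twords output; the weights here are illustrative only: + # Topic 0: + # unicard 0.2416 + # telefon 0.0024 + # the parsing below relies on this "Topic <n>" header / "<term> <weight>" line format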
+ cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)] + output = subprocess.check_output(cmd_gzip).decode("utf-8") + + + topic_regex = re.compile(r'Topic [0-9]*') + + ##################################### + # todo: save results to a file based on `results` + result = [] + + for line in output.splitlines(): + findall = topic_regex.findall(line) + if len(findall) != 0: + try: + index = int(findall[0].split()[1]) + result.append("Topic {} {}:".format(index, reverse_labeldict[index])) + + except (ValueError, KeyError): + result.append(line) + + else: + result.append(line) + + textacy.fileio.write_file_lines(result, path2save_results+".txt") + ##################################### + + results = [] + res_dict = {} + count =0 + for line in output.splitlines(): + + findall = topic_regex.findall(line) + + if len(findall) != 0: + + if len(res_dict) != 0: + results.append(res_dict) # append the previous dict (it is complete at this point) + + index = int(findall[0].split()[1]) + + res_dict = {index : str(reverse_labeldict[index]) } + + else: + splitted = line.split() + res_dict[splitted[0]] = float(splitted[1]) + """ + ### print terms that are topics + for s in list(res_dict.values()): + if isinstance(s,str) and splitted[0] in s: + vals = list(res_dict.values()) + keys = list(res_dict.keys()) + for v in vals: + if not isinstance(v,float): + print("{}".format(v)) + print("{}".format(splitted[0])) + count +=1 + print() + ### + """ + + if len(res_dict) != 0: + results.append(res_dict) # append the last dict + + #print(count) + #print(float(count)/float(len(labelist))) + + + + + # {0: 'betrieb', 'service': 0.24162679425837305, 'support': 0.24162679425837305, 'browser': 0.24162679425837305, 'unicard': 0.24162679425837305, 'telefon': 0.0023923444976076593} + + + # collect every term in the results into one list + + terms=[] + for res in results: + for key,value in res.items(): + if not isinstance(key, int) and not key in terms: + terms.append(key) + + term2id = {t:i for i,t in enumerate(terms)} # and into a dict + + ################# termite plot ##################################################################### + + #term_topic_weights.shape = (len(term_ids), len(topic_ids)) + + + #topic_labels = tuple(labelist) + + topic_labels = list(range(len(labelist))) + term_labels = list(range(len(term2id))) #tuple([key for key in term2id.keys()]) + + + term_topic_weights = np.zeros((len(term2id),len(topic_labels))) + + + + for i,res in enumerate(results): + + for key,value in res.items(): + + if not isinstance(key, int): + term_topic_weights[term2id[key]][i] = value + term_labels[term2id[key]] = key + else: + topic_labels[i] = reverse_labeldict[key] + + + #viz.draw_termite_plot(term_topic_weights, topic_labels, term_labels, save=path2save_results+".png") + draw.draw_termite( + term_topic_weights, topic_labels, term_labels, save=path2save_results+".png") + + + end = time.time() + logprint("Time Elapsed Topic Modeling JGibbsLLDA:{0} min\n".format((end - start) / 60)) + + + + +def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=False): + + jgibbsLLDA_root = FILEPATH + "java_LabledLDA/" + LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) + + + + + + + # ticket2kb_dict + + kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";") + + ticket2kb_dict = {} #{'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...} + for line in kb2ticket_gen: + ticket_id = line[0] + kb_id = line[1] +
ticket2kb_dict[ticket_id] = kb_id + ############# + + + + # kb2keywords_dict + + kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";") #"ArticleID";"Subject";"Keywords";..... + next(kb2keywords_gen,None) #skip first + kb2keywords_dict = {} + + for lino in kb2keywords_gen: + kb_id = lino[0] + kb2keywords_dict[kb_id] = [] + + subject = lino[1] + + keywords = lino[2] + + keywords_list = [x.lower().strip() for x in map(replaceRockDots(),str(keywords).split(","))] + + if kb_keywords: + for item in keywords_list: + if item != "": + kb2keywords_dict[kb_id].append(item) + + else: + kb2keywords_dict[kb_id].append(subject) + + + #remove all empty items + kb2keywords_dict = { k : v for k,v in kb2keywords_dict.items() if len(v) != 0} + ############### + + + #keywords2kb_dict + keywords2kb_dict = {} + for kb_id, lst in kb2keywords_dict.items(): + for l in lst: + if l not in keywords2kb_dict.keys(): + keywords2kb_dict[l] = [kb_id] + else: + keywords2kb_dict[l].append(kb_id) + ############ + + + # idea: topic_ID -> KB_ID -> keywords / subject -> llda + + + + # ticket2kb_dict {'INC65627': 'KBA10044', 'INC66057': 'KBA10009', ...} + + # kb2keywords_dict {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...} + + # keywords2kb_dict {'unicard namensaenderung': ['KBA10276'], 'vpn': ['KBA10063'], 'outlook_exchange': ['KBA10181'], ...} + + + # Look for actually used keywords + used_keywords = [] + for doc in corpus: + ticket_number = doc.metadata["TicketNumber"] + + kb_number = ticket2kb_dict.get(ticket_number, None) + + keywords = kb2keywords_dict.get(kb_number, None) + + if keywords and kb_number: + used_keywords.append(list(map(normalize,keywords))) + + kb_entries_used = (len(list(set([kb for kb in ticket2kb_dict.values()])))) + print("kb_entries_used: {}".format(kb_entries_used)) + + labelist = [item for sublist in used_keywords for item in sublist] + labelist = list(set(labelist)) + print("len(labelist): {}".format(len(labelist))) + + + labeldict = {k: v for v, k in enumerate(labelist)} + labeldict_rev = {v: k for k, v in labeldict.items()} + print("labeldict created") + + def genos_linos(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict): + + for doc in textacyCorpus: + + ticket_number = doc.metadata["TicketNumber"] + + kb_number = ticket2kb_dict.get(ticket_number, None) + + + + keywords = kb2keywords_dict.get(kb_number, None) + + if keywords is not None: + pass + if keywords and kb_number: + + label = "" + for kw in keywords: + label = label + str(labeldict.get( normalize(str(kw)) , len(labeldict))) + " " + + yield "[ " + label + "] " + doc.text + + line_gen = genos_linos(corpus, labeldict, ticket2kb_dict, kb2keywords_dict) + + + + + + + textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath) + + + + + + + # wait for file to exist + while not os.path.exists(LLDA_filepath): + time.sleep(1) + + logprint("") + logprint("start LLDA:") + + + # run JGibbsLLDA file + + n_topics = len(labeldict) #+1 #default-topic + + FNULL = open(os.devnull, 'w') # suppress output + cmd_jgibbs_java = ["java", "-cp", + "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format( + jgibbsLLDA_root), + "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile", + "tickets.gz", + "-twords", str(top_topic_words), "-ntopics", str(n_topics)] + subprocess.call(cmd_jgibbs_java, stdout=FNULL) + + + # NOTE: the output files are hidden; they can be found in models/
+ cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)] + output = subprocess.check_output(cmd_gzip).decode("utf-8") + + + topic_regex = re.compile(r'Topic [0-9]*') + + ##################################### + # todo: save results to a file based on `results` + result = [] + + for line in output.splitlines(): + findall = topic_regex.findall(line) + if len(findall) != 0: + try: + index = int(findall[0].split()[1]) + result.append("Topic {} {}:".format(index, labeldict_rev[index])) + + except (ValueError, KeyError): + result.append(line) + + else: + result.append(line) + + textacy.fileio.write_file_lines(result, path2save_results+".txt") + ##################################### + + results = [] + res_dict = {} + count =0 + for line in output.splitlines(): + + findall = topic_regex.findall(line) + + if len(findall) != 0: + + if len(res_dict) != 0: + results.append(res_dict) # append the previous dict (it is complete at this point) + + index = int(findall[0].split()[1]) + + res_dict = {index : str(labeldict_rev[index]) } + + else: + splitted = line.split() + res_dict[splitted[0]] = float(splitted[1]) + + if len(res_dict) != 0: + results.append(res_dict) # append the last dict + + + # collect every term in the results into one list + + terms=[] + for res in results: + for key,value in res.items(): + if not isinstance(key, int) and not key in terms: + terms.append(key) + + term2id = {t:i for i,t in enumerate(terms)} # and into a dict + + ################# termite plot ##################################################################### + topic_labels = list(range(len(labelist))) + term_labels = list(range(len(term2id))) #tuple([key for key in term2id.keys()]) + + + term_topic_weights = np.zeros((len(term2id),len(topic_labels))) + + for i,res in enumerate(results): + + for key,value in res.items(): + + if not isinstance(key, int): + term_topic_weights[term2id[key]][i] = value + term_labels[term2id[key]] = key + else: + topic_labels[i] = labeldict_rev[key] + + + draw.draw_termite( + term_topic_weights, topic_labels, term_labels, save=path2save_results+".png") + + + end = time.time() + + + + + + +def main(use_cleaned=False, algorithm="llda"): + + + + logprint("Topic Modeling: {0}".format(datetime.now())) + + corpus_de_path = FILEPATH + config.get("de_corpus", "path") + corpus_en_path = FILEPATH + config.get("en_corpus", "path") + + + if use_cleaned: + preCorpus_name = "de" + "_clean_ticket" + resultspath = FILEPATH + "results/clean" + else: + preCorpus_name = "de" + "_pre_ticket" + resultspath = FILEPATH + "results/pre" + + + + # load cleaned corpus + de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path) + logprint("Corpus loaded: {0}".format(de_corpus.lang)) + + + """ + ngrams = 1 + min_df = 1 + max_df = 1.0 + weighting = 'tf' + # weighting ='tfidf' + named_entities = False + + + printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting) + printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting) + printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting) + + printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting) + printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting) + printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting) + """ + + if algorithm == "llda": + top_topic_words = 5 + path2save_results = resultspath + "_cat_{}_{}".format(algorithm,"top"+str(top_topic_words)) + jgibbsLLDA_category(de_corpus, path2save_results=path2save_results, 
top_topic_words=top_topic_words) + + + kb_keywords = False + path2save_results = resultspath + "_kb_{}_{}_{}".format("keys" if kb_keywords else "subs",algorithm,"top"+str(top_topic_words)) + jgibbsLLDA_KB(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, kb_keywords=kb_keywords) + + kb_keywords = True + path2save_results = resultspath + "_kb_{}_{}_{}".format("keys" if kb_keywords else "subs", algorithm, + "top" + str(top_topic_words)) + jgibbsLLDA_KB(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, + kb_keywords=kb_keywords) + + """ + top_topic_words = 10 + path2save_results = resultspath + "_{}_{}".format(algorithm,"top"+str(top_topic_words)) + jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words) + + + top_topic_words = 15 + path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words)) + jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words) + + top_topic_words = 20 + path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words)) + jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words) + + """ + else: + + + textacyTopicModeling(ngrams = 1, + min_df = 1, + max_df = 0.9, + topicModel = algorithm, + n_topics =15, + corpus=de_corpus) + """ + textacyTopicModeling(ngrams=1, + min_df=1, + max_df=0.9, + topicModel=algorithm, + n_topics=20, + corpus=de_corpus) + + textacyTopicModeling(ngrams=1, + min_df=1, + max_df=0.9, + topicModel=algorithm, + n_topics=25, + corpus=de_corpus) + + + textacyTopicModeling(ngrams=1, + min_df=1, + max_df=0.9, + topicModel=algorithm, + n_topics=30, + corpus=de_corpus) + """ + + + textacyTopicModeling(ngrams=(1, 2), + min_df=1, + max_df=0.9, + topicModel=algorithm, + n_topics=15, + corpus=de_corpus) + """ + textacyTopicModeling(ngrams = (1,2), + min_df = 1, + max_df = 0.9, + topicModel = algorithm, + n_topics =20, + corpus=de_corpus) + + textacyTopicModeling(ngrams = (1,2), + min_df = 1, + max_df = 0.9, + topicModel = algorithm, + n_topics =25, + corpus=de_corpus) + + + textacyTopicModeling(ngrams = (1,2), + min_df = 1, + max_df = 0.9, + topicModel = algorithm, + n_topics =30, + corpus=de_corpus) + """ + + + + +if __name__ == "__main__": + main() + + + + + +
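+# Usage sketch, assuming the preprocessed German ticket corpus and the JGibbsLLDA build referenced above exist: +# $ python topicModeling_1711_0846.py # runs main(), which defaults to algorithm="llda"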