llda corrected with subjects and keywords

This commit is contained in:
jannis.grundmann 2017-11-21 10:14:37 +01:00
parent 4a3683635e
commit 7214911606
9 changed files with 1131 additions and 69 deletions

11
main.py
View File

@@ -21,15 +21,12 @@ start = time.time()
# todo merge llda topics
# idea: merge llda topics
# idea: train lda so that the term <-> topic assignment does not get too weak, while keeping as many topics as possible
# question: how many tickets per topic?
# todo test the models
# question: which employees worked on which topics? idea: replace topics with employee numbers
# question: if 155 distinct kb entries were used, why are there only 139 topics?
# idea: match each word against a semantic network first: if too distant, ignore it
# todo FRIDAY: drawing, refactoring
# todo test the models
@@ -59,11 +56,11 @@ logprint("")
logprint("")
#topicModeling.main(algorithm="lda")
topicModeling.main(algorithm="llda")
logprint("")
topicModeling.main(algorithm="llda")
#topicModeling.main(algorithm="llda")
logprint("")

View File

@@ -121,7 +121,7 @@ def list_from_files(*paths):
return list(map(textacy.preprocess.normalize_whitespace, liste))
def debug():
def breakpoint():
pass
def normalize(string):
@@ -148,6 +148,9 @@ def deprecated(func):
return new_func
def flatten(liste):
return [item for sublist in liste for item in sublist]
def printRandomDoc(textacyCorpus):
"""

236
test.py
View File

@@ -22,58 +22,6 @@ import draw
# kb2keywords_dict
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv",
delimiter=";")
next(kb2keywords_gen, None) # skip first
used_kb=[]
for kb in kb2keywords_gen:
used_kb.append(kb[1])
print("used_kb: {}".format(len(list(set(used_kb)))))
# of the 260 kb entries in total, 155 were used
#"ArticleID";"Subject";"Keywords";"Solution";"SolutionText";"CreatedOn"
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", #
delimiter=";")
next(kb2keywords_gen, None) # skip first
cats=[]
subjects=[]
keywords=[]
for kb in kb2keywords_gen:
cats.append(kb[0])
subjects.append(kb[1])
keywords.append(kb[2].split(","))
cats_lst = list(set(cats))
print("cats: {}".format(len(cats_lst)))
print(cats_lst[0:20])
print(len(subjects))
subj_lst = list(set(subjects)) # question: does every kb entry really have a distinct subject?
print("subjects: {}".format(len(subj_lst)))
print(subj_lst[0:20])
keywords = [item for sublist in keywords for item in sublist]
kys_lst = list(set(keywords))
print("keywords: {}".format(len(kys_lst)))
print(kys_lst[0:20])
used_list = ['bd_unicard_nicht_eingeschrieben', 'sd_vpn_temporaerer fehler ub', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_beschaedigte unicard', 'sd_boss_notenverbuchung', 'd.3 client installation', 'sd_keine rueckantwort kunde', 'sd_asknet_und_dreamspark', 'sd_beantragung_unicard', 'sd_gastaufenthalter', 'sd_internationaloffice', 'sd_outlook anmeldung gestoert', 'unicard_restbetrag_auszahlung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'ub_unicard_unicard mit vollmacht abholen', 'sd_namensaenderung mitarbeiter', 'sd_itmc kurse anmeldebestaetigung', 'sd_zugriff_onlinedienste_rueckmeldung', 'benutzer zum redmine hinzufuegen', 'sd_unicard_gesperrte unicard entsperre', 'lsf freischaltung als mitarbeiter/in', 'sd_mail_als_anhang', 'sd-e-mail_adresse_funktional_beantragen', 'sd_goeke drucker', 'sd_unimail imap_pop3', 'sd_origin_workaround', 'sd_matlab lizenzdatei pc-pools', 'sd_outlook kontakte automatische aktualisierung', 'sd_sap konteneinsicht antrag', 'ohne betreff', 'sd_telefonantrag_änderung_neuantrag', 'sd_sophos download', 'sd_geraeteausleihe', 'studierendenausweis', 'sd_citavi', 'sd_laufzeit unimail account', 'sd_login_unibib ub-it', 'sd_tu_app_keine internetverbindung', 'sd_unicard_max_laufzeit', 'ub_unicard_zusendung der karte moeglich?', 'sd_telefonbuch-eintrag_änderung', 'ub_drucker kopierer', 'windows 10', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'sd_tu-app feedback standard', 'sd_spam e-mail bekannt meldung', 'sd_spss_online_bestellung', 'sd_apple-on-campus', 'sd_studisek', 'sd_office 365 plus support', 'sd_sap_initialkennwort_englisch', 'sd_office365_asknet', 're: elektroarbeiten fuer leitsystem 2. und 3. obergeschoss', 'sd_login tu portale', 'ungueltiges ticket siehe journal', 'sd_sap_freischaltung ohne passwortaenderung', 'bd_unicard_geldkarte_laden', 'sd_verlust/antrag unicard', 'sd_unimail zu exchange', 'citavi_lizenzschluessel_nicht bekommen', 'sd_plotauftrag_zv', 'sd_citavi_support', 'sd_antworten_korrekt', 'sd_wlan-gastkonto', 'sd_antwort_phishingmail', 'bd_unicard_freigabe_beantragung', 'sd_origin nur noch eine seriennummer', 'cm_asiexception', 'sd_login_tu_portale', 'sd_webmailer_thread-anzeige', 'apps_dms-passwort d.3', 'apps_redmine_repository', 'sd_uniaccount_passwortaenderung', 'sd_phishing', 'sd_sap_firefox_esr', 'vpn verbindung fuer unitymedia kunden', 'sd_kurs-angebote anmeldung', 'sd_unicard fehlerhafte geldbuchung', 'sd_uniaccount_ehemalige_passwortaenderung', 'sd_sap_dienstreise', 'cm_lsf-boss_freischaltung', 'wlan', 'uni card', 'sd_webmailer einrichtung weiterleitung', 'spam ohne tu bezug', 'sd_outlook_in_exchange_einbinden', 'sd_wlan_beratung', 'sd_uniaccount_dauer freischaltung', 'sd_sap_konteneinsicht_ workaround', 'sd_vpn anleitungen', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_unicard_abholung', 'sd_vpn_probleme_mit_unitymedia', 'sd_diensthandy beschaffung', 'sd_unicard_defekt', 'sd_freischaltung uniaccount verzoegert', 'sd_kurs-angebote itmc', 'bd_goeke_allgemein', 'sd_uniaccount_ehemalige_studierende', 'sd_stellenausschreibung schwarzes brett', 'freischaltung uniaccount', 'sd_unicard_workaround_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'ub_geldchip-problem bei uc', 'sd_semesterticket', 'problem mit der beantragung von der unicard', 'sd_citavi bestellung', 'sd_immatrikulationsbescheigung_druckfehler', 'sd_vpn_aktualisierung', 'vpn_ipsec_stoerung', 'sd_dreamspark', 'ub_namensaenderung', 'sd_immatrikulationsbescheinigung_portal', 'ub_prod_neue unicard bei beschaedigung', 
'sd_vpn_webvpn', 'sd_telefonbuch_prof_eintragung', 'sd_kontakt_asknet', 'probleme mit unicard', 'sd_office 356 plus bestellung', 'sd_gmx_web.de', 'fehlender eintrag im elektronischen telefonbuch', 'ub_prod_namenskorrektur_student', 'einrichtung des eduroam netzwerks', 'sd_sap_initialkennwort', 'sd_boss-bescheinigung', 'sd_wlan passwort setzen', 'sd_aktivierung uniaccount', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_heirat_namensaenderung_student', 'ub_unicard_spaetere abholung moeglich?', 'unicard nochmal beantragen', 'sd_studisek_buchung_semesterbeitrag', 'sd_pruefungsamt', 'unicard vergessen abzuholen und nicht mehr da', 'sd_antrag funktionale mailadresse', 'sd_email_namensaenderung', 'sd_telefonbuch, neues system', 'sd_account_abmelden', 'ub_unicard_abholungszeiten']
labellist = ['sd_antworten_korrekt', 'sd_kurs-angebote anmeldung', 'sd_semesterticket', 'apps_dms-passwort d.3', 'freischaltung uniaccount', 'sd_heirat_namensaenderung_student', 'bd_unicard_freigabe_beantragung', 'sd_uniaccount_ehemalige_studierende', 'sd_sap_dienstreise', 'sd_origin_workaround', 'sd_uniaccount_ehemalige_passwortaenderung', 'fehlender eintrag im elektronischen telefonbuch', 'wlan', 'sd_tu-app feedback standard', 'sd_wlan_beratung', 'sd_uniaccount_passwortaenderung', 're: elektroarbeiten fuer leitsystem 2. und 3. obergeschoss', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_unicard_spaetere abholung moeglich?', 'sd_citavi_support', 'sd_outlook kontakte automatische aktualisierung', 'sd_origin nur noch eine seriennummer', 'lsf freischaltung als mitarbeiter/in', 'cm_asiexception', 'sd_freischaltung uniaccount verzoegert', 'ub_unicard_zusendung der karte moeglich?', 'sd_login_unibib ub-it', 'uni card', 'sd_outlook anmeldung gestoert', 'd.3 client installation', 'ub_unicard_abholungszeiten', 'sd_antwort_phishingmail', 'sd_matlab lizenzdatei pc-pools', 'sd_sap_initialkennwort', 'sd_sap_freischaltung ohne passwortaenderung', 'sd_spss_online_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'sd_keine rueckantwort kunde', 'sd_unimail imap_pop3', 'sd_beantragung_unicard', 'sd_unicard_gesperrte unicard entsperre', 'sd_internationaloffice', 'unicard nochmal beantragen', 'sd_stellenausschreibung schwarzes brett', 'sd_sophos download', 'cm_lsf-boss_freischaltung', 'sd_verlust/antrag unicard', 'vpn_ipsec_stoerung', 'sd_account_abmelden', 'sd_outlook_in_exchange_einbinden', 'ub_namensaenderung', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'unicard vergessen abzuholen und nicht mehr da', 'apps_redmine_repository', 'einrichtung des eduroam netzwerks', 'sd_unicard_max_laufzeit', 'sd_gmx_web.de', 'sd_unicard fehlerhafte geldbuchung', 'sd_geraeteausleihe', 'spam ohne tu bezug', 'sd_uniaccount_dauer freischaltung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'sd_office 365 plus support', 'sd_unicard_defekt', 'sd_phishing', 'sd_goeke drucker', 'ub_unicard_unicard mit vollmacht abholen', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_pruefungsamt', 'sd_aktivierung uniaccount', 'sd_boss-bescheinigung', 'sd_sap_initialkennwort_englisch', 'bd_unicard_geldkarte_laden', 'sd_telefonbuch-eintrag_änderung', 'vpn verbindung fuer unitymedia kunden', 'sd_studisek', 'sd_antrag funktionale mailadresse', 'sd_asknet_und_dreamspark', 'sd_unicard_workaround_bestellung', 'sd_sap_firefox_esr', 'sd_vpn anleitungen', 'sd_office365_asknet', 'citavi_lizenzschluessel_nicht bekommen', 'sd_sap konteneinsicht antrag', 'sd_spam e-mail bekannt meldung', 'ub_prod_namenskorrektur_student', 'ub_beschaedigte unicard', 'sd_namensaenderung mitarbeiter', 'sd_mail_als_anhang', 'benutzer zum redmine hinzufuegen', 'sd_login_tu_portale', 'sd_email_namensaenderung', 'windows 10', 'ungueltiges ticket siehe journal', 'sd_vpn_temporaerer fehler ub', 'ub_prod_neue unicard bei beschaedigung', 'sd_dreamspark', 'sd_webmailer einrichtung weiterleitung', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_studisek_buchung_semesterbeitrag', 'sd_immatrikulationsbescheinigung_portal', 'sd_vpn_probleme_mit_unitymedia', 'sd-e-mail_adresse_funktional_beantragen', 'sd_diensthandy beschaffung', 'sd_vpn_webvpn', 'sd_laufzeit unimail account', 'sd_citavi', 'problem mit der beantragung von der unicard', 'sd_kurs-angebote itmc', 'sd_telefonbuch, neues system', 'sd_login tu portale', 'sd_wlan passwort setzen', 
'sd_zugriff_onlinedienste_rueckmeldung', 'unicard_restbetrag_auszahlung', 'sd_immatrikulationsbescheigung_druckfehler', 'bd_unicard_nicht_eingeschrieben', 'sd_unimail zu exchange', 'sd_wlan-gastkonto', 'probleme mit unicard', 'sd_telefonbuch_prof_eintragung', 'sd_vpn_aktualisierung', 'sd_apple-on-campus', 'bd_goeke_allgemein', 'studierendenausweis', 'ub_drucker kopierer', 'sd_unicard_abholung', 'sd_office 356 plus bestellung', 'ohne betreff', 'sd_tu_app_keine internetverbindung', 'sd_boss_notenverbuchung', 'ub_geldchip-problem bei uc', 'sd_itmc kurse anmeldebestaetigung', 'sd_citavi bestellung', 'sd_telefonantrag_änderung_neuantrag', 'sd_sap_konteneinsicht_ workaround', 'sd_kontakt_asknet', 'sd_plotauftrag_zv', 'sd_webmailer_thread-anzeige', 'sd_gastaufenthalter']
for l in used_list:
if l not in labellist:
print(l)
print(len(used_list))
print(len(labellist))
# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
preCorpus_name = "de" + "_pre_ticket"
@@ -87,17 +35,133 @@ corpus_train = corpus[0:split_index]
corpus_test = corpus[split_index:len(corpus)-1]
# question: how many tickets per topic?
kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
ticket2kb_dict = {}
for line in kb2ticket_gen:
ticket_id = line[0]
kb_id = line[1]
ticket2kb_dict[ticket_id] = kb_id
# {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...}
# kb2keywords_dict
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
next(kb2keywords_gen, None) # skip first line("ArticleID";"Subject";"Keywords";...)
kb2keywords_dict = {}
kb_keywords=False
for line in kb2keywords_gen:
kb_id = line[0]
subject = line[1]
keywords = line[2]
keywords_list = [normalize(x) for x in str(keywords).split(",")]
if kb_id not in kb2keywords_dict.keys():
kb2keywords_dict[kb_id] = []
if kb_keywords:
for item in keywords_list:
if item != "":
kb2keywords_dict[kb_id].append(item)
else:
kb2keywords_dict[kb_id].append(subject)
# remove all empty items
kb2keywords_dict = {k: v for k, v in kb2keywords_dict.items() if len(v) != 0}
# {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...}
# lda: darken the plot image
# excerpts
cat_dict = {}
count_dict={}
keywords_dict={}
for doc in corpus:
category_name_ = doc.metadata["categoryName"]
if category_name_ not in cat_dict.keys():
cat_dict[category_name_] = 1
else:
cat_dict[category_name_] += 1
# check whether this ticket maps to a kb entry that has keywords
try:
x = doc.metadata["TicketNumber"]
x = ticket2kb_dict[x]
x = kb2keywords_dict[x]
except KeyError:
pass
for k,v in kb2keywords_dict.items(): #str,list
for elem in v:
if elem not in count_dict.keys():
count_dict[elem] = 1
else:
count_dict[elem] += 1
import operator
"""
sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
for k,v in sorted_dict:
print(k,v)
print(len(sorted_dict))
"""
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
next(kb2keywords_gen, None) # skip first
cnt=0
for kb in kb2keywords_gen:
cnt +=1
print(str(cnt))
count_dict = {}
# the generator is exhausted after counting above, so re-read the csv for the second pass
# "ArticleID";"Subject";"Keywords";"Solution";"SolutionText";"CreatedOn"
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
next(kb2keywords_gen, None) # skip first
for kb_entry in kb2keywords_gen:
entry_ = kb_entry[1]
if entry_ not in count_dict.keys():
count_dict[entry_] = 1
else:
count_dict[entry_] += 1
sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
#for k,v in sorted_dict:
# print(k,v)
#print(len(sorted_dict))
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
@@ -110,6 +174,64 @@ import matplotlib.pyplot as plt
end = time.time()
print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))
"""
# kb2keywords_dict
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv",
delimiter=";")
used_kb=[]
for kb in kb2keywords_gen:
used_kb.append(kb[1])
print("used_kb: {}".format(len(list(set(used_kb)))))
#"ArticleID";"Subject";"Keywords";"Solution";"SolutionText";"CreatedOn"
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", #
delimiter=";")
next(kb2keywords_gen, None) # skip first
cat_lst=[]
sub_lst=[]
key_lst=[]
for kb in kb2keywords_gen:
cat_lst.append(kb[0])
sub_lst.append(kb[1])
key_lst.append(kb[2].split(","))
cats_setlist = list(set(cat_lst))
print("cats: {}".format(len(cats_setlist)))
print(cats_setlist[0:20])
print()
print("sub_lst: {}".format(len(sub_lst)))
sub_setlist = list(set(sub_lst)) # question: does every kb entry really have a distinct subject?
print("sub_setlist: {}".format(len(sub_setlist)))
#print(sub_setlist[0:20])
print()
key_lst = [item for sublist in key_lst for item in sublist] #flatten list
key_setlist = list(set(key_lst))
print("key_setlist: {}".format(len(key_setlist)))
#print(key_setlist[0:20])
print("\n\n\n\n")
"""
"""
used_list = ['bd_unicard_nicht_eingeschrieben', 'sd_vpn_temporaerer fehler ub', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_beschaedigte unicard', 'sd_boss_notenverbuchung', 'd.3 client installation', 'sd_keine rueckantwort kunde', 'sd_asknet_und_dreamspark', 'sd_beantragung_unicard', 'sd_gastaufenthalter', 'sd_internationaloffice', 'sd_outlook anmeldung gestoert', 'unicard_restbetrag_auszahlung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'ub_unicard_unicard mit vollmacht abholen', 'sd_namensaenderung mitarbeiter', 'sd_itmc kurse anmeldebestaetigung', 'sd_zugriff_onlinedienste_rueckmeldung', 'benutzer zum redmine hinzufuegen', 'sd_unicard_gesperrte unicard entsperre', 'lsf freischaltung als mitarbeiter/in', 'sd_mail_als_anhang', 'sd-e-mail_adresse_funktional_beantragen', 'sd_goeke drucker', 'sd_unimail imap_pop3', 'sd_origin_workaround', 'sd_matlab lizenzdatei pc-pools', 'sd_outlook kontakte automatische aktualisierung', 'sd_sap konteneinsicht antrag', 'ohne betreff', 'sd_telefonantrag_änderung_neuantrag', 'sd_sophos download', 'sd_geraeteausleihe', 'studierendenausweis', 'sd_citavi', 'sd_laufzeit unimail account', 'sd_login_unibib ub-it', 'sd_tu_app_keine internetverbindung', 'sd_unicard_max_laufzeit', 'ub_unicard_zusendung der karte moeglich?', 'sd_telefonbuch-eintrag_änderung', 'ub_drucker kopierer', 'windows 10', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'sd_tu-app feedback standard', 'sd_spam e-mail bekannt meldung', 'sd_spss_online_bestellung', 'sd_apple-on-campus', 'sd_studisek', 'sd_office 365 plus support', 'sd_sap_initialkennwort_englisch', 'sd_office365_asknet', 're: elektroarbeiten fuer leitsystem 2. und 3. obergeschoss', 'sd_login tu portale', 'ungueltiges ticket siehe journal', 'sd_sap_freischaltung ohne passwortaenderung', 'bd_unicard_geldkarte_laden', 'sd_verlust/antrag unicard', 'sd_unimail zu exchange', 'citavi_lizenzschluessel_nicht bekommen', 'sd_plotauftrag_zv', 'sd_citavi_support', 'sd_antworten_korrekt', 'sd_wlan-gastkonto', 'sd_antwort_phishingmail', 'bd_unicard_freigabe_beantragung', 'sd_origin nur noch eine seriennummer', 'cm_asiexception', 'sd_login_tu_portale', 'sd_webmailer_thread-anzeige', 'apps_dms-passwort d.3', 'apps_redmine_repository', 'sd_uniaccount_passwortaenderung', 'sd_phishing', 'sd_sap_firefox_esr', 'vpn verbindung fuer unitymedia kunden', 'sd_kurs-angebote anmeldung', 'sd_unicard fehlerhafte geldbuchung', 'sd_uniaccount_ehemalige_passwortaenderung', 'sd_sap_dienstreise', 'cm_lsf-boss_freischaltung', 'wlan', 'uni card', 'sd_webmailer einrichtung weiterleitung', 'spam ohne tu bezug', 'sd_outlook_in_exchange_einbinden', 'sd_wlan_beratung', 'sd_uniaccount_dauer freischaltung', 'sd_sap_konteneinsicht_ workaround', 'sd_vpn anleitungen', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_unicard_abholung', 'sd_vpn_probleme_mit_unitymedia', 'sd_diensthandy beschaffung', 'sd_unicard_defekt', 'sd_freischaltung uniaccount verzoegert', 'sd_kurs-angebote itmc', 'bd_goeke_allgemein', 'sd_uniaccount_ehemalige_studierende', 'sd_stellenausschreibung schwarzes brett', 'freischaltung uniaccount', 'sd_unicard_workaround_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'ub_geldchip-problem bei uc', 'sd_semesterticket', 'problem mit der beantragung von der unicard', 'sd_citavi bestellung', 'sd_immatrikulationsbescheigung_druckfehler', 'sd_vpn_aktualisierung', 'vpn_ipsec_stoerung', 'sd_dreamspark', 'ub_namensaenderung', 'sd_immatrikulationsbescheinigung_portal', 'ub_prod_neue unicard bei beschaedigung', 
'sd_vpn_webvpn', 'sd_telefonbuch_prof_eintragung', 'sd_kontakt_asknet', 'probleme mit unicard', 'sd_office 356 plus bestellung', 'sd_gmx_web.de', 'fehlender eintrag im elektronischen telefonbuch', 'ub_prod_namenskorrektur_student', 'einrichtung des eduroam netzwerks', 'sd_sap_initialkennwort', 'sd_boss-bescheinigung', 'sd_wlan passwort setzen', 'sd_aktivierung uniaccount', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_heirat_namensaenderung_student', 'ub_unicard_spaetere abholung moeglich?', 'unicard nochmal beantragen', 'sd_studisek_buchung_semesterbeitrag', 'sd_pruefungsamt', 'unicard vergessen abzuholen und nicht mehr da', 'sd_antrag funktionale mailadresse', 'sd_email_namensaenderung', 'sd_telefonbuch, neues system', 'sd_account_abmelden', 'ub_unicard_abholungszeiten']
labellist = ['sd_antworten_korrekt', 'sd_kurs-angebote anmeldung', 'sd_semesterticket', 'apps_dms-passwort d.3', 'freischaltung uniaccount', 'sd_heirat_namensaenderung_student', 'bd_unicard_freigabe_beantragung', 'sd_uniaccount_ehemalige_studierende', 'sd_sap_dienstreise', 'sd_origin_workaround', 'sd_uniaccount_ehemalige_passwortaenderung', 'fehlender eintrag im elektronischen telefonbuch', 'wlan', 'sd_tu-app feedback standard', 'sd_wlan_beratung', 'sd_uniaccount_passwortaenderung', 're: elektroarbeiten fuer leitsystem 2. und 3. obergeschoss', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_unicard_spaetere abholung moeglich?', 'sd_citavi_support', 'sd_outlook kontakte automatische aktualisierung', 'sd_origin nur noch eine seriennummer', 'lsf freischaltung als mitarbeiter/in', 'cm_asiexception', 'sd_freischaltung uniaccount verzoegert', 'ub_unicard_zusendung der karte moeglich?', 'sd_login_unibib ub-it', 'uni card', 'sd_outlook anmeldung gestoert', 'd.3 client installation', 'ub_unicard_abholungszeiten', 'sd_antwort_phishingmail', 'sd_matlab lizenzdatei pc-pools', 'sd_sap_initialkennwort', 'sd_sap_freischaltung ohne passwortaenderung', 'sd_spss_online_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'sd_keine rueckantwort kunde', 'sd_unimail imap_pop3', 'sd_beantragung_unicard', 'sd_unicard_gesperrte unicard entsperre', 'sd_internationaloffice', 'unicard nochmal beantragen', 'sd_stellenausschreibung schwarzes brett', 'sd_sophos download', 'cm_lsf-boss_freischaltung', 'sd_verlust/antrag unicard', 'vpn_ipsec_stoerung', 'sd_account_abmelden', 'sd_outlook_in_exchange_einbinden', 'ub_namensaenderung', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'unicard vergessen abzuholen und nicht mehr da', 'apps_redmine_repository', 'einrichtung des eduroam netzwerks', 'sd_unicard_max_laufzeit', 'sd_gmx_web.de', 'sd_unicard fehlerhafte geldbuchung', 'sd_geraeteausleihe', 'spam ohne tu bezug', 'sd_uniaccount_dauer freischaltung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'sd_office 365 plus support', 'sd_unicard_defekt', 'sd_phishing', 'sd_goeke drucker', 'ub_unicard_unicard mit vollmacht abholen', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_pruefungsamt', 'sd_aktivierung uniaccount', 'sd_boss-bescheinigung', 'sd_sap_initialkennwort_englisch', 'bd_unicard_geldkarte_laden', 'sd_telefonbuch-eintrag_änderung', 'vpn verbindung fuer unitymedia kunden', 'sd_studisek', 'sd_antrag funktionale mailadresse', 'sd_asknet_und_dreamspark', 'sd_unicard_workaround_bestellung', 'sd_sap_firefox_esr', 'sd_vpn anleitungen', 'sd_office365_asknet', 'citavi_lizenzschluessel_nicht bekommen', 'sd_sap konteneinsicht antrag', 'sd_spam e-mail bekannt meldung', 'ub_prod_namenskorrektur_student', 'ub_beschaedigte unicard', 'sd_namensaenderung mitarbeiter', 'sd_mail_als_anhang', 'benutzer zum redmine hinzufuegen', 'sd_login_tu_portale', 'sd_email_namensaenderung', 'windows 10', 'ungueltiges ticket siehe journal', 'sd_vpn_temporaerer fehler ub', 'ub_prod_neue unicard bei beschaedigung', 'sd_dreamspark', 'sd_webmailer einrichtung weiterleitung', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_studisek_buchung_semesterbeitrag', 'sd_immatrikulationsbescheinigung_portal', 'sd_vpn_probleme_mit_unitymedia', 'sd-e-mail_adresse_funktional_beantragen', 'sd_diensthandy beschaffung', 'sd_vpn_webvpn', 'sd_laufzeit unimail account', 'sd_citavi', 'problem mit der beantragung von der unicard', 'sd_kurs-angebote itmc', 'sd_telefonbuch, neues system', 'sd_login tu portale', 'sd_wlan passwort setzen', 
'sd_zugriff_onlinedienste_rueckmeldung', 'unicard_restbetrag_auszahlung', 'sd_immatrikulationsbescheigung_druckfehler', 'bd_unicard_nicht_eingeschrieben', 'sd_unimail zu exchange', 'sd_wlan-gastkonto', 'probleme mit unicard', 'sd_telefonbuch_prof_eintragung', 'sd_vpn_aktualisierung', 'sd_apple-on-campus', 'bd_goeke_allgemein', 'studierendenausweis', 'ub_drucker kopierer', 'sd_unicard_abholung', 'sd_office 356 plus bestellung', 'ohne betreff', 'sd_tu_app_keine internetverbindung', 'sd_boss_notenverbuchung', 'ub_geldchip-problem bei uc', 'sd_itmc kurse anmeldebestaetigung', 'sd_citavi bestellung', 'sd_telefonantrag_änderung_neuantrag', 'sd_sap_konteneinsicht_ workaround', 'sd_kontakt_asknet', 'sd_plotauftrag_zv', 'sd_webmailer_thread-anzeige', 'sd_gastaufenthalter']
for l in used_list:
if l not in labellist:
print(l)
print(len(used_list))
print(len(labellist))
"""
"""
maybe split categories into subcategories

View File

@@ -240,16 +240,20 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
logprint("")
logprint("start Category-LLDA:")
# build dictionary of ticket categories
labelist = []
for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
labelist.append(texdoc.metadata["categoryName"])
for doc in corpus:
labelist.append(normalize(doc.metadata["categoryName"]))
labelist = list(set(labelist))
print("len(labelist): {}".format(len(labelist)))
labeldict = {k: v for v, k in enumerate(labelist)}
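# maps each label to a numeric id for the llda input, e.g. (hypothetical values) {'wlan': 0, 'unicard': 1, ...}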
def gen_cat_lines(textacyCorpus, labeldict):
""" generates [topic1, topic2....] tok1 tok2 tok3 out of corpi"""
@@ -404,6 +408,213 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa
logprint("\n\n\nTime Elapsed {1}-LLDA :{0} min\n\n".format((end - start) / 60,"Keyword" if kb_keywords else "Subject"))
def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
start = time.time()
logprint("")
logprint("start LLDA:")
# kb2keywords_dict / kb2subj_dict {str : [str]}
kb2keywords_dict = {}
kb2subjects_dict = {}
kb_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
next(kb_gen, None) # skip first line "ArticleID";"Subject";"Keywords";...
for line in kb_gen:
kb_id = line[0]
subject = normalize(line[1])
keywords = [normalize(x) for x in str(line[2]).split(",")]
if kb_id not in kb2keywords_dict.keys():
kb2keywords_dict[kb_id] = keywords if keywords != [''] else ["DEFAULT"]
else:
kb2keywords_dict[kb_id] = kb2keywords_dict[kb_id] + keywords
if kb_id not in kb2subjects_dict.keys():
# subject is already normalized above; an empty subject falls back to DEFAULT
kb2subjects_dict[kb_id] = [subject if subject != "" else "DEFAULT"]
else:
kb2subjects_dict[kb_id].append(subject)
# ticket2kbs_dict
ticket2kbs_dict = {}
kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
next(kb2ticket_gen, None) # skip first line "TicketNumber";"ArticleID"
for line in kb2ticket_gen:
ticket_id = line[0]
kb_id = line[1]
if ticket_id not in ticket2kbs_dict.keys():
ticket2kbs_dict[ticket_id] = [kb_id]
else:
ticket2kbs_dict[ticket_id].append(kb_id)
# ticket2keywords
ticket2keywords_dict = {} # {str:[str]}
for ticket_id, kb_ids in ticket2kbs_dict.items():
if ticket_id not in ticket2keywords_dict.keys():
ticket2keywords_dict[ticket_id] = []
for kb_id in kb_ids:
ticket2keywords_dict[ticket_id].append(kb2keywords_dict[kb_id])
ticket2keywords_dict[ticket_id] = flatten(ticket2keywords_dict[ticket_id])
# ticket2subjects
ticket2subjects_dict = {} # {str:[str]}
for ticket_id, kb_ids in ticket2kbs_dict.items():
if ticket_id not in ticket2subjects_dict.keys():
ticket2subjects_dict[ticket_id] = []
for kb_id in kb_ids:
ticket2subjects_dict[ticket_id].append(kb2subjects_dict[kb_id])
ticket2subjects_dict[ticket_id] = flatten(ticket2subjects_dict[ticket_id])
# kb2keywords_dict {'KBA10230': ['DEFAULT'], 'KBA10129': ['DEFAULT'], 'KBA10287': ['sd_ansys_informationen'], ...} len = 260
# kb2subjects_dict {'KBA10230': ['unicard nochmal beantragen'], 'KBA10129': ['sd_entsperrung unicard nach verlust/wiederfinden'], ...} len = 260
# ticket2kbs_dict {'INC44526': ['KBA10056'], 'INC67205': ['KBA10056'], ...} len = 4832
# ticket2keywords_dict {'INC44526': ['DEFAULT'], 'INC67205': ['DEFAULT'], 'INC71863': ['DEFAULT'], 'INC44392': ['asknet'], ...} len = 4832
# ticket2subjects_dict {'INC44526': ['sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)'], ...} len = 4832
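# illustration of the lookup chain (values taken from the example comments above):
#   kb_ids   = ticket2kbs_dict["INC44526"]                      # ['KBA10056']
#   keywords = flatten([kb2keywords_dict[kb] for kb in kb_ids]) # ['DEFAULT']
#   subjects = flatten([kb2subjects_dict[kb] for kb in kb_ids]) # ['sd_telefon (antrag: ...)']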
# question: how many tickets per topic?
count_dict = {}
for v in ticket2kbs_dict.values():
for kb in v:
if kb in count_dict.keys():
count_dict[kb] +=1
else:
count_dict[kb] = 1
import operator
sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
print("kb_entrys used: {}".format(len(sorted_dict)))
for k,v in sorted_dict:
print(k, kb2subjects_dict[k], v) # todo: the same with keywords
# todo: continue here
# todo question: how many kb entries were used in total?
labelist = ticket2keywords_dict.values()
labelist = flatten(labelist)
labelist = list(set(labelist))
labeldict = {k: v for v, k in enumerate(labelist)}
def gen_key_lines(textacyCorpus, labeldict, ticket2keywords_dict):
for doc in textacyCorpus:
ticket_number = doc.metadata["TicketNumber"]
keywords = ticket2keywords_dict.get(ticket_number, ['DEFAULT'])
if keywords != ['DEFAULT']:
label = ""
for kw in keywords:
label = label + str(labeldict.get(normalize(str(kw)), labeldict['DEFAULT'])) + " "
yield "[ " + label + "] " + doc.text
keys_line_gen = gen_key_lines(corpus, labeldict, ticket2keywords_dict)
path2save_keys_results = path2save_results + "_kb_keys_llda_{}".format("top" + str(top_topic_words))
jgibbsLLDA(labeldict, keys_line_gen, path2save_keys_results, top_topic_words=top_topic_words)
"""
def gen_subj_lines(textacyCorpus, labeldict, ticket2subjects_dict):
for doc in corpus:
ticket_number = doc.metadata["TicketNumber"]
keywords = ticket2subjects_dict.get(ticket_number, ['DEFAULT'])
if keywords != ['DEFAULT']:
label = ""
for kw in keywords:
label = label + str(labeldict.get(normalize(str(kw)), len(labeldict))) + " "
yield "[ " + label + "] " + doc.text
"""
labelist = ticket2subjects_dict.values()
labelist = flatten(labelist)
labelist = list(set(labelist))
labeldict = {k: v for v, k in enumerate(labelist)}
labeldict.update({'DEFAULT' : len(labeldict)})
subj_line_gen = gen_key_lines(corpus, labeldict, ticket2subjects_dict)
path2save_subj_results = path2save_results + "_kb_subj_llda_{}".format("top" + str(top_topic_words))
jgibbsLLDA(labeldict, subj_line_gen, path2save_subj_results, top_topic_words=top_topic_words)
end = time.time()
logprint("\n\n\nTime Elapsed LLDA :{0} min\n\n".format((end - start) / 60))
def main( algorithm="llda"):
@@ -427,14 +638,16 @@ def main( algorithm="llda"):
if algorithm == "llda":
top_topic_words = 5
jgibbsLLDA_category(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)
jgibbsLLDA_KB_v2(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)
kb_keywords = False
jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
#jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
kb_keywords = True
jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
#jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)

727
topicModeling_1711_0846.py Normal file
View File

@@ -0,0 +1,727 @@
# -*- coding: utf-8 -*-
from datetime import datetime
import draw
import draw1
import time
import numpy as np
import csv
import sys
import json
import os.path
import subprocess
from textacy import Vectorizer, viz
from miscellaneous import *
import textacy
from scipy import *
import os
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# load config
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
config.read_file(f)
def label2ID(label, labeldict):
return labeldict.get(label, len(labeldict))
def generate_lablelID_lines(textacyCorpus, labeldict):
for doc in textacyCorpus:
# generate [topic1, topic2....] tok1 tok2 tok3 lines out of the corpus
yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text
"""
def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
logprint(str("ngrams: {0}".format(ngrams)))
logprint(str("min_df: {0}".format(min_df)))
logprint(str("max_df: {0}".format(max_df)))
logprint(str("named_entities: {0}".format(named_entities)))
# printlog("vectorize corpi...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.__getattribute__("id_to_term")
for t in terms_list:
print(t)
logprint("doc_term_matrix: {0}".format(doc_term_matrix))
logprint("id2term: {0}".format(id2term))
"""
def textacyTopicModeling(corpus,
n_topics = 15, top_topic_words = 7, top_document_labels_per_topic = 5,
ngrams = 1, min_df=1, max_df=1.0,
topicModel='lda'):
n_terms = int(n_topics * top_topic_words)
sort_terms_by = 'seriation' # 'seriation', 'weight', 'index', 'alphabetical'
rank_terms_by = 'corpus' # 'corpus', 'topic'
logprint(
"############### Topic Modeling {0} ###########################".format(
topicModel))
logprint(str("ngrams: {0}".format(ngrams)))
logprint(str("min_df: {0}".format(min_df)))
logprint(str("max_df: {0}".format(max_df)))
logprint(str("n_topics: {0}".format(n_topics)))
logprint("\n")
start = time.time()
# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
weighting = ('tf' if topicModel == 'lda' else 'tfidf')
#################### vectorize corpus ####################
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.id_to_term # plain attribute access instead of __getattribute__
# printlog("terms_list: {0}".format(list(terms_list)))
# printlog("doc_term_matrix: {0}".format(doc_term_matrix))
##################### Initialize and train a topic model ##############################################
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
logprint(topic_idx)
for j in top_docs:
logprint(corpus[j].metadata['categoryName'])
####################### termite plot ###################################################################
grams_label = "uni" if ngrams == 1 else "bi"
"""
model.termite_plot(doc_term_matrix, id2term,
n_terms=n_terms,
sort_terms_by=sort_terms_by,
rank_terms_by=rank_terms_by+'_weight',
save= FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label,topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by))
"""
draw1.termite_plot(model,doc_term_matrix, id2term,
n_terms=n_terms,
sort_terms_by=sort_terms_by,
rank_terms_by=rank_terms_by + '_weight',
save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label, topicModel, n_topics,
n_terms, sort_terms_by, rank_terms_by))
end = time.time()
logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
start = time.time()
jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
# build dictionary of ticket categories
labelist = []
for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
labelist.append(texdoc.metadata["categoryName"])
labeldict = {k: v for v, k in enumerate(labelist)}
reverse_labeldict = {v: k for k, v in labeldict.items()}
#and save
labeldict_path = FILEPATH + "results/labeldict.txt"
with open(labeldict_path, 'w') as file:
file.write(json.dumps(labeldict))
n_topics = len(labeldict) #+1 #default-topic
# create file with label_IDs (input for llda)
textacy.fileio.write_file_lines(generate_lablelID_lines(corpus, labeldict), filepath=LLDA_filepath)
# wait for file to exist
while not os.path.exists(LLDA_filepath):
time.sleep(1)
logprint("")
logprint("start LLDA:")
# run JGibbsLLDA file
FNULL = open(os.devnull, 'w') # suppress output
cmd_jgibbs_java = ["java", "-cp",
"{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
jgibbsLLDA_root),
"jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
"tickets.gz",
"-twords", str(top_topic_words), "-ntopics", str(n_topics)]
subprocess.call(cmd_jgibbs_java, stdout=FNULL)
# NOTE: the files are hidden; they can be found in models/
cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
output = subprocess.check_output(cmd_gzip).decode("utf-8")
topic_regex = re.compile(r'Topic [0-9]*')
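# the decompressed .twords output consists of blocks like (illustrative):
#   Topic 0:
#       service 0.2416
#       telefon 0.0023
# the regex picks out the "Topic <n>" header lines so each topic id can be
# re-labeled with the category name behind it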
#####################################
# todo: save the results to a file based on the results list
result = []
for line in output.splitlines():
findall = topic_regex.findall(line)
if len(findall) != 0:
try:
index = int(findall[0].split()[1])
result.append("Topic {} {}:".format(index, reverse_labeldict[index]))
except (IndexError, ValueError):
result.append(line)
else:
result.append(line)
textacy.fileio.write_file_lines(result, path2save_results+".txt")
#####################################
results = []
res_dict = {}
count =0
for line in output.splitlines():
findall = topic_regex.findall(line)
if len(findall) != 0:
if len(res_dict) != 0:
results.append(res_dict) # append the previous topic's dict to the list (it is complete at this point)
index = int(findall[0].split()[1])
res_dict = {index : str(reverse_labeldict[index]) }
else:
splitted = line.split()
res_dict[splitted[0]] = float(splitted[1])
"""
### print terms that are topics
for s in list(res_dict.values()):
if isinstance(s,str) and splitted[0] in s:
vals = list(res_dict.values())
keys = list(res_dict.keys())
for v in vals:
if not isinstance(v,float):
print("{}".format(v))
print("{}".format(splitted[0]))
count +=1
print()
###
"""
if len(res_dict) != 0:
results.append(res_dict) # append the last dict to the list
#print(count)
#print(float(count)/float(len(labelist)))
# {0: 'betrieb', 'service': 0.24162679425837305, 'support': 0.24162679425837305, 'browser': 0.24162679425837305, 'unicard': 0.24162679425837305, 'telefon': 0.0023923444976076593}
# collect every term in the results into a list
terms=[]
for res in results:
for key,value in res.items():
if not isinstance(key, int) and not key in terms:
terms.append(key)
term2id = {t:i for i,t in enumerate(terms)} #and to dict
################# termite plot #####################################################################
# term_topic_weights.shape = (len(term_ids), len(topic_ids))
#topic_labels = tuple(labelist)
topic_labels = list(range(len(labelist)))
term_labels = list(range(len(term2id))) #tuple([key for key in term2id.keys()])
term_topic_weights = np.zeros((len(term2id),len(topic_labels)))
for i,res in enumerate(results):
for key,value in res.items():
if not isinstance(key, int):
term_topic_weights[term2id[key]][i] = value
term_labels[term2id[key]] = key
else:
topic_labels[i] = reverse_labeldict[key]
#viz.draw_termite_plot(term_topic_weights, topic_labels, term_labels, save=path2save_results+".png")
draw.draw_termite(
term_topic_weights, topic_labels, term_labels, save=path2save_results+".png")
end = time.time()
logprint("Time Elapsed Topic Modeling JGibbsLLDA:{0} min\n".format((end - start) / 60))
def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=False):
jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
# ticket2kb_dict
kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
ticket2kb_dict = {} #{'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...}
for line in kb2ticket_gen:
ticket_id = line[0]
kb_id = line[1]
ticket2kb_dict[ticket_id] = kb_id
#############
# kb2keywords_dict
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";") #"ArticleID";"Subject";"Keywords";.....
next(kb2keywords_gen,None) #skip first
kb2keywords_dict = {}
for lino in kb2keywords_gen:
kb_id = lino[0]
kb2keywords_dict[kb_id] = []
subject = lino[1]
keywords = lino[2]
keywords_list = [x.lower().strip() for x in map(replaceRockDots(),str(keywords).split(","))]
if kb_keywords:
for item in keywords_list:
if item != "":
kb2keywords_dict[kb_id].append(item)
else:
kb2keywords_dict[kb_id].append(subject)
#remove all empty items
kb2keywords_dict = { k : v for k,v in kb2keywords_dict.items() if len(v) != 0}
###############
#keywords2kb_dict
keywords2kb_dict = {}
for kb_id, lst in kb2keywords_dict.items():
for l in lst:
if l not in keywords2kb_dict.keys():
keywords2kb_dict[l] = [kb_id]
else:
keywords2kb_dict[l].append(kb_id)
############
# idea: topic_ID -> KB_ID -> keywords / subject -> llda
# ticket2kb_dict {'INC65627': 'KBA10044', 'INC66057': 'KBA10009', ...}
# kb2keywords_dict {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...}
# keywords2kb_dict {'unicard namensaenderung': ['KBA10276'], 'vpn': ['KBA10063'], 'outlook_exchange': ['KBA10181'], ...}
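# minimal sketch of that idea (ids taken from the example comments above):
#   kb_id  = ticket2kb_dict["INC65627"]       # 'KBA10044'
#   labels = kb2keywords_dict.get(kb_id, [])  # keywords (or subject) of that kb entry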
# Look for actually used keywords
used_keywords = []
for doc in corpus:
ticket_number = doc.metadata["TicketNumber"]
kb_number = ticket2kb_dict.get(ticket_number, None)
keywords = kb2keywords_dict.get(kb_number, None)
if keywords and kb_number:
used_keywords.append(list(map(normalize,keywords)))
kb_entries_used = len(set(ticket2kb_dict.values()))
print("kb_entries_used: {}".format(kb_entries_used))
labelist = [item for sublist in used_keywords for item in sublist]
labelist = list(set(labelist))
print("len(labelist): {}".format(len(labelist)))
labeldict = {k: v for v, k in enumerate(labelist)}
labeldict_rev = {v: k for k, v in labeldict.items()}
print("labledict created")
def genos_linos(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict):
for doc in textacyCorpus:
ticket_number = doc.metadata["TicketNumber"]
kb_number = ticket2kb_dict.get(ticket_number, None)
keywords = kb2keywords_dict.get(kb_number, None)
if keywords and kb_number:
label = ""
for kw in keywords:
label = label + str(labeldict.get( normalize(str(kw)) , len(labeldict))) + " "
yield "[ " + label + "] " + doc.text
line_gen = genos_linos(corpus, labeldict, ticket2kb_dict, kb2keywords_dict)
textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)
# wait for file to exist
while not os.path.exists(LLDA_filepath):
time.sleep(1)
logprint("")
logprint("start LLDA:")
# run JGibbsLLDA file
n_topics = len(labeldict) #+1 #default-topic
FNULL = open(os.devnull, 'w') # suppress output
cmd_jgibbs_java = ["java", "-cp",
"{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
jgibbsLLDA_root),
"jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
"tickets.gz",
"-twords", str(top_topic_words), "-ntopics", str(n_topics)]
subprocess.call(cmd_jgibbs_java, stdout=FNULL)
# NOTE: the files are hidden; they can be found in models/
cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
output = subprocess.check_output(cmd_gzip).decode("utf-8")
topic_regex = re.compile(r'Topic [0-9]*')
#####################################
# todo: save the results to a file based on the results list
result = []
for line in output.splitlines():
findall = topic_regex.findall(line)
if len(findall) != 0:
try:
index = int(findall[0].split()[1])
result.append("Topic {} {}:".format(index, labeldict_rev[index]))
except (IndexError, ValueError):
result.append(line)
else:
result.append(line)
textacy.fileio.write_file_lines(result, path2save_results+".txt")
#####################################
results = []
res_dict = {}
count =0
for line in output.splitlines():
findall = topic_regex.findall(line)
if len(findall) != 0:
if len(res_dict) != 0:
results.append(res_dict) # append the previous topic's dict to the list (it is complete at this point)
index = int(findall[0].split()[1])
res_dict = {index : str(labeldict_rev[index]) }
else:
splitted = line.split()
res_dict[splitted[0]] = float(splitted[1])
if len(res_dict) != 0:
results.append(res_dict) # append the last dict to the list
# collect every term in the results into a list
terms=[]
for res in results:
for key,value in res.items():
if not isinstance(key, int) and not key in terms:
terms.append(key)
term2id = {t:i for i,t in enumerate(terms)} #and to dict
################# termite plot #####################################################################
topic_labels = list(range(len(labelist)))
term_labels = list(range(len(term2id))) #tuple([key for key in term2id.keys()])
term_topic_weights = np.zeros((len(term2id),len(topic_labels)))
for i,res in enumerate(results):
for key,value in res.items():
if not isinstance(key, int):
term_topic_weights[term2id[key]][i] = value
term_labels[term2id[key]] = key
else:
topic_labels[i] = labeldict_rev[key]
draw.draw_termite(
term_topic_weights, topic_labels, term_labels, save=path2save_results+".png")
end = time.time()
def main(use_cleaned=False, algorithm="llda"):
logprint("Topic Modeling: {0}".format(datetime.now()))
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
if use_cleaned:
preCorpus_name = "de" + "_clean_ticket"
resultspath = FILEPATH + "results/clean"
else:
preCorpus_name = "de" + "_pre_ticket"
resultspath = FILEPATH + "results/pre"
# load cleaned corpus
de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(de_corpus.lang))
"""
ngrams = 1
min_df = 1
max_df = 1.0
weighting = 'tf'
# weighting ='tfidf'
named_entities = False
printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
"""
if algorithm == "llda":
top_topic_words = 5
path2save_results = resultspath + "_cat_{}_{}".format(algorithm,"top"+str(top_topic_words))
jgibbsLLDA_category(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
kb_keywords = False
path2save_results = resultspath + "_kb_{}_{}_{}".format("keys" if kb_keywords else "subs",algorithm,"top"+str(top_topic_words))
jgibbsLLDA_KB(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
kb_keywords = True
path2save_results = resultspath + "_kb_{}_{}_{}".format("keys" if kb_keywords else "subs", algorithm,
"top" + str(top_topic_words))
jgibbsLLDA_KB(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
kb_keywords=kb_keywords)
"""
top_topic_words = 10
path2save_results = resultspath + "_{}_{}".format(algorithm,"top"+str(top_topic_words))
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
top_topic_words = 15
path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
top_topic_words = 20
path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
"""
else:
textacyTopicModeling(ngrams = 1,
min_df = 1,
max_df = 0.9,
topicModel = algorithm,
n_topics =15,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams=1,
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=20,
corpus=de_corpus)
textacyTopicModeling(ngrams=1,
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=25,
corpus=de_corpus)
textacyTopicModeling(ngrams=1,
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=30,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams=(1, 2),
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=15,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.9,
topicModel = algorithm,
n_topics =20,
corpus=de_corpus)
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.9,
topicModel = algorithm,
n_topics =25,
corpus=de_corpus)
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.9,
topicModel = algorithm,
n_topics =30,
corpus=de_corpus)
"""
if __name__ == "__main__":
main()