refactored

This commit is contained in:
jannis.grundmann 2017-11-17 11:46:57 +01:00
parent 0a6a68b8aa
commit 4a3683635e
14 changed files with 643 additions and 241 deletions

View File

@ -62,8 +62,6 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI
[preprocessing]
#ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC
autocorrect = false
#true
@ -72,26 +70,4 @@ custom_words=aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderu
[topicmodeling]
ngrams=(1,2)
min_df=0
max_df=1.0
no_below=20
no_above=0.5
topicModel=lda
top_topic_words=5
top_document_labels_per_topic=2

165
draw.py Normal file
View File

@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
try:
import matplotlib.pyplot as plt
except ImportError:
pass
# Matplotlib rc overrides that draw_termite applies through ``plt.rc_context``
# while it builds a figure: white figure/axes background, grid switched off,
# zero-length tick marks and a sans-serif font stack.
# NOTE(review): 'text.color', 'xtick.color' and 'ytick.color' are set to '1.0',
# which matplotlib presumably interprets as a white gray level; draw_termite
# passes color='gray' explicitly for its tick labels -- confirm the default is
# intended for any other text drawn under these settings.
RC_PARAMS = {'axes.axisbelow': True,
'axes.edgecolor': '.8',
'axes.facecolor': 'white',
'axes.grid': False,
'axes.labelcolor': '.15',
'axes.linewidth': 1.0,
'axes.labelpad' : 10.0,
'figure.facecolor': 'white',
'font.family': ['sans-serif'],
'font.sans-serif': ['Arial', 'Liberation Sans', 'sans-serif'],
'grid.color': '.8', 'grid.linestyle': '-',
'image.cmap': 'Greys',
'legend.frameon': False,
'legend.numpoints': 1, 'legend.scatterpoints': 1,
'lines.solid_capstyle': 'round',
'text.color': '1.0',
'xtick.color': '1.0', 'xtick.direction': 'out',
'xtick.major.size': 0.0, 'xtick.minor.size': 0.0,
'xtick.major.pad' : 5,
'ytick.color': '1.0', 'ytick.direction': 'out',
'ytick.major.size': 0.0, 'ytick.minor.size': 0.0,
'axes.ymargin' : 0.9,
'ytick.major.pad': 5}
# Six (light, dark) colour pairs used to highlight columns of a termite plot.
# Each colour is a 3-tuple of floats in [0, 1] (presumably RGB -- they are
# handed to matplotlib unchanged). draw_termite uses element [0] of a pair for
# the dot fill and the column grid line, and element [1] for the dot edge and
# the tick-label colour.
COLOR_PAIRS = (((0.65098041296005249, 0.80784314870834351, 0.89019608497619629),
(0.12572087695201239, 0.47323337360924367, 0.707327968232772)),
((0.68899655751153521, 0.8681737867056154, 0.54376011946622071),
(0.21171857311445125, 0.63326415104024547, 0.1812226118410335)),
((0.98320646005518297, 0.5980161709820524, 0.59423301088459368),
(0.89059593116535862, 0.10449827132271793, 0.11108035462744099)),
((0.99175701702342312, 0.74648213716698619, 0.43401768935077328),
(0.99990772780250103, 0.50099192647372981, 0.0051211073118098693)),
((0.78329874347238004, 0.68724338552531095, 0.8336793640080622),
(0.42485198495434734, 0.2511495584950722, 0.60386007743723258)),
((0.99760092286502611, 0.99489427150464516, 0.5965244373854468),
(0.69411766529083252, 0.3490196168422699, 0.15686275064945221)))
def draw_termite(values_mat, col_labels, row_labels,
                 highlight_cols=None, highlight_colors=None,
                 save=False, pow_x=0.66, pow_y=0.8):
    """
    Make a "termite" plot, typically used for assessing topic models with a tabular
    layout that promotes comparison of terms both within and across topics.

    Args:
        values_mat (``np.ndarray`` or matrix): matrix of values with shape
            (# row labels, # col labels) used to size the dots on the grid
        col_labels (seq[str]): labels used to identify x-axis ticks on the grid
        row_labels(seq[str]): labels used to identify y-axis ticks on the grid
        highlight_cols (int or seq[int], optional): indices for columns
            to visually highlight in the plot with contrasting colors; an
            empty sequence is treated like ``None`` (nothing highlighted)
        highlight_colors (tuple of 2-tuples): each 2-tuple corresponds to a pair
            of (light/dark) matplotlib-friendly colors used to highlight a single
            column; if not specified (default), a good set of 6 pairs are used.
            The i-th pair is assigned to the i-th entry of ``highlight_cols``.
        save (str, optional): give the full /path/to/fname on disk to save figure
        pow_x (float, optional): exponent applied to the number of rows to
            obtain the figure height (``n_rows ** pow_x``)
        pow_y (float, optional): exponent applied to the number of columns to
            obtain the figure width (``n_cols ** pow_y``)

    Returns:
        ``matplotlib.axes.Axes.axis``: axis on which termite plot is plotted

    Raises:
        ImportError: if matplotlib could not be imported
        ValueError: if more columns are selected for highlighting than colors
            or if any of the inputs' dimensions don't match

    References:
        .. Chuang, Jason, Christopher D. Manning, and Jeffrey Heer. "Termite:
            Visualization techniques for assessing textual topic models."
            Proceedings of the International Working Conference on Advanced
            Visual Interfaces. ACM, 2012.

    .. seealso:: :func:`TopicModel.termite_plot <textacy.tm.TopicModel.termite_plot>`
    """
    # ``plt`` only exists if the guarded module-level matplotlib import succeeded.
    try:
        plt
    except NameError:
        raise ImportError(
            'matplotlib is not installed, so textacy.viz won\'t work; install it '
            'individually, or along with textacy via `pip install textacy[viz]`')

    n_rows, n_cols = values_mat.shape
    if n_rows != len(row_labels):
        msg = "values_mat and row_labels dimensions don't match: {} vs. {}".format(
            n_rows, len(row_labels))
        raise ValueError(msg)
    if n_cols != len(col_labels):
        msg = "values_mat and col_labels dimensions don't match: {} vs. {}".format(
            n_cols, len(col_labels))
        raise ValueError(msg)

    max_val = np.max(values_mat)
    # An all-zero matrix would otherwise produce NaN dot sizes (0 / 0).
    size_scale = max_val if max_val > 0 else 1.0

    if highlight_colors is None:
        highlight_colors = COLOR_PAIRS
    if highlight_cols is not None:
        if isinstance(highlight_cols, (int, np.integer)):
            highlight_cols = (highlight_cols,)
        if len(highlight_cols) == 0:
            # Nothing to highlight; also avoids max() over an empty sequence below.
            highlight_cols = None
        elif len(highlight_cols) > len(highlight_colors):
            msg = 'no more than {} columns may be highlighted at once'.format(
                len(highlight_colors))
            raise ValueError(msg)
        else:
            # Map column index -> colour pair, honouring caller-supplied colours
            # (previously the default COLOR_PAIRS were always used here).
            highlight_colors = {hc: highlight_colors[i]
                                for i, hc in enumerate(highlight_cols)}

    with plt.rc_context(RC_PARAMS):
        # Figure size grows sub-linearly with the grid: width = n_cols ** pow_y,
        # height = n_rows ** pow_x.
        fig, ax = plt.subplots(figsize=(pow(n_cols, pow_y), pow(n_rows, pow_x)))

        _ = ax.set_yticks(range(n_rows))
        yticklabels = ax.set_yticklabels(row_labels,
                                         fontsize=14, color='gray')
        if highlight_cols is not None:
            # Colour a row label after the highlighted column(s) in which the
            # row reaches its largest (strictly positive) value.
            for i, ticklabel in enumerate(yticklabels):
                max_tick_val = max(values_mat[i, hc] for hc in highlight_cols)
                for hc in highlight_cols:
                    if max_tick_val > 0 and values_mat[i, hc] == max_tick_val:
                        ticklabel.set_color(highlight_colors[hc][1])

        ax.get_xaxis().set_ticks_position('top')
        _ = ax.set_xticks(range(n_cols))
        xticklabels = ax.set_xticklabels(col_labels,
                                         fontsize=14, color='gray',
                                         rotation=30, ha='left')
        if highlight_cols is not None:
            gridlines = ax.get_xgridlines()
            for i, ticklabel in enumerate(xticklabels):
                if i in highlight_cols:
                    ticklabel.set_color(highlight_colors[i][1])
                    gridlines[i].set_color(highlight_colors[i][0])
                    gridlines[i].set_alpha(0.5)

        # One scatter call per column; dot area is proportional to the value
        # relative to the matrix maximum.
        row_positions = list(range(n_rows))
        for col_ind in range(n_cols):
            if highlight_cols is not None and col_ind in highlight_cols:
                fill_color = highlight_colors[col_ind][0]
                edge_color = highlight_colors[col_ind][1]
            else:
                fill_color, edge_color = 'black', 'gray'
            ax.scatter([col_ind] * n_rows,
                       row_positions,
                       s=600 * (values_mat[:, col_ind] / size_scale),
                       alpha=0.5, linewidth=1,
                       color=fill_color, edgecolor=edge_color)

        _ = ax.set_xlim(left=-1, right=n_cols)
        _ = ax.set_ylim(bottom=-1, top=n_rows)

        ax.invert_yaxis()  # otherwise, values/labels go from bottom to top

        if save:
            fig.savefig(save, bbox_inches='tight', dpi=100)

    return ax

105
draw1.py Normal file
View File

@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.externals import joblib
from textacy import viz
import draw
LOGGER = logging.getLogger(__name__)
def termite_plot(model, doc_term_matrix, id2term,
                 topics=-1, sort_topics_by='index', highlight_topics=None,
                 n_terms=25, rank_terms_by='topic_weight', sort_terms_by='seriation',
                 save=False, pow_x=0.66, pow_y=0.8):
    """
    Select topics and top terms from a fitted topic model and draw their
    term-by-topic weights as a termite plot via ``draw.draw_termite``.

    Args:
        model: fitted topic model wrapper. This function reads
            ``model.n_topics`` and ``model.model.components_`` and, for
            ``sort_topics_by='weight'``, calls ``model.transform`` and
            ``model.topic_weights``.
        doc_term_matrix: document-term matrix; summed over axis 0 when
            ``rank_terms_by='corpus_weight'`` and passed to ``model.transform``
            when ``sort_topics_by='weight'``
        id2term: mapping from term index to term string
        topics (int or seq[int]): topic indices to plot; -1 selects all topics
        sort_topics_by (str): 'index' or 'weight'
        highlight_topics (int or seq[int], optional): topics to highlight
            (at most 6)
        n_terms (int): number of top-ranked terms to include
        rank_terms_by (str): 'topic_weight' or 'corpus_weight'
        sort_terms_by (str): 'weight', 'index', 'alphabetical' or 'seriation'
        save (str, optional): forwarded to ``draw.draw_termite``
        pow_x (float): forwarded to ``draw.draw_termite``
        pow_y (float): forwarded to ``draw.draw_termite``

    Returns:
        whatever ``draw.draw_termite`` returns (the matplotlib axis)

    Raises:
        ValueError: if more than 6 topics are highlighted or if
            ``sort_topics_by``, ``rank_terms_by`` or ``sort_terms_by`` is invalid
    """
    if highlight_topics is not None:
        if isinstance(highlight_topics, int):
            highlight_topics = (highlight_topics,)
        elif len(highlight_topics) > 6:
            # draw.draw_termite ships six default highlight colour pairs
            raise ValueError('no more than 6 topics may be highlighted at once')

    # get topics indices
    if topics == -1:
        topic_inds = tuple(range(model.n_topics))
    elif isinstance(topics, int):
        topic_inds = (topics,)
    else:
        topic_inds = tuple(topics)

    # get topic indices in sorted order
    if sort_topics_by == 'index':
        topic_inds = sorted(topic_inds)
    elif sort_topics_by == 'weight':
        topic_inds = tuple(topic_ind for topic_ind
                           in np.argsort(model.topic_weights(model.transform(doc_term_matrix)))[::-1]
                           if topic_ind in topic_inds)
    else:
        msg = 'invalid sort_topics_by value; must be in {}'.format(
            {'index', 'weight'})
        raise ValueError(msg)

    # get column index of any topics to highlight in termite plot;
    # an empty selection is passed on as "no highlighting" (None)
    if highlight_topics is not None:
        highlight_cols = tuple(col for col, topic_ind in enumerate(topic_inds)
                               if topic_ind in highlight_topics) or None
    else:
        highlight_cols = None

    # get top term indices
    if rank_terms_by == 'corpus_weight':
        term_inds = np.argsort(np.ravel(doc_term_matrix.sum(axis=0)))[:-n_terms - 1:-1]
    elif rank_terms_by == 'topic_weight':
        term_inds = np.argsort(model.model.components_.sum(axis=0))[:-n_terms - 1:-1]
    else:
        msg = 'invalid rank_terms_by value; must be in {}'.format(
            {'corpus_weight', 'topic_weight'})
        raise ValueError(msg)

    # get top term indices in sorted order
    if sort_terms_by == 'weight':
        pass
    elif sort_terms_by == 'index':
        term_inds = sorted(term_inds)
    elif sort_terms_by == 'alphabetical':
        term_inds = sorted(term_inds, key=lambda x: id2term[x])
    elif sort_terms_by == 'seriation':
        # A Fiedler vector only exists for two or more terms; with fewer
        # there is nothing to reorder.
        if len(term_inds) >= 2:
            topic_term_weights_mat = np.array(
                [model.model.components_[topic_ind][term_inds]
                 for topic_ind in topic_inds]).T
            # calculate similarity matrix
            similarity = np.dot(topic_term_weights_mat, topic_term_weights_mat.T)
            # subtract minimum of sim mat in order to keep sim mat nonnegative
            similarity = similarity - similarity.min()
            # compute the graph Laplacian (degree matrix minus similarity);
            # the degree is the row sum -- the builtin ``sum(mat, 1)`` used
            # before added a spurious 1 to every degree
            laplacian = np.diag(similarity.sum(axis=1)) - similarity
            eigvals, eigvecs = np.linalg.eigh(laplacian)
            # pick the eigenvector of the 2nd-smallest eigenvalue, using one
            # consistent ordering for eigenvalues and eigenvectors
            order = np.argsort(eigvals)
            fiedler = eigvecs[:, order[1]]
            # get permutation corresponding to sorting the 2nd eigenvector
            term_inds = [term_inds[i] for i in np.argsort(fiedler)]
    else:
        msg = 'invalid sort_terms_by value; must be in {}'.format(
            {'weight', 'index', 'alphabetical', 'seriation'})
        raise ValueError(msg)

    # get topic and term labels
    topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)
    term_labels = tuple(id2term[term_ind] for term_ind in term_inds)

    # get topic-term weights to size dots
    term_topic_weights = np.array([model.model.components_[topic_ind][term_inds]
                                   for topic_ind in topic_inds]).T

    return draw.draw_termite(
        term_topic_weights, topic_labels, term_labels,
        highlight_cols=highlight_cols, save=save, pow_x=pow_x, pow_y=pow_y)

31
main.py
View File

@ -14,8 +14,27 @@ from miscellaneous import *
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/printout_main.log &"
start = time.time()
# idee http://bigartm.org/
# idee http://wiki.languagetool.org/tips-and-tricks
# idee https://en.wikipedia.org/wiki/Noisy_text_analytics
# idee https://gate.ac.uk/family/
# todo llda topics zusammenfassen
# idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics
# frage wieviele tickets pro topic?
# todo modelle testen
# frage welche mitarbeiter bearbeiteten welche Topics? idee topics mit mitarbeiternummern ersetzen
# frage wenn 155 versch. kb-einträge benutzt wurden, wieso gibt es nur 139 topics?
# idee word vorher mit semantischen netz abgleichen: wenn zu weit entfernt, dann ignore
#todo FREITAG zeichnen, refactoring
"""
init.main()
logprint("")
@ -30,24 +49,26 @@ logprint("")
"""
#topicModeling.main(use_cleaned=False,algorithm="lsa")
#topicModeling.main(algorithm="lsa")
logprint("")
#topicModeling.main(use_cleaned=False,algorithm="nmf")
#topicModeling.main(algorithm="nmf")
logprint("")
#topicModeling.main(use_cleaned=False,algorithm="lda")
#topicModeling.main(algorithm="lda")
logprint("")
topicModeling.main(use_cleaned=False,algorithm="llda")
topicModeling.main(algorithm="llda")
logprint("")
"""
end = time.time()
logprint("Total Time Elapsed: {0} min".format((end - start) / 60))
#800*400

View File

@ -121,6 +121,18 @@ def list_from_files(*paths):
return list(map(textacy.preprocess.normalize_whitespace, liste))
def debug():
    """Do nothing; placeholder that always returns ``None``."""
    return None
def normalize(string):
    """Lower-case *string*, rewrite ß/ö/ü/ä as ss/oe/ue/ae and return the
    result of textacy's ``normalize_whitespace`` on it."""
    text = string.lower()
    # transliterate the German special characters one after another
    for special, plain in (("ß", "ss"), ("ö", "oe"), ("ü", "ue"), ("ä", "ae")):
        text = text.replace(special, plain)
    return textacy.preprocess.normalize_whitespace(text)
def deprecated(func):
"""This is a decorator which can be used to mark functions

View File

@ -364,7 +364,7 @@ def main():
removePOS(["PUNCT", "SPACE", "NUM"]),
removeWords(DE_STOP_WORDS + custom_words),
removeWords(DE_STOP_WORDS + custom_words + VORNAMEN),
#removeWords(DE_STOP_WORDS),
remove_long_words(),

181
test.py
View File

@ -1,31 +1,134 @@
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import re
import matplotlib
matplotlib.use('Agg')
import time
import json
# import spacy
# import textacy
from functools import reduce
import textacy
start = time.time()
import enchant
from datetime import datetime
import os
import xml.etree.ElementTree as ET
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
from miscellaneous import *
import numpy as np
start = time.time()
import json
import os.path
import subprocess
from textacy import Vectorizer, viz
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"
from miscellaneous import *
import textacy
from scipy import *
import os
import json
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
import draw
# kb2keywords_dict
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv",
delimiter=";")
next(kb2keywords_gen, None) # skip first
used_kb=[]
for kb in kb2keywords_gen:
used_kb.append(kb[1])
print("used_kb: {}".format(len(list(set(used_kb)))))
# von 260 kb einträgen insg. wurden 155 genutzt
#"ArticleID";"Subject";"Keywords";"Solution";"SolutionText";"CreatedOn"
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", #
delimiter=";")
next(kb2keywords_gen, None) # skip first
cats=[]
subjects=[]
keywords=[]
for kb in kb2keywords_gen:
cats.append(kb[0])
subjects.append(kb[1])
keywords.append(kb[2].split(","))
cats_lst = list(set(cats))
print("cats: {}".format(len(cats_lst)))
print(cats_lst[0:20])
print(len(subjects))
subj_lst = list(set(subjects)) #frage: hat wirklich jeder kb_eintrag ein anderesn Betreff?
print("subjects: {}".format(len(subj_lst)))
print(subj_lst[0:20])
keywords = [item for sublist in keywords for item in sublist]
kys_lst = list(set(keywords))
print("keywords: {}".format(len(kys_lst)))
print(kys_lst[0:20])
used_list = ['bd_unicard_nicht_eingeschrieben', 'sd_vpn_temporaerer fehler ub', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_beschaedigte unicard', 'sd_boss_notenverbuchung', 'd.3 client installation', 'sd_keine rueckantwort kunde', 'sd_asknet_und_dreamspark', 'sd_beantragung_unicard', 'sd_gastaufenthalter', 'sd_internationaloffice', 'sd_outlook anmeldung gestoert', 'unicard_restbetrag_auszahlung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'ub_unicard_unicard mit vollmacht abholen', 'sd_namensaenderung mitarbeiter', 'sd_itmc kurse anmeldebestaetigung', 'sd_zugriff_onlinedienste_rueckmeldung', 'benutzer zum redmine hinzufuegen', 'sd_unicard_gesperrte unicard entsperre', 'lsf freischaltung als mitarbeiter/in', 'sd_mail_als_anhang', 'sd-e-mail_adresse_funktional_beantragen', 'sd_goeke drucker', 'sd_unimail imap_pop3', 'sd_origin_workaround', 'sd_matlab lizenzdatei pc-pools', 'sd_outlook kontakte automatische aktualisierung', 'sd_sap konteneinsicht antrag', 'ohne betreff', 'sd_telefonantrag_änderung_neuantrag', 'sd_sophos download', 'sd_geraeteausleihe', 'studierendenausweis', 'sd_citavi', 'sd_laufzeit unimail account', 'sd_login_unibib ub-it', 'sd_tu_app_keine internetverbindung', 'sd_unicard_max_laufzeit', 'ub_unicard_zusendung der karte moeglich?', 'sd_telefonbuch-eintrag_änderung', 'ub_drucker kopierer', 'windows 10', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'sd_tu-app feedback standard', 'sd_spam e-mail bekannt meldung', 'sd_spss_online_bestellung', 'sd_apple-on-campus', 'sd_studisek', 'sd_office 365 plus support', 'sd_sap_initialkennwort_englisch', 'sd_office365_asknet', 're: elektroarbeiten fuer leitsystem 2. und 3. 
obergeschoss', 'sd_login tu portale', 'ungueltiges ticket siehe journal', 'sd_sap_freischaltung ohne passwortaenderung', 'bd_unicard_geldkarte_laden', 'sd_verlust/antrag unicard', 'sd_unimail zu exchange', 'citavi_lizenzschluessel_nicht bekommen', 'sd_plotauftrag_zv', 'sd_citavi_support', 'sd_antworten_korrekt', 'sd_wlan-gastkonto', 'sd_antwort_phishingmail', 'bd_unicard_freigabe_beantragung', 'sd_origin nur noch eine seriennummer', 'cm_asiexception', 'sd_login_tu_portale', 'sd_webmailer_thread-anzeige', 'apps_dms-passwort d.3', 'apps_redmine_repository', 'sd_uniaccount_passwortaenderung', 'sd_phishing', 'sd_sap_firefox_esr', 'vpn verbindung fuer unitymedia kunden', 'sd_kurs-angebote anmeldung', 'sd_unicard fehlerhafte geldbuchung', 'sd_uniaccount_ehemalige_passwortaenderung', 'sd_sap_dienstreise', 'cm_lsf-boss_freischaltung', 'wlan', 'uni card', 'sd_webmailer einrichtung weiterleitung', 'spam ohne tu bezug', 'sd_outlook_in_exchange_einbinden', 'sd_wlan_beratung', 'sd_uniaccount_dauer freischaltung', 'sd_sap_konteneinsicht_ workaround', 'sd_vpn anleitungen', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_unicard_abholung', 'sd_vpn_probleme_mit_unitymedia', 'sd_diensthandy beschaffung', 'sd_unicard_defekt', 'sd_freischaltung uniaccount verzoegert', 'sd_kurs-angebote itmc', 'bd_goeke_allgemein', 'sd_uniaccount_ehemalige_studierende', 'sd_stellenausschreibung schwarzes brett', 'freischaltung uniaccount', 'sd_unicard_workaround_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'ub_geldchip-problem bei uc', 'sd_semesterticket', 'problem mit der beantragung von der unicard', 'sd_citavi bestellung', 'sd_immatrikulationsbescheigung_druckfehler', 'sd_vpn_aktualisierung', 'vpn_ipsec_stoerung', 'sd_dreamspark', 'ub_namensaenderung', 'sd_immatrikulationsbescheinigung_portal', 'ub_prod_neue unicard bei beschaedigung', 'sd_vpn_webvpn', 'sd_telefonbuch_prof_eintragung', 'sd_kontakt_asknet', 'probleme mit unicard', 'sd_office 356 plus bestellung', 'sd_gmx_web.de', 
'fehlender eintrag im elektronischen telefonbuch', 'ub_prod_namenskorrektur_student', 'einrichtung des eduroam netzwerks', 'sd_sap_initialkennwort', 'sd_boss-bescheinigung', 'sd_wlan passwort setzen', 'sd_aktivierung uniaccount', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_heirat_namensaenderung_student', 'ub_unicard_spaetere abholung moeglich?', 'unicard nochmal beantragen', 'sd_studisek_buchung_semesterbeitrag', 'sd_pruefungsamt', 'unicard vergessen abzuholen und nicht mehr da', 'sd_antrag funktionale mailadresse', 'sd_email_namensaenderung', 'sd_telefonbuch, neues system', 'sd_account_abmelden', 'ub_unicard_abholungszeiten']
labellist = ['sd_antworten_korrekt', 'sd_kurs-angebote anmeldung', 'sd_semesterticket', 'apps_dms-passwort d.3', 'freischaltung uniaccount', 'sd_heirat_namensaenderung_student', 'bd_unicard_freigabe_beantragung', 'sd_uniaccount_ehemalige_studierende', 'sd_sap_dienstreise', 'sd_origin_workaround', 'sd_uniaccount_ehemalige_passwortaenderung', 'fehlender eintrag im elektronischen telefonbuch', 'wlan', 'sd_tu-app feedback standard', 'sd_wlan_beratung', 'sd_uniaccount_passwortaenderung', 're: elektroarbeiten fuer leitsystem 2. und 3. obergeschoss', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_unicard_spaetere abholung moeglich?', 'sd_citavi_support', 'sd_outlook kontakte automatische aktualisierung', 'sd_origin nur noch eine seriennummer', 'lsf freischaltung als mitarbeiter/in', 'cm_asiexception', 'sd_freischaltung uniaccount verzoegert', 'ub_unicard_zusendung der karte moeglich?', 'sd_login_unibib ub-it', 'uni card', 'sd_outlook anmeldung gestoert', 'd.3 client installation', 'ub_unicard_abholungszeiten', 'sd_antwort_phishingmail', 'sd_matlab lizenzdatei pc-pools', 'sd_sap_initialkennwort', 'sd_sap_freischaltung ohne passwortaenderung', 'sd_spss_online_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'sd_keine rueckantwort kunde', 'sd_unimail imap_pop3', 'sd_beantragung_unicard', 'sd_unicard_gesperrte unicard entsperre', 'sd_internationaloffice', 'unicard nochmal beantragen', 'sd_stellenausschreibung schwarzes brett', 'sd_sophos download', 'cm_lsf-boss_freischaltung', 'sd_verlust/antrag unicard', 'vpn_ipsec_stoerung', 'sd_account_abmelden', 'sd_outlook_in_exchange_einbinden', 'ub_namensaenderung', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'unicard vergessen abzuholen und nicht mehr da', 'apps_redmine_repository', 'einrichtung des eduroam netzwerks', 'sd_unicard_max_laufzeit', 'sd_gmx_web.de', 'sd_unicard fehlerhafte geldbuchung', 'sd_geraeteausleihe', 'spam ohne tu bezug', 'sd_uniaccount_dauer freischaltung', 'apps_dms_d.3 
client installation/login d.3 funktioniert nicht', 'sd_office 365 plus support', 'sd_unicard_defekt', 'sd_phishing', 'sd_goeke drucker', 'ub_unicard_unicard mit vollmacht abholen', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_pruefungsamt', 'sd_aktivierung uniaccount', 'sd_boss-bescheinigung', 'sd_sap_initialkennwort_englisch', 'bd_unicard_geldkarte_laden', 'sd_telefonbuch-eintrag_änderung', 'vpn verbindung fuer unitymedia kunden', 'sd_studisek', 'sd_antrag funktionale mailadresse', 'sd_asknet_und_dreamspark', 'sd_unicard_workaround_bestellung', 'sd_sap_firefox_esr', 'sd_vpn anleitungen', 'sd_office365_asknet', 'citavi_lizenzschluessel_nicht bekommen', 'sd_sap konteneinsicht antrag', 'sd_spam e-mail bekannt meldung', 'ub_prod_namenskorrektur_student', 'ub_beschaedigte unicard', 'sd_namensaenderung mitarbeiter', 'sd_mail_als_anhang', 'benutzer zum redmine hinzufuegen', 'sd_login_tu_portale', 'sd_email_namensaenderung', 'windows 10', 'ungueltiges ticket siehe journal', 'sd_vpn_temporaerer fehler ub', 'ub_prod_neue unicard bei beschaedigung', 'sd_dreamspark', 'sd_webmailer einrichtung weiterleitung', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_studisek_buchung_semesterbeitrag', 'sd_immatrikulationsbescheinigung_portal', 'sd_vpn_probleme_mit_unitymedia', 'sd-e-mail_adresse_funktional_beantragen', 'sd_diensthandy beschaffung', 'sd_vpn_webvpn', 'sd_laufzeit unimail account', 'sd_citavi', 'problem mit der beantragung von der unicard', 'sd_kurs-angebote itmc', 'sd_telefonbuch, neues system', 'sd_login tu portale', 'sd_wlan passwort setzen', 'sd_zugriff_onlinedienste_rueckmeldung', 'unicard_restbetrag_auszahlung', 'sd_immatrikulationsbescheigung_druckfehler', 'bd_unicard_nicht_eingeschrieben', 'sd_unimail zu exchange', 'sd_wlan-gastkonto', 'probleme mit unicard', 'sd_telefonbuch_prof_eintragung', 'sd_vpn_aktualisierung', 'sd_apple-on-campus', 'bd_goeke_allgemein', 'studierendenausweis', 'ub_drucker kopierer', 'sd_unicard_abholung', 'sd_office 356 plus bestellung', 'ohne 
betreff', 'sd_tu_app_keine internetverbindung', 'sd_boss_notenverbuchung', 'ub_geldchip-problem bei uc', 'sd_itmc kurse anmeldebestaetigung', 'sd_citavi bestellung', 'sd_telefonantrag_änderung_neuantrag', 'sd_sap_konteneinsicht_ workaround', 'sd_kontakt_asknet', 'sd_plotauftrag_zv', 'sd_webmailer_thread-anzeige', 'sd_gastaufenthalter']
for l in used_list:
if l not in labellist:
print(l)
print(len(used_list))
print(len(labellist))
# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
preCorpus_name = "de" + "_pre_ticket"
corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(corpus.lang))
#
#todo randomize
# 80/20 train/test split by position (documents are not shuffled).
split_index = int(float(len(corpus)) * 0.8)
corpus_train = corpus[0:split_index]
# The test part runs to the end of the corpus; the former upper bound of
# len(corpus)-1 silently dropped the last document.
corpus_test = corpus[split_index:len(corpus)]
# lda bild abdunkeln
# auschnitte
import numpy as np
matplotlib.use('Agg')
import matplotlib.pyplot as plt
end = time.time()
print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))
"""
vllt kategorien in unterkategorien aufteilen
allg:
utf-korregieren, bei sonderzeichen wörter trennen
namen raus, addressen nach grüßen
emails, urls, nummern raus
vllt sogar alles, was ebend jenes enthält (oder auf .toplvldomain bzw. sonderzeichen enthält oder alles was ein @ enthält
sinnvoller wörter von müll trennen: 8203;verfügung
abkürzungen raus: m.a, o.ä.
sinnlose bsp: nr54065467 455a33c5 tvt?= ------problem--------
"\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
"""
"""
# load config
@ -81,17 +184,6 @@ print(len(set(bla)))
print()
"""
x = [[1,2,3],[3,4,5]]
arr = np.array(x)
print(arr)
"""
#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name)
@ -104,6 +196,7 @@ list = [(key,value) for key,value in dict.items()]
list.sort(key=lambda tup : tup[1])
"""
"""
from spacy.tokens.doc import Doc as SpacyDoc
@ -137,7 +230,6 @@ textacy.fileio.write_file_lines(readCorpus(filepath),"/home/jannis.grundmann/Pyc
# printRandomDoc(raw_corpus)
"""
spacy_doc = PARSER("test")
save_obj(spacy_doc, "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
@ -155,6 +247,7 @@ laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_h
with open(LLDA_filepath, 'w') as file:
file.write(json.dumps(laveldict))
"""
"""
def load_corpus(corpus_path, corpus_name, lang="de"):
from pathlib import Path
@ -609,25 +702,3 @@ textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
"""
end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))
"""
vllt kategorien in unterkategorien aufteilen
allg:
utf-korregieren, bei sonderzeichen wörter trennen
namen raus, addressen nach grüßen
emails, urls, nummern raus
vllt sogar alles, was ebend jenes enthält (oder auf .toplvldomain bzw. sonderzeichen enthält oder alles was ein @ enthält
sinnvoller wörter von müll trennen: 8203;verfügung
abkürzungen raus: m.a, o.ä.
sinnlose bsp: nr54065467 455a33c5 tvt?= ------problem--------
"\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
"""

View File

@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-
from datetime import datetime
import draw
import draw1
import time
import numpy as np
@ -30,38 +31,9 @@ with open(config_ini) as f:
config.read_file(f)
def label2ID(label, labeldict):
    """Return the ID stored for *label* in *labeldict*; a label that is not a
    key maps to ``len(labeldict)``."""
    if label in labeldict:
        return labeldict[label]
    return len(labeldict)
def generate_lablelID_lines(textacyCorpus, labeldict):
    """Yield one line per document of the form ``[<label id>] <document text>``,
    where the label id is looked up from the document's ``categoryName``
    metadata via ``label2ID``."""
    for document in textacyCorpus:
        label_id = label2ID(document.metadata["categoryName"], labeldict)
        yield "".join(("[", str(label_id), "] ", document.text))
"""
def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
logprint(str("ngrams: {0}".format(ngrams)))
logprint(str("min_df: {0}".format(min_df)))
logprint(str("max_df: {0}".format(max_df)))
logprint(str("named_entities: {0}".format(named_entities)))
# printlog("vectorize corpi...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.__getattribute__("id_to_term")
for t in terms_list:
print(t)
logprint("doc_term_matrix: {0}".format(doc_term_matrix))
logprint("id2term: {0}".format(id2term))
"""
def textacyTopicModeling(corpus,
n_topics = 15, top_topic_words = 7, top_document_labels_per_topic = 5,
ngrams = 1, min_df=1, max_df=1.0,
ngrams = 1, min_df=1, max_df=0.9,
topicModel='lda'):
@ -73,10 +45,7 @@ def textacyTopicModeling(corpus,
logprint(
"############### Topic Modeling {0} ###########################".format(
topicModel))
logprint("#### Topic Modeling {0}".format(topicModel))
logprint(str("ngrams: {0}".format(ngrams)))
logprint(str("min_df: {0}".format(min_df)))
logprint(str("max_df: {0}".format(max_df)))
@ -93,7 +62,7 @@ def textacyTopicModeling(corpus,
#################### vectorize corpi ####################
###### vectorize corpi
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
@ -101,14 +70,9 @@ def textacyTopicModeling(corpus,
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.__getattribute__("id_to_term")
# printlog("terms_list: {0}".format(list(terms_list)))
# printlog("doc_term_matrix: {0}".format(doc_term_matrix))
##################### Initialize and train a topic model ##############################################
####### Initialize and train a topic model
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
@ -118,7 +82,7 @@ def textacyTopicModeling(corpus,
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
logprint('{0}: {1}'.format(topic_idx, " ".join(top_terms)))
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
logprint(topic_idx)
@ -132,66 +96,44 @@ def textacyTopicModeling(corpus,
grams_label = "uni" if ngrams == 1 else "bi"
model.termite_plot(doc_term_matrix, id2term,
draw1.termite_plot(model,doc_term_matrix, id2term,
n_terms=n_terms,
sort_terms_by=sort_terms_by,
rank_terms_by=rank_terms_by+'_weight',
save= FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label,topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by))
rank_terms_by=rank_terms_by + '_weight',
save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label, topicModel, n_topics,
n_terms, sort_terms_by, rank_terms_by))
end = time.time()
logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
def jgibbsLLDA(labeldict,line_gen,path2save_results, top_topic_words=7):
def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
start = time.time()
labeldict_rev = {v: k for k, v in labeldict.items()}
jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
# build dictionary of ticketcategories
labelist = []
for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
labelist.append(texdoc.metadata["categoryName"])
labeldict = {k: v for v, k in enumerate(labelist)}
reverse_labeldict = {v: k for k, v in labeldict.items()}
#and save
labeldict_path = FILEPATH + "results/labeldict.txt"
with open(labeldict_path, 'w') as file:
file.write(json.dumps(labeldict))
n_topics = len(labeldict) #+1 #default-topic
textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)
# create file with label_IDs (input for llda)
textacy.fileio.write_file_lines(generate_lablelID_lines(corpus, labeldict), filepath=LLDA_filepath)
# wait for file to exist
while not os.path.exists(LLDA_filepath):
time.sleep(1)
logprint("")
logprint("start LLDA:")
# run JGibbsLLDA file
n_topics = len(labeldict) #+1 #default-topic
FNULL = open(os.devnull, 'w') # supress output
cmd_jgibbs_java = ["java", "-cp",
"{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
@ -218,7 +160,7 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
if len(findall) != 0:
try:
index = int(findall[0].split()[1])
result.append("Topic {} {}:".format(index, reverse_labeldict[index]))
result.append("Topic {} {}:".format(index, labeldict_rev[index]))
except:
result.append(line)
@ -243,37 +185,15 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
index = int(findall[0].split()[1])
res_dict = {index : str(reverse_labeldict[index]) }
res_dict = {index : str(labeldict_rev[index]) }
else:
splitted = line.split()
res_dict[splitted[0]] = float(splitted[1])
### print terms that are topics
for s in list(res_dict.values()):
if isinstance(s,str) and splitted[0] in s:
vals = list(res_dict.values())
keys = list(res_dict.keys())
for v in vals:
if not isinstance(v,float):
print("{}".format(v))
print("{}".format(splitted[0]))
count +=1
print()
###
if len(res_dict) != 0:
results.append(res_dict) # letzes an die liste ran
print(count)
print(float(count)/float(len(labelist)))
# {0: 'betrieb', 'service': 0.24162679425837305, 'support': 0.24162679425837305, 'browser': 0.24162679425837305, 'unicard': 0.24162679425837305, 'telefon': 0.0023923444976076593}
# collect every term in the results into a list
@ -286,20 +206,12 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
term2id = {t:i for i,t in enumerate(terms)} #and to dict
################# termite plot #####################################################################
#term_topic_weights.shape = (len(term_ids),len(topic_ids)
#topic_labels = tuple(labelist)
topic_labels = list(range(len(labelist)))
topic_labels = list(range(len(labeldict)))
term_labels = list(range(len(term2id))) #tuple([key for key in term2id.keys()])
term_topic_weights = np.zeros((len(term2id),len(topic_labels)))
for i,res in enumerate(results):
for key,value in res.items():
@ -308,77 +220,223 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
term_topic_weights[term2id[key]][i] = value
term_labels[term2id[key]] = key
else:
topic_labels[i] = reverse_labeldict[key]
topic_labels[i] = labeldict_rev[key]
viz.draw_termite_plot(
draw.draw_termite(
term_topic_weights, topic_labels, term_labels, save=path2save_results+".png")
draw.draw_termite(
term_topic_weights, topic_labels, term_labels, save=path2save_results+"_spaced.png",pow_x=0.78,pow_y=0.87)
# save labeldict
labeldict_path = path2save_results + "_labeldict.json"
with open(labeldict_path, 'w') as file:
file.write(json.dumps(labeldict))
def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
    """Run labeled LDA using each ticket's category name as its only label.

    The distinct ``categoryName`` metadata values found in ``corpus`` are
    mapped to integer label IDs, every document is rendered as
    ``[<label_id>] <text>``, and the resulting lines are handed to
    ``jgibbsLLDA`` together with the label mapping.

    :param corpus: iterable of documents exposing ``metadata["categoryName"]``
        and ``text`` (presumably a textacy Corpus -- confirm against caller).
    :param path2save_results: path prefix for the result files; the suffix
        ``_kb_cat_llda_top<N>`` is appended before it is passed on.
    :param top_topic_words: number of top words per topic, forwarded
        unchanged to ``jgibbsLLDA``.
    """
    start = time.time()

    logprint("")
    logprint("start Category-LLDA:")

    # Collect the distinct category names in first-seen order. A bare
    # list(set(...)) has no stable order for strings across interpreter runs,
    # which would make the label IDs differ from run to run.
    labelist = []
    seen = set()
    for doc in corpus:
        category = doc.metadata["categoryName"]
        if category not in seen:
            seen.add(category)
            labelist.append(category)
    print("len(labelist): {}".format(len(labelist)))

    # category name -> integer label ID
    labeldict = {label: idx for idx, label in enumerate(labelist)}

    def gen_cat_lines(textacyCorpus, labeldict):
        """Yield one "[<label_id>] <text>" line per document of the corpus."""
        for doc in textacyCorpus:
            # Unknown categories fall back to the ID len(labeldict).
            label_id = labeldict.get(doc.metadata["categoryName"], len(labeldict))
            yield "[" + str(label_id) + "] " + doc.text

    line_gen = gen_cat_lines(corpus, labeldict)

    path2save_results = path2save_results + "_kb_cat_llda_{}".format("top" + str(top_topic_words))

    jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words)

    end = time.time()
    logprint("\n\n\nTime Elapsed Category-LLDA :{0} min\n\n".format((end - start) / 60))
def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=False):
    """Run labeled LDA with knowledge-base keywords or subjects as labels.

    Label chain: ticket number -> KB article ID -> keywords (or subject).
    Two CSV exports below ``FILEPATH`` are read: one mapping ticket numbers to
    KB article IDs and one listing the KB articles (columns, per the header
    row: ArticleID, Subject, Keywords, ...). Every document whose ticket is
    linked to a KB article with at least one label is rendered as
    ``[ <id1> <id2> ... ] <text>`` and passed to ``jgibbsLLDA``.

    :param corpus: iterable of documents exposing ``metadata["TicketNumber"]``
        and ``text`` (presumably a textacy Corpus -- confirm against caller).
    :param path2save_results: path prefix for the result files; the suffix
        ``_kb_keys_llda_top<N>`` or ``_kb_subs_llda_top<N>`` is appended.
    :param top_topic_words: number of top words per topic, forwarded
        unchanged to ``jgibbsLLDA``.
    :param kb_keywords: if true, label documents with the normalized KB
        keywords; otherwise with the KB article subject.
    """
    start = time.time()
    mode = "Keyword" if kb_keywords else "Subject"

    logprint("")
    logprint("start {}-LLDA:".format(mode))

    # ticket number -> KB article ID
    kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
    ticket2kb_dict = {line[0]: line[1] for line in kb2ticket_gen}
    # {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...}

    kb_entries_used = len(set(ticket2kb_dict.values()))
    print("kb_entries_used: {}".format(kb_entries_used))

    # KB article ID -> list of labels (keywords or subject)
    kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
    next(kb2keywords_gen, None)  # skip header row ("ArticleID";"Subject";"Keywords";...)

    kb2keywords_dict = {}
    for line in kb2keywords_gen:
        kb_id, subject, keywords = line[0], line[1], line[2]
        labels = kb2keywords_dict.setdefault(kb_id, [])
        if kb_keywords:
            for raw_keyword in str(keywords).split(","):
                keyword = normalize(raw_keyword)
                if keyword != "":
                    labels.append(keyword)
        else:
            labels.append(subject)

    # drop KB articles that ended up without any label
    kb2keywords_dict = {k: v for k, v in kb2keywords_dict.items() if v}
    # {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...}

    # idea http://bigartm.org/
    # idea http://wiki.languagetool.org/tips-and-tricks
    # idea https://en.wikipedia.org/wiki/Noisy_text_analytics
    # idea https://gate.ac.uk/family/

    # Collect only the labels actually referenced by tickets in the corpus,
    # in first-seen order. A bare list(set(...)) has no stable order for
    # strings across interpreter runs, which would shuffle the label IDs.
    labelist = []
    seen = set()
    for doc in corpus:
        kb_id = ticket2kb_dict.get(doc.metadata["TicketNumber"], None)
        keywords = kb2keywords_dict.get(kb_id, None)
        if keywords and kb_id:
            for keyword in keywords:
                label = normalize(keyword)
                if label not in seen:
                    seen.add(label)
                    labelist.append(label)
    print("len(labelist): {}".format(len(labelist)))

    # label -> integer label ID
    labeldict = {label: idx for idx, label in enumerate(labelist)}

    def gen_KB_lines(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict):
        """Yield "[ <id1> <id2> ... ] <text>" for each document that has KB labels."""
        for doc in textacyCorpus:
            kb_number = ticket2kb_dict.get(doc.metadata["TicketNumber"], None)
            keywords = kb2keywords_dict.get(kb_number, None)
            if keywords:
                # Every ID is followed by a single space; unknown labels fall
                # back to the ID len(labeldict).
                label = "".join(
                    str(labeldict.get(normalize(str(kw)), len(labeldict))) + " "
                    for kw in keywords
                )
                yield "[ " + label + "] " + doc.text

    line_gen = gen_KB_lines(corpus, labeldict, ticket2kb_dict, kb2keywords_dict)

    path2save_results = path2save_results + "_kb_{}_llda_{}".format("keys" if kb_keywords else "subs",
                                                                    "top" + str(top_topic_words))

    jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words)

    end = time.time()
    logprint("\n\n\nTime Elapsed {1}-LLDA :{0} min\n\n".format((end - start) / 60, mode))
def main( algorithm="llda"):
logprint("Topic Modeling: {0}".format(datetime.now()))
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
if use_cleaned:
preCorpus_name = "de" + "_clean_ticket"
resultspath = FILEPATH + "results/clean"
else:
preCorpus_name = "de" + "_pre_ticket"
resultspath = FILEPATH + "results/pre"
preCorpus_name = "de" + "_pre_ticket"
resultspath = FILEPATH + "results/pre"
# load cleand corpus
# load corpus
de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(de_corpus.lang))
# todo llda topics zusammenfassen
# idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics
# frage wieviele tickets pro topic?
"""
ngrams = 1
min_df = 1
max_df = 1.0
weighting = 'tf'
# weighting ='tfidf'
named_entities = False
printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
"""
if algorithm == "llda":
top_topic_words = 5
path2save_results = resultspath + "_{}_{}".format(algorithm,"top"+str(top_topic_words))
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
jgibbsLLDA_category(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)
kb_keywords = False
jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
kb_keywords = True
jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
"""
top_topic_words = 10
@ -399,10 +457,7 @@ def main(use_cleaned=False, algorithm="llda"):
textacyTopicModeling(ngrams = 1,
min_df = 1,
max_df = 0.9,
topicModel = algorithm,
n_topics =15,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams=1,
@ -411,7 +466,7 @@ def main(use_cleaned=False, algorithm="llda"):
topicModel=algorithm,
n_topics=20,
corpus=de_corpus)
textacyTopicModeling(ngrams=1,
min_df=1,
max_df=0.9,
@ -430,10 +485,7 @@ def main(use_cleaned=False, algorithm="llda"):
textacyTopicModeling(ngrams=(1, 2),
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=15,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams = (1,2),
@ -442,7 +494,7 @@ def main(use_cleaned=False, algorithm="llda"):
topicModel = algorithm,
n_topics =20,
corpus=de_corpus)
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.9,