diff --git a/config.ini b/config.ini index d176da5..3141464 100644 --- a/config.ini +++ b/config.ini @@ -62,8 +62,6 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI [preprocessing] -#ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC - autocorrect = false #true @@ -72,26 +70,4 @@ custom_words=aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderu -[topicmodeling] - -ngrams=(1,2) - -min_df=0 -max_df=1.0 -no_below=20 -no_above=0.5 - -topicModel=lda - -top_topic_words=5 - -top_document_labels_per_topic=2 - - - - - - - - diff --git a/draw.py b/draw.py new file mode 100644 index 0000000..05086f8 --- /dev/null +++ b/draw.py @@ -0,0 +1,165 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals + +import numpy as np +try: + import matplotlib.pyplot as plt +except ImportError: + pass + + +RC_PARAMS = {'axes.axisbelow': True, + 'axes.edgecolor': '.8', + 'axes.facecolor': 'white', + 'axes.grid': False, + 'axes.labelcolor': '.15', + 'axes.linewidth': 1.0, + 'axes.labelpad' : 10.0, + 'figure.facecolor': 'white', + 'font.family': ['sans-serif'], + 'font.sans-serif': ['Arial', 'Liberation Sans', 'sans-serif'], + 'grid.color': '.8', 'grid.linestyle': '-', + 'image.cmap': 'Greys', + 'legend.frameon': False, + 'legend.numpoints': 1, 'legend.scatterpoints': 1, + 'lines.solid_capstyle': 'round', + 'text.color': '1.0', + 'xtick.color': '1.0', 'xtick.direction': 'out', + 'xtick.major.size': 0.0, 'xtick.minor.size': 0.0, + 'xtick.major.pad' : 5, + 'ytick.color': '1.0', 'ytick.direction': 'out', + 'ytick.major.size': 0.0, 'ytick.minor.size': 0.0, + 'axes.ymargin' : 0.9, + 'ytick.major.pad': 5} + +COLOR_PAIRS = (((0.65098041296005249, 0.80784314870834351, 0.89019608497619629), + (0.12572087695201239, 0.47323337360924367, 0.707327968232772)), + ((0.68899655751153521, 0.8681737867056154, 0.54376011946622071), + (0.21171857311445125, 0.63326415104024547, 0.1812226118410335)), + ((0.98320646005518297, 0.5980161709820524, 0.59423301088459368), + (0.89059593116535862, 0.10449827132271793, 0.11108035462744099)), + ((0.99175701702342312, 0.74648213716698619, 0.43401768935077328), + (0.99990772780250103, 0.50099192647372981, 0.0051211073118098693)), + ((0.78329874347238004, 0.68724338552531095, 0.8336793640080622), + (0.42485198495434734, 0.2511495584950722, 0.60386007743723258)), + ((0.99760092286502611, 0.99489427150464516, 0.5965244373854468), + (0.69411766529083252, 0.3490196168422699, 0.15686275064945221))) + + +def draw_termite(values_mat, col_labels, row_labels, + highlight_cols=None, highlight_colors=None, + save=False, pow_x = 0.66, pow_y = 0.8): + """ + Make a "termite" plot, typically used for assessing topic models with a tabular + layout that promotes comparison of terms both within and across topics. 
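    Example:
        A minimal usage sketch (illustrative only -- the random data and the
        save path below are not part of the original code)::

            vals = np.random.rand(10, 4)        # shape (# row labels, # col labels)
            rows = ['term_{}'.format(i) for i in range(10)]
            cols = ['topic_{}'.format(j) for j in range(4)]
            ax = draw_termite(vals, cols, rows,
                              highlight_cols=0, save='/tmp/termite.png')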
+ + Args: + values_mat (``np.ndarray`` or matrix): matrix of values with shape + (# row labels, # col labels) used to size the dots on the grid + col_labels (seq[str]): labels used to identify x-axis ticks on the grid + row_labels(seq[str]): labels used to identify y-axis ticks on the grid + highlight_cols (int or seq[int], optional): indices for columns + to visually highlight in the plot with contrasting colors + highlight_colors (tuple of 2-tuples): each 2-tuple corresponds to a pair + of (light/dark) matplotlib-friendly colors used to highlight a single + column; if not specified (default), a good set of 6 pairs are used + save (str, optional): give the full /path/to/fname on disk to save figure + + Returns: + ``matplotlib.axes.Axes.axis``: axis on which termite plot is plotted + + Raises: + ValueError: if more columns are selected for highlighting than colors + or if any of the inputs' dimensions don't match + + References: + .. Chuang, Jason, Christopher D. Manning, and Jeffrey Heer. "Termite: + Visualization techniques for assessing textual topic models." + Proceedings of the International Working Conference on Advanced + Visual Interfaces. ACM, 2012. + + .. seealso:: :func:`TopicModel.termite_plot ` + """ + try: + plt + except NameError: + raise ImportError( + 'matplotlib is not installed, so textacy.viz won\'t work; install it \ + individually, or along with textacy via `pip install textacy[viz]`') + n_rows, n_cols = values_mat.shape + max_val = np.max(values_mat) + + if n_rows != len(row_labels): + msg = "values_mat and row_labels dimensions don't match: {} vs. {}".format( + n_rows, len(row_labels)) + raise ValueError(msg) + if n_cols != len(col_labels): + msg = "values_mat and col_labels dimensions don't match: {} vs. {}".format( + n_cols, len(col_labels)) + raise ValueError(msg) + + if highlight_colors is None: + highlight_colors = COLOR_PAIRS + if highlight_cols is not None: + if isinstance(highlight_cols, int): + highlight_cols = (highlight_cols,) + elif len(highlight_cols) > len(highlight_colors): + msg = 'no more than {} columns may be highlighted at once'.format( + len(highlight_colors)) + raise ValueError(msg) + highlight_colors = {hc: COLOR_PAIRS[i] + for i, hc in enumerate(highlight_cols)} + + with plt.rc_context(RC_PARAMS): + + fig, ax = plt.subplots(figsize=(pow(n_cols, pow_y), pow(n_rows, pow_x))) #hier fesntergröße + + + _ = ax.set_yticks(range(n_rows)) + yticklabels = ax.set_yticklabels(row_labels, + fontsize=14, color='gray') + if highlight_cols is not None: + for i, ticklabel in enumerate(yticklabels): + max_tick_val = max(values_mat[i, hc] for hc in highlight_cols) + for hc in highlight_cols: + if max_tick_val > 0 and values_mat[i, hc] == max_tick_val: + ticklabel.set_color(highlight_colors[hc][1]) + + ax.get_xaxis().set_ticks_position('top') + _ = ax.set_xticks(range(n_cols)) + xticklabels = ax.set_xticklabels(col_labels, + fontsize=14, color='gray', + rotation=30, ha='left') + if highlight_cols is not None: + gridlines = ax.get_xgridlines() + for i, ticklabel in enumerate(xticklabels): + if i in highlight_cols: + ticklabel.set_color(highlight_colors[i][1]) + gridlines[i].set_color(highlight_colors[i][0]) + gridlines[i].set_alpha(0.5) + + for col_ind in range(n_cols): + if highlight_cols is not None and col_ind in highlight_cols: + ax.scatter([col_ind for _ in range(n_rows)], + [i for i in range(n_rows)], + s=600 * (values_mat[:, col_ind] / max_val), + alpha=0.5, linewidth=1, + color=highlight_colors[col_ind][0], + edgecolor=highlight_colors[col_ind][1]) + 
else: + ax.scatter([col_ind for _ in range(n_rows)], + [i for i in range(n_rows)], + s=600 * (values_mat[:, col_ind] / max_val), + alpha=0.5, linewidth=1, + color='black', edgecolor='gray') + + _ = ax.set_xlim(left=-1, right=n_cols) + _ = ax.set_ylim(bottom=-1, top=n_rows) + + ax.invert_yaxis() # otherwise, values/labels go from bottom to top + #plt.ylim(ymax=5) + + if save: + fig.savefig(save, bbox_inches='tight', dpi=100) + + return ax diff --git a/draw1.py b/draw1.py new file mode 100644 index 0000000..f3a9aa2 --- /dev/null +++ b/draw1.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging + +import numpy as np +from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD +from sklearn.externals import joblib + +from textacy import viz +import draw + +LOGGER = logging.getLogger(__name__) + + + +def termite_plot(model, doc_term_matrix, id2term, + topics=-1, sort_topics_by='index', highlight_topics=None, + n_terms=25, rank_terms_by='topic_weight', sort_terms_by='seriation', + save=False, pow_x = 0.66, pow_y = 0.8): + + + + if highlight_topics is not None: + if isinstance(highlight_topics, int): + highlight_topics = (highlight_topics,) + elif len(highlight_topics) > 6: + raise ValueError('no more than 6 topics may be highlighted at once') + + # get topics indices + if topics == -1: + topic_inds = tuple(range(model.n_topics)) + elif isinstance(topics, int): + topic_inds = (topics,) + else: + topic_inds = tuple(topics) + + # get topic indices in sorted order + if sort_topics_by == 'index': + topic_inds = sorted(topic_inds) + elif sort_topics_by == 'weight': + topic_inds = tuple(topic_ind for topic_ind + in np.argsort(model.topic_weights(model.transform(doc_term_matrix)))[::-1] + if topic_ind in topic_inds) + else: + msg = 'invalid sort_topics_by value; must be in {}'.format( + {'index', 'weight'}) + raise ValueError(msg) + + # get column index of any topics to highlight in termite plot + if highlight_topics is not None: + highlight_cols = tuple(i for i in range(len(topic_inds)) + if topic_inds[i] in highlight_topics) + else: + highlight_cols = None + + # get top term indices + if rank_terms_by == 'corpus_weight': + term_inds = np.argsort(np.ravel(doc_term_matrix.sum(axis=0)))[:-n_terms - 1:-1] + elif rank_terms_by == 'topic_weight': + term_inds = np.argsort(model.model.components_.sum(axis=0))[:-n_terms - 1:-1] + else: + msg = 'invalid rank_terms_by value; must be in {}'.format( + {'corpus_weight', 'topic_weight'}) + raise ValueError(msg) + + # get top term indices in sorted order + if sort_terms_by == 'weight': + pass + elif sort_terms_by == 'index': + term_inds = sorted(term_inds) + elif sort_terms_by == 'alphabetical': + term_inds = sorted(term_inds, key=lambda x: id2term[x]) + elif sort_terms_by == 'seriation': + topic_term_weights_mat = np.array( + np.array([model.model.components_[topic_ind][term_inds] + for topic_ind in topic_inds])).T + # calculate similarity matrix + topic_term_weights_sim = np.dot(topic_term_weights_mat, topic_term_weights_mat.T) + # substract minimum of sim mat in order to keep sim mat nonnegative + topic_term_weights_sim = topic_term_weights_sim - topic_term_weights_sim.min() + # compute Laplacian matrice and its 2nd eigenvector + L = np.diag(sum(topic_term_weights_sim, 1)) - topic_term_weights_sim + D, V = np.linalg.eigh(L) + D = D[np.argsort(D)] + V = V[:, np.argsort(D)] + fiedler = V[:, 1] + # get permutation corresponding to sorting the 2nd 
eigenvector + term_inds = [term_inds[i] for i in np.argsort(fiedler)] + else: + msg = 'invalid sort_terms_by value; must be in {}'.format( + {'weight', 'index', 'alphabetical', 'seriation'}) + raise ValueError(msg) + + # get topic and term labels + topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds) + term_labels = tuple(id2term[term_ind] for term_ind in term_inds) + + # get topic-term weights to size dots + term_topic_weights = np.array([model.model.components_[topic_ind][term_inds] + for topic_ind in topic_inds]).T + + return draw.draw_termite( + term_topic_weights, topic_labels, term_labels, + highlight_cols=highlight_cols, save=save, pow_x = pow_x, pow_y = pow_y) diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz index 4493c78..ea94e7e 100644 Binary files a/java_LabledLDA/models/tickets/.others.gz and b/java_LabledLDA/models/tickets/.others.gz differ diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz index 1936775..5c5f7d3 100644 Binary files a/java_LabledLDA/models/tickets/.tassign.gz and b/java_LabledLDA/models/tickets/.tassign.gz differ diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz index 87d0329..9309100 100644 Binary files a/java_LabledLDA/models/tickets/.theta.gz and b/java_LabledLDA/models/tickets/.theta.gz differ diff --git a/java_LabledLDA/models/tickets/.twords.gz b/java_LabledLDA/models/tickets/.twords.gz index 031c94d..84e04a9 100644 Binary files a/java_LabledLDA/models/tickets/.twords.gz and b/java_LabledLDA/models/tickets/.twords.gz differ diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz index 31a50f4..5a09245 100644 Binary files a/java_LabledLDA/models/tickets/.wordmap.gz and b/java_LabledLDA/models/tickets/.wordmap.gz differ diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz index e9a4831..0e5edec 100644 Binary files a/java_LabledLDA/models/tickets/tickets.gz and b/java_LabledLDA/models/tickets/tickets.gz differ diff --git a/main.py b/main.py index dbabe89..343b826 100644 --- a/main.py +++ b/main.py @@ -14,8 +14,27 @@ from miscellaneous import * # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/printout_main.log &" start = time.time() +# idee http://bigartm.org/ +# idee http://wiki.languagetool.org/tips-and-tricks +# idee https://en.wikipedia.org/wiki/Noisy_text_analytics +# idee https://gate.ac.uk/family/ + +# todo llda topics zusammenfassen +# idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics +# frage wieviele tickets pro topic? +# todo modelle testen +# frage welche mitarbeiter bearbeiteten welche Topics? idee topics mit mitarbeiternummern erstzen +# frage wenn 155 versch. kb-einträge benutzt wurden, wieso gibt es nur 139 topics? 
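# One way to chase the "155 KB entries vs. 139 topics" question above is to count
# how many distinct labels actually survive normalization and deduplication:
# keywords/subjects that differ only in case, umlauts or whitespace collapse into
# a single label. Minimal sketch (illustrative, not part of this commit; CSV file
# name and column layout as used in test.py / topicModeling.py, counts unverified):
import textacy
from miscellaneous import normalize    # lowercases and rewrites ae/oe/ue/ss, defined in this diff

kb_gen = textacy.fileio.read_csv("M42-Export/KB_2017-09-13.csv", delimiter=";")
next(kb_gen, None)                     # skip the header line
raw, norm = set(), set()
for row in kb_gen:
    raw.add(row[1])                    # "Subject" column
    norm.add(normalize(row[1]))
print(len(raw), len(norm))             # any gap here means labels were merged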
+# idee word vorher mit semantischen netz abgleichen: wenn zu weit entfernt, dann ignore + +#todo FREITAG zeichnen, refactoring + + + +""" + init.main() logprint("") @@ -30,24 +49,26 @@ logprint("") """ -#topicModeling.main(use_cleaned=False,algorithm="lsa") + + +#topicModeling.main(algorithm="lsa") logprint("") -#topicModeling.main(use_cleaned=False,algorithm="nmf") +#topicModeling.main(algorithm="nmf") logprint("") -#topicModeling.main(use_cleaned=False,algorithm="lda") +#topicModeling.main(algorithm="lda") logprint("") -topicModeling.main(use_cleaned=False,algorithm="llda") +topicModeling.main(algorithm="llda") logprint("") -""" end = time.time() logprint("Total Time Elapsed: {0} min".format((end - start) / 60)) +#800*400 \ No newline at end of file diff --git a/miscellaneous.py b/miscellaneous.py index d6ac64f..cbdf9aa 100644 --- a/miscellaneous.py +++ b/miscellaneous.py @@ -121,6 +121,18 @@ def list_from_files(*paths): return list(map(textacy.preprocess.normalize_whitespace, liste)) +def debug(): + pass + +def normalize(string): + # replaceRockDots + string = re.sub(r'[ß]', "ss", string.lower()) + string = re.sub(r'[ö]', "oe", string) + string = re.sub(r'[ü]', "ue", string) + string = re.sub(r'[ä]', "ae", string) + string = textacy.preprocess.normalize_whitespace(string) + return string + def deprecated(func): """This is a decorator which can be used to mark functions diff --git a/preprocessing.py b/preprocessing.py index ca7a6a6..6327c50 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -364,7 +364,7 @@ def main(): removePOS(["PUNCT", "SPACE", "NUM"]), - removeWords(DE_STOP_WORDS + custom_words), + removeWords(DE_STOP_WORDS + custom_words + VORNAMEN), #removeWords(DE_STOP_WORDS), remove_long_words(), diff --git a/test.py b/test.py index 028652a..08b412e 100644 --- a/test.py +++ b/test.py @@ -1,31 +1,134 @@ # -*- coding: utf-8 -*- - - -# -*- coding: utf-8 -*- - -import re +import matplotlib +matplotlib.use('Agg') import time -import json - -# import spacy -# import textacy -from functools import reduce import textacy -start = time.time() - -import enchant - -from datetime import datetime -import os -import xml.etree.ElementTree as ET - -FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" -from miscellaneous import * import numpy as np +start = time.time() +import json +import os.path +import subprocess +from textacy import Vectorizer, viz -# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &" +from miscellaneous import * +import textacy +from scipy import * +import os +import json +FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" +import draw + + + +# kb2keywords_dict + +kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", + delimiter=";") +next(kb2keywords_gen, None) # skip first +used_kb=[] +for kb in kb2keywords_gen: + used_kb.append(kb[1]) +print("used_kb: {}".format(len(list(set(used_kb))))) + +# von 260 kb einträgen insg. 
wurden 155 genutzt + +#"ArticleID";"Subject";"Keywords";"Solution";"SolutionText";"CreatedOn" +kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", # + delimiter=";") +next(kb2keywords_gen, None) # skip first +cats=[] +subjects=[] +keywords=[] +for kb in kb2keywords_gen: + cats.append(kb[0]) + subjects.append(kb[1]) + keywords.append(kb[2].split(",")) + +cats_lst = list(set(cats)) +print("cats: {}".format(len(cats_lst))) +print(cats_lst[0:20]) + +print(len(subjects)) +subj_lst = list(set(subjects)) #frage: hat wirklich jeder kb_eintrag ein anderesn Betreff? +print("subjects: {}".format(len(subj_lst))) +print(subj_lst[0:20]) + +keywords = [item for sublist in keywords for item in sublist] + +kys_lst = list(set(keywords)) +print("keywords: {}".format(len(kys_lst))) +print(kys_lst[0:20]) + + + + +used_list = ['bd_unicard_nicht_eingeschrieben', 'sd_vpn_temporaerer fehler ub', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_beschaedigte unicard', 'sd_boss_notenverbuchung', 'd.3 client installation', 'sd_keine rueckantwort kunde', 'sd_asknet_und_dreamspark', 'sd_beantragung_unicard', 'sd_gastaufenthalter', 'sd_internationaloffice', 'sd_outlook anmeldung gestoert', 'unicard_restbetrag_auszahlung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'ub_unicard_unicard mit vollmacht abholen', 'sd_namensaenderung mitarbeiter', 'sd_itmc kurse anmeldebestaetigung', 'sd_zugriff_onlinedienste_rueckmeldung', 'benutzer zum redmine hinzufuegen', 'sd_unicard_gesperrte unicard entsperre', 'lsf freischaltung als mitarbeiter/in', 'sd_mail_als_anhang', 'sd-e-mail_adresse_funktional_beantragen', 'sd_goeke drucker', 'sd_unimail imap_pop3', 'sd_origin_workaround', 'sd_matlab lizenzdatei pc-pools', 'sd_outlook kontakte automatische aktualisierung', 'sd_sap konteneinsicht antrag', 'ohne betreff', 'sd_telefonantrag_änderung_neuantrag', 'sd_sophos download', 'sd_geraeteausleihe', 'studierendenausweis', 'sd_citavi', 'sd_laufzeit unimail account', 'sd_login_unibib ub-it', 'sd_tu_app_keine internetverbindung', 'sd_unicard_max_laufzeit', 'ub_unicard_zusendung der karte moeglich?', 'sd_telefonbuch-eintrag_änderung', 'ub_drucker kopierer', 'windows 10', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'sd_tu-app feedback standard', 'sd_spam e-mail bekannt meldung', 'sd_spss_online_bestellung', 'sd_apple-on-campus', 'sd_studisek', 'sd_office 365 plus support', 'sd_sap_initialkennwort_englisch', 'sd_office365_asknet', 're: elektroarbeiten fuer leitsystem 2. und 3. 
obergeschoss', 'sd_login tu portale', 'ungueltiges ticket siehe journal', 'sd_sap_freischaltung ohne passwortaenderung', 'bd_unicard_geldkarte_laden', 'sd_verlust/antrag unicard', 'sd_unimail zu exchange', 'citavi_lizenzschluessel_nicht bekommen', 'sd_plotauftrag_zv', 'sd_citavi_support', 'sd_antworten_korrekt', 'sd_wlan-gastkonto', 'sd_antwort_phishingmail', 'bd_unicard_freigabe_beantragung', 'sd_origin nur noch eine seriennummer', 'cm_asiexception', 'sd_login_tu_portale', 'sd_webmailer_thread-anzeige', 'apps_dms-passwort d.3', 'apps_redmine_repository', 'sd_uniaccount_passwortaenderung', 'sd_phishing', 'sd_sap_firefox_esr', 'vpn verbindung fuer unitymedia kunden', 'sd_kurs-angebote anmeldung', 'sd_unicard fehlerhafte geldbuchung', 'sd_uniaccount_ehemalige_passwortaenderung', 'sd_sap_dienstreise', 'cm_lsf-boss_freischaltung', 'wlan', 'uni card', 'sd_webmailer einrichtung weiterleitung', 'spam ohne tu bezug', 'sd_outlook_in_exchange_einbinden', 'sd_wlan_beratung', 'sd_uniaccount_dauer freischaltung', 'sd_sap_konteneinsicht_ workaround', 'sd_vpn anleitungen', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_unicard_abholung', 'sd_vpn_probleme_mit_unitymedia', 'sd_diensthandy beschaffung', 'sd_unicard_defekt', 'sd_freischaltung uniaccount verzoegert', 'sd_kurs-angebote itmc', 'bd_goeke_allgemein', 'sd_uniaccount_ehemalige_studierende', 'sd_stellenausschreibung schwarzes brett', 'freischaltung uniaccount', 'sd_unicard_workaround_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'ub_geldchip-problem bei uc', 'sd_semesterticket', 'problem mit der beantragung von der unicard', 'sd_citavi bestellung', 'sd_immatrikulationsbescheigung_druckfehler', 'sd_vpn_aktualisierung', 'vpn_ipsec_stoerung', 'sd_dreamspark', 'ub_namensaenderung', 'sd_immatrikulationsbescheinigung_portal', 'ub_prod_neue unicard bei beschaedigung', 'sd_vpn_webvpn', 'sd_telefonbuch_prof_eintragung', 'sd_kontakt_asknet', 'probleme mit unicard', 'sd_office 356 plus bestellung', 'sd_gmx_web.de', 'fehlender eintrag im elektronischen telefonbuch', 'ub_prod_namenskorrektur_student', 'einrichtung des eduroam netzwerks', 'sd_sap_initialkennwort', 'sd_boss-bescheinigung', 'sd_wlan passwort setzen', 'sd_aktivierung uniaccount', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_heirat_namensaenderung_student', 'ub_unicard_spaetere abholung moeglich?', 'unicard nochmal beantragen', 'sd_studisek_buchung_semesterbeitrag', 'sd_pruefungsamt', 'unicard vergessen abzuholen und nicht mehr da', 'sd_antrag funktionale mailadresse', 'sd_email_namensaenderung', 'sd_telefonbuch, neues system', 'sd_account_abmelden', 'ub_unicard_abholungszeiten'] +labellist = ['sd_antworten_korrekt', 'sd_kurs-angebote anmeldung', 'sd_semesterticket', 'apps_dms-passwort d.3', 'freischaltung uniaccount', 'sd_heirat_namensaenderung_student', 'bd_unicard_freigabe_beantragung', 'sd_uniaccount_ehemalige_studierende', 'sd_sap_dienstreise', 'sd_origin_workaround', 'sd_uniaccount_ehemalige_passwortaenderung', 'fehlender eintrag im elektronischen telefonbuch', 'wlan', 'sd_tu-app feedback standard', 'sd_wlan_beratung', 'sd_uniaccount_passwortaenderung', 're: elektroarbeiten fuer leitsystem 2. und 3. 
obergeschoss', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_unicard_spaetere abholung moeglich?', 'sd_citavi_support', 'sd_outlook kontakte automatische aktualisierung', 'sd_origin nur noch eine seriennummer', 'lsf freischaltung als mitarbeiter/in', 'cm_asiexception', 'sd_freischaltung uniaccount verzoegert', 'ub_unicard_zusendung der karte moeglich?', 'sd_login_unibib ub-it', 'uni card', 'sd_outlook anmeldung gestoert', 'd.3 client installation', 'ub_unicard_abholungszeiten', 'sd_antwort_phishingmail', 'sd_matlab lizenzdatei pc-pools', 'sd_sap_initialkennwort', 'sd_sap_freischaltung ohne passwortaenderung', 'sd_spss_online_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'sd_keine rueckantwort kunde', 'sd_unimail imap_pop3', 'sd_beantragung_unicard', 'sd_unicard_gesperrte unicard entsperre', 'sd_internationaloffice', 'unicard nochmal beantragen', 'sd_stellenausschreibung schwarzes brett', 'sd_sophos download', 'cm_lsf-boss_freischaltung', 'sd_verlust/antrag unicard', 'vpn_ipsec_stoerung', 'sd_account_abmelden', 'sd_outlook_in_exchange_einbinden', 'ub_namensaenderung', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'unicard vergessen abzuholen und nicht mehr da', 'apps_redmine_repository', 'einrichtung des eduroam netzwerks', 'sd_unicard_max_laufzeit', 'sd_gmx_web.de', 'sd_unicard fehlerhafte geldbuchung', 'sd_geraeteausleihe', 'spam ohne tu bezug', 'sd_uniaccount_dauer freischaltung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'sd_office 365 plus support', 'sd_unicard_defekt', 'sd_phishing', 'sd_goeke drucker', 'ub_unicard_unicard mit vollmacht abholen', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_pruefungsamt', 'sd_aktivierung uniaccount', 'sd_boss-bescheinigung', 'sd_sap_initialkennwort_englisch', 'bd_unicard_geldkarte_laden', 'sd_telefonbuch-eintrag_änderung', 'vpn verbindung fuer unitymedia kunden', 'sd_studisek', 'sd_antrag funktionale mailadresse', 'sd_asknet_und_dreamspark', 'sd_unicard_workaround_bestellung', 'sd_sap_firefox_esr', 'sd_vpn anleitungen', 'sd_office365_asknet', 'citavi_lizenzschluessel_nicht bekommen', 'sd_sap konteneinsicht antrag', 'sd_spam e-mail bekannt meldung', 'ub_prod_namenskorrektur_student', 'ub_beschaedigte unicard', 'sd_namensaenderung mitarbeiter', 'sd_mail_als_anhang', 'benutzer zum redmine hinzufuegen', 'sd_login_tu_portale', 'sd_email_namensaenderung', 'windows 10', 'ungueltiges ticket siehe journal', 'sd_vpn_temporaerer fehler ub', 'ub_prod_neue unicard bei beschaedigung', 'sd_dreamspark', 'sd_webmailer einrichtung weiterleitung', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_studisek_buchung_semesterbeitrag', 'sd_immatrikulationsbescheinigung_portal', 'sd_vpn_probleme_mit_unitymedia', 'sd-e-mail_adresse_funktional_beantragen', 'sd_diensthandy beschaffung', 'sd_vpn_webvpn', 'sd_laufzeit unimail account', 'sd_citavi', 'problem mit der beantragung von der unicard', 'sd_kurs-angebote itmc', 'sd_telefonbuch, neues system', 'sd_login tu portale', 'sd_wlan passwort setzen', 'sd_zugriff_onlinedienste_rueckmeldung', 'unicard_restbetrag_auszahlung', 'sd_immatrikulationsbescheigung_druckfehler', 'bd_unicard_nicht_eingeschrieben', 'sd_unimail zu exchange', 'sd_wlan-gastkonto', 'probleme mit unicard', 'sd_telefonbuch_prof_eintragung', 'sd_vpn_aktualisierung', 'sd_apple-on-campus', 'bd_goeke_allgemein', 'studierendenausweis', 'ub_drucker kopierer', 'sd_unicard_abholung', 'sd_office 356 plus bestellung', 'ohne betreff', 'sd_tu_app_keine internetverbindung', 'sd_boss_notenverbuchung', 'ub_geldchip-problem bei 
uc', 'sd_itmc kurse anmeldebestaetigung', 'sd_citavi bestellung', 'sd_telefonantrag_änderung_neuantrag', 'sd_sap_konteneinsicht_ workaround', 'sd_kontakt_asknet', 'sd_plotauftrag_zv', 'sd_webmailer_thread-anzeige', 'sd_gastaufenthalter'] + +for l in used_list: + if l not in labellist: + print(l) + +print(len(used_list)) +print(len(labellist)) + +# load corpus +corpus_de_path = FILEPATH + config.get("de_corpus", "path") +preCorpus_name = "de" + "_pre_ticket" +corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path) +logprint("Corpus loaded: {0}".format(corpus.lang)) +# +#todo randomize + +split_index = int(float(len(corpus)) * 0.8) +corpus_train = corpus[0:split_index] +corpus_test = corpus[split_index:len(corpus)-1] + + + + + +# lda bild abdunkeln +# auschnitte + + +import numpy as np +matplotlib.use('Agg') + +import matplotlib.pyplot as plt + + + + + + + + + +end = time.time() +print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start)) + +""" +vllt kategorien in unterkategorien aufteilen + +allg: +utf-korregieren, bei sonderzeichen wörter trennen +namen raus, addressen nach grüßen + +emails, urls, nummern raus +vllt sogar alles, was ebend jenes enthält (oder auf .toplvldomain bzw. sonderzeichen enthält oder alles was ein @ enthält + +sinnvoller wörter von müll trennen: 8203;verfügung + +abkürzungen raus: m.a, o.ä. + + +sinnlose bsp: nr54065467 455a33c5 tvt?= ------problem-------- + +"\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start)) +""" """ # load config @@ -81,17 +184,6 @@ print(len(set(bla))) print() """ - - -x = [[1,2,3],[3,4,5]] - -arr = np.array(x) - -print(arr) - - - - """ #save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name) @@ -104,6 +196,7 @@ list = [(key,value) for key,value in dict.items()] list.sort(key=lambda tup : tup[1]) """ + """ from spacy.tokens.doc import Doc as SpacyDoc @@ -137,7 +230,6 @@ textacy.fileio.write_file_lines(readCorpus(filepath),"/home/jannis.grundmann/Pyc # printRandomDoc(raw_corpus) - """ spacy_doc = PARSER("test") save_obj(spacy_doc, "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl") @@ -155,6 +247,7 @@ laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_h with open(LLDA_filepath, 'w') as file: file.write(json.dumps(laveldict)) """ + """ def load_corpus(corpus_path, corpus_name, lang="de"): from pathlib import Path @@ -609,25 +702,3 @@ textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt") """ -end = time.time() -print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start)) - -""" -vllt kategorien in unterkategorien aufteilen - -allg: -utf-korregieren, bei sonderzeichen wörter trennen -namen raus, addressen nach grüßen - -emails, urls, nummern raus -vllt sogar alles, was ebend jenes enthält (oder auf .toplvldomain bzw. sonderzeichen enthält oder alles was ein @ enthält - -sinnvoller wörter von müll trennen: 8203;verfügung - -abkürzungen raus: m.a, o.ä. 
- - -sinnlose bsp: nr54065467 455a33c5 tvt?= ------problem-------- - -"\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start)) -""" diff --git a/topicModeling.py b/topicModeling.py index e0f53c6..a8fd351 100644 --- a/topicModeling.py +++ b/topicModeling.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- from datetime import datetime - +import draw +import draw1 import time import numpy as np @@ -30,38 +31,9 @@ with open(config_ini) as f: config.read_file(f) -def label2ID(label, labeldict): - return labeldict.get(label, len(labeldict)) - - -def generate_lablelID_lines(textacyCorpus, labeldict): - for doc in textacyCorpus: - # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi - yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text - -""" -def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True): - logprint(str("ngrams: {0}".format(ngrams))) - logprint(str("min_df: {0}".format(min_df))) - logprint(str("max_df: {0}".format(max_df))) - logprint(str("named_entities: {0}".format(named_entities))) - - # printlog("vectorize corpi...") - vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) - - terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus) - doc_term_matrix = vectorizer.fit_transform(terms_list) - id2term = vectorizer.__getattribute__("id_to_term") - - for t in terms_list: - print(t) - logprint("doc_term_matrix: {0}".format(doc_term_matrix)) - logprint("id2term: {0}".format(id2term)) -""" - def textacyTopicModeling(corpus, n_topics = 15, top_topic_words = 7, top_document_labels_per_topic = 5, - ngrams = 1, min_df=1, max_df=1.0, + ngrams = 1, min_df=1, max_df=0.9, topicModel='lda'): @@ -73,10 +45,7 @@ def textacyTopicModeling(corpus, - - logprint( - "############### Topic Modeling {0} ###########################".format( - topicModel)) + logprint("#### Topic Modeling {0}".format(topicModel)) logprint(str("ngrams: {0}".format(ngrams))) logprint(str("min_df: {0}".format(min_df))) logprint(str("max_df: {0}".format(max_df))) @@ -93,7 +62,7 @@ def textacyTopicModeling(corpus, - #################### vectorize corpi #################### + ###### vectorize corpi vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) @@ -101,14 +70,9 @@ def textacyTopicModeling(corpus, doc_term_matrix = vectorizer.fit_transform(terms_list) id2term = vectorizer.__getattribute__("id_to_term") - # printlog("terms_list: {0}".format(list(terms_list))) - # printlog("doc_term_matrix: {0}".format(doc_term_matrix)) - - - - ##################### Initialize and train a topic model ############################################## + ####### Initialize and train a topic model model = textacy.tm.TopicModel(topicModel, n_topics=n_topics) @@ -118,7 +82,7 @@ def textacyTopicModeling(corpus, for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words): - logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms))) + logprint('{0}: {1}'.format(topic_idx, " ".join(top_terms))) for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic): logprint(topic_idx) @@ -132,66 +96,44 @@ def textacyTopicModeling(corpus, grams_label = "uni" if ngrams == 1 else "bi" - model.termite_plot(doc_term_matrix, id2term, + draw1.termite_plot(model,doc_term_matrix, id2term, n_terms=n_terms, sort_terms_by=sort_terms_by, - rank_terms_by=rank_terms_by+'_weight', - - - save= FILEPATH + 
"results/{}_{}_{}_{}_{}_{}.png".format(grams_label,topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by)) - + rank_terms_by=rank_terms_by + '_weight', + save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label, topicModel, n_topics, + n_terms, sort_terms_by, rank_terms_by)) end = time.time() logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel)) +def jgibbsLLDA(labeldict,line_gen,path2save_results, top_topic_words=7): -def jgibbsLLDA(corpus, path2save_results, top_topic_words=7): - start = time.time() - - + labeldict_rev = {v: k for k, v in labeldict.items()} jgibbsLLDA_root = FILEPATH + "java_LabledLDA/" - LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) - # build dictionary of ticketcategories - labelist = [] - for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): - labelist.append(texdoc.metadata["categoryName"]) - - - labeldict = {k: v for v, k in enumerate(labelist)} - reverse_labeldict = {v: k for k, v in labeldict.items()} - - #and save - labeldict_path = FILEPATH + "results/labeldict.txt" - with open(labeldict_path, 'w') as file: - file.write(json.dumps(labeldict)) - - - n_topics = len(labeldict) #+1 #default-topic + textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath) - # create file with label_IDs (input for llda) - textacy.fileio.write_file_lines(generate_lablelID_lines(corpus, labeldict), filepath=LLDA_filepath) # wait for file to exist while not os.path.exists(LLDA_filepath): time.sleep(1) - logprint("") - logprint("start LLDA:") # run JGibbsLLDA file + n_topics = len(labeldict) #+1 #default-topic + FNULL = open(os.devnull, 'w') # supress output cmd_jgibbs_java = ["java", "-cp", "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format( @@ -218,7 +160,7 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7): if len(findall) != 0: try: index = int(findall[0].split()[1]) - result.append("Topic {} {}:".format(index, reverse_labeldict[index])) + result.append("Topic {} {}:".format(index, labeldict_rev[index])) except: result.append(line) @@ -243,37 +185,15 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7): index = int(findall[0].split()[1]) - res_dict = {index : str(reverse_labeldict[index]) } + res_dict = {index : str(labeldict_rev[index]) } else: splitted = line.split() res_dict[splitted[0]] = float(splitted[1]) - ### print terms that are topics - for s in list(res_dict.values()): - if isinstance(s,str) and splitted[0] in s: - vals = list(res_dict.values()) - keys = list(res_dict.keys()) - for v in vals: - if not isinstance(v,float): - print("{}".format(v)) - print("{}".format(splitted[0])) - count +=1 - print() - ### - - if len(res_dict) != 0: results.append(res_dict) # letzes an die liste ran - print(count) - print(float(count)/float(len(labelist))) - - - - - # {0: 'betrieb', 'service': 0.24162679425837305, 'support': 0.24162679425837305, 'browser': 0.24162679425837305, 'unicard': 0.24162679425837305, 'telefon': 0.0023923444976076593} - # every term in the resulsts to a list @@ -286,20 +206,12 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7): term2id = {t:i for i,t in enumerate(terms)} #and to dict ################# termite plot ##################################################################### - - #term_topic_weights.shape = (len(term_ids),len(topic_ids) - - - #topic_labels = tuple(labelist) - - topic_labels = list(range(len(labelist))) + topic_labels = 
list(range(len(labeldict))) term_labels = list(range(len(term2id))) #tuple([key for key in term2id.keys()]) term_topic_weights = np.zeros((len(term2id),len(topic_labels))) - - for i,res in enumerate(results): for key,value in res.items(): @@ -308,77 +220,223 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7): term_topic_weights[term2id[key]][i] = value term_labels[term2id[key]] = key else: - topic_labels[i] = reverse_labeldict[key] + topic_labels[i] = labeldict_rev[key] - - viz.draw_termite_plot( + draw.draw_termite( term_topic_weights, topic_labels, term_labels, save=path2save_results+".png") + draw.draw_termite( + term_topic_weights, topic_labels, term_labels, save=path2save_results+"_spaced.png",pow_x=0.78,pow_y=0.87) + + # save labeldict + labeldict_path = path2save_results + "_labeldict.json" + with open(labeldict_path, 'w') as file: + file.write(json.dumps(labeldict)) + + +def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7): + + start = time.time() + logprint("") + logprint("start Category-LLDA:") + + # build dictionary of ticketcategories + labelist = [] + for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): + labelist.append(texdoc.metadata["categoryName"]) + + labelist = list(set(labelist)) + print("len(labelist): {}".format(len(labelist))) + + labeldict = {k: v for v, k in enumerate(labelist)} + + def gen_cat_lines(textacyCorpus, labeldict): + """ generates [topic1, topic2....] tok1 tok2 tok3 out of corpi""" + + for doc in textacyCorpus: + yield "[" + str(labeldict.get(doc.metadata["categoryName"], len(labeldict))) + "] " + doc.text + + + line_gen = gen_cat_lines(corpus, labeldict) + + + path2save_results = path2save_results + "_kb_cat_llda_{}".format("top" + str(top_topic_words)) + + + jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words) + end = time.time() - logprint("Time Elapsed Topic Modeling JGibbsLLDA:{0} min\n".format((end - start) / 60)) + logprint("\n\n\nTime Elapsed Category-LLDA :{0} min\n\n".format((end - start) / 60)) + + +def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=False): + """ticket_ID -> KB_ID -> keywords / subject -> llda""" + + start = time.time() + logprint("") + logprint("start {}-LLDA:".format("Keyword" if kb_keywords else "Subject")) + + # ticket2kb_dict + + kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";") + + ticket2kb_dict = {} + + for line in kb2ticket_gen: + + ticket_id = line[0] + kb_id = line[1] + + ticket2kb_dict[ticket_id] = kb_id + # {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...} + + kb_entries_used = len(list(set(ticket2kb_dict.values()))) + print("kb_entries_used: {}".format(kb_entries_used)) + + + + # kb2keywords_dict + + kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";") + + next(kb2keywords_gen,None) #skip first line("ArticleID";"Subject";"Keywords";...) 
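+    # Note: each row of KB_2017-09-13.csv is ("ArticleID";"Subject";"Keywords";"Solution";
+    # "SolutionText";"CreatedOn"); the loop below keys the dict by the KB article id and
+    # collects either the normalized, comma-split keywords or the subject, ending up with
+    # the shape shown in the example further down, e.g.
+    #   {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...}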
+ + kb2keywords_dict = {} + + for line in kb2keywords_gen: + + kb_id = line[0] + + subject = line[1] + + keywords = line[2] + keywords_list = [normalize(x) for x in str(keywords).split(",")] + + if kb_id not in kb2keywords_dict.keys(): + kb2keywords_dict[kb_id] = [] + + if kb_keywords: + for item in keywords_list: + if item != "": + kb2keywords_dict[kb_id].append(item) + + else: + kb2keywords_dict[kb_id].append(subject) + + + #remove all empty items + kb2keywords_dict = { k : v for k,v in kb2keywords_dict.items() if len(v) != 0} + # {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...} + #keywords2kb_dict -def main(use_cleaned=False, algorithm="llda"): + keywords2kb_dict = {} + + for kb_id, lst in kb2keywords_dict.items(): + for l in lst: + if l not in keywords2kb_dict.keys(): + keywords2kb_dict[l] = [kb_id] + else: + keywords2kb_dict[l].append(kb_id) + # {'unicard namensaenderung': ['KBA10276'], 'vpn': ['KBA10063'], 'outlook_exchange': ['KBA10181'], ...} - # idee http://bigartm.org/ - # idee http://wiki.languagetool.org/tips-and-tricks - # idee https://en.wikipedia.org/wiki/Noisy_text_analytics - # idee https://gate.ac.uk/family/ + # Look for actually used keywords + used_keywords = [] + + for doc in corpus: + + ticket_number = doc.metadata["TicketNumber"] + + kb_id = ticket2kb_dict.get(ticket_number, None) + + keywords = kb2keywords_dict.get(kb_id, None) + + if keywords and kb_id: + used_keywords.append(list(map(normalize,keywords))) + + + + labelist = [item for sublist in used_keywords for item in sublist] #flatten list + labelist = list(set(labelist)) + print("len(labelist): {}".format(len(labelist))) + + + + labeldict = {k: v for v, k in enumerate(labelist)} + + + + def gen_KB_lines(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict): + for doc in corpus: + + ticket_number = doc.metadata["TicketNumber"] + + kb_number = ticket2kb_dict.get(ticket_number, None) + + keywords = kb2keywords_dict.get(kb_number, None) + + if keywords: + + label = "" + for kw in keywords: + label = label + str(labeldict.get(normalize(str(kw)), len(labeldict))) + " " + + yield "[ " + label + "] " + doc.text + + + line_gen = gen_KB_lines(corpus, labeldict, ticket2kb_dict, kb2keywords_dict) + + + path2save_results = path2save_results + "_kb_{}_llda_{}".format("keys" if kb_keywords else "subs", + "top" + str(top_topic_words)) + + + jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words) + + end = time.time() + logprint("\n\n\nTime Elapsed {1}-LLDA :{0} min\n\n".format((end - start) / 60,"Keyword" if kb_keywords else "Subject")) + + +def main( algorithm="llda"): + logprint("Topic Modeling: {0}".format(datetime.now())) + corpus_de_path = FILEPATH + config.get("de_corpus", "path") corpus_en_path = FILEPATH + config.get("en_corpus", "path") - if use_cleaned: - preCorpus_name = "de" + "_clean_ticket" - resultspath = FILEPATH + "results/clean" - else: - preCorpus_name = "de" + "_pre_ticket" - resultspath = FILEPATH + "results/pre" + preCorpus_name = "de" + "_pre_ticket" + resultspath = FILEPATH + "results/pre" - - # load cleand corpus + # load corpus de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path) logprint("Corpus loaded: {0}".format(de_corpus.lang)) - # todo llda topics zusammenfassen - # idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics - # frage wieviele tickets pro topic? 
- - """ - ngrams = 1 - min_df = 1 - max_df = 1.0 - weighting = 'tf' - # weighting ='tfidf' - named_entities = False - - - printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting) - printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting) - printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting) - - printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting) - printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting) - printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting) - """ - if algorithm == "llda": + top_topic_words = 5 - path2save_results = resultspath + "_{}_{}".format(algorithm,"top"+str(top_topic_words)) - jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words) + jgibbsLLDA_category(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words) + + + kb_keywords = False + jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords) + + kb_keywords = True + jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords) + + """ top_topic_words = 10 @@ -399,10 +457,7 @@ def main(use_cleaned=False, algorithm="llda"): textacyTopicModeling(ngrams = 1, - min_df = 1, - max_df = 0.9, topicModel = algorithm, - n_topics =15, corpus=de_corpus) """ textacyTopicModeling(ngrams=1, @@ -411,7 +466,7 @@ def main(use_cleaned=False, algorithm="llda"): topicModel=algorithm, n_topics=20, corpus=de_corpus) - + textacyTopicModeling(ngrams=1, min_df=1, max_df=0.9, @@ -430,10 +485,7 @@ def main(use_cleaned=False, algorithm="llda"): textacyTopicModeling(ngrams=(1, 2), - min_df=1, - max_df=0.9, topicModel=algorithm, - n_topics=15, corpus=de_corpus) """ textacyTopicModeling(ngrams = (1,2), @@ -442,7 +494,7 @@ def main(use_cleaned=False, algorithm="llda"): topicModel = algorithm, n_topics =20, corpus=de_corpus) - + textacyTopicModeling(ngrams = (1,2), min_df = 1, max_df = 0.9,
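For reference, both gen_cat_lines() and gen_KB_lines() feed JGibbsLLDA plain text
lines of the form "[<label_id> ...] <document text>", one line per ticket, which
jgibbsLLDA() writes to java_LabledLDA/models/tickets/tickets.gz before calling the
Java process. A minimal sketch of that format (labels taken from the lists in
test.py; the ids, ticket texts and the helper below are made up for illustration):

    labeldict = {'sd_citavi': 0, 'wlan': 1, 'windows 10': 2}

    def gen_lines(docs_with_labels, labeldict):
        # same "[<ids>] <text>" shape as gen_cat_lines()/gen_KB_lines() above
        for labels, text in docs_with_labels:
            ids = " ".join(str(labeldict.get(l, len(labeldict))) for l in labels)
            yield "[ " + ids + " ] " + text

    for line in gen_lines([(['wlan'], "eduroam verbindung bricht ab"),
                           (['sd_citavi', 'windows 10'], "citavi startet unter windows 10 nicht")],
                          labeldict):
        print(line)
    # [ 1 ] eduroam verbindung bricht ab
    # [ 0 2 ] citavi startet unter windows 10 nicht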