refactored

This commit is contained in:
parent 0a6a68b8aa
commit 4a3683635e

config.ini (24 changed lines)
@@ -62,8 +62,6 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI

[preprocessing]

#ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC

autocorrect = false
#true

@@ -72,26 +70,4 @@ custom_words=aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderu

[topicmodeling]

ngrams=(1,2)

min_df=0
max_df=1.0
no_below=20
no_above=0.5

topicModel=lda

top_topic_words=5

top_document_labels_per_topic=2
New file: draw.py (+165 lines; imported as draw by topicModeling.py below)

@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
try:
    import matplotlib.pyplot as plt
except ImportError:
    pass


RC_PARAMS = {'axes.axisbelow': True,
             'axes.edgecolor': '.8',
             'axes.facecolor': 'white',
             'axes.grid': False,
             'axes.labelcolor': '.15',
             'axes.linewidth': 1.0,
             'axes.labelpad': 10.0,
             'figure.facecolor': 'white',
             'font.family': ['sans-serif'],
             'font.sans-serif': ['Arial', 'Liberation Sans', 'sans-serif'],
             'grid.color': '.8', 'grid.linestyle': '-',
             'image.cmap': 'Greys',
             'legend.frameon': False,
             'legend.numpoints': 1, 'legend.scatterpoints': 1,
             'lines.solid_capstyle': 'round',
             'text.color': '1.0',
             'xtick.color': '1.0', 'xtick.direction': 'out',
             'xtick.major.size': 0.0, 'xtick.minor.size': 0.0,
             'xtick.major.pad': 5,
             'ytick.color': '1.0', 'ytick.direction': 'out',
             'ytick.major.size': 0.0, 'ytick.minor.size': 0.0,
             'axes.ymargin': 0.9,
             'ytick.major.pad': 5}

COLOR_PAIRS = (((0.65098041296005249, 0.80784314870834351, 0.89019608497619629),
                (0.12572087695201239, 0.47323337360924367, 0.707327968232772)),
               ((0.68899655751153521, 0.8681737867056154, 0.54376011946622071),
                (0.21171857311445125, 0.63326415104024547, 0.1812226118410335)),
               ((0.98320646005518297, 0.5980161709820524, 0.59423301088459368),
                (0.89059593116535862, 0.10449827132271793, 0.11108035462744099)),
               ((0.99175701702342312, 0.74648213716698619, 0.43401768935077328),
                (0.99990772780250103, 0.50099192647372981, 0.0051211073118098693)),
               ((0.78329874347238004, 0.68724338552531095, 0.8336793640080622),
                (0.42485198495434734, 0.2511495584950722, 0.60386007743723258)),
               ((0.99760092286502611, 0.99489427150464516, 0.5965244373854468),
                (0.69411766529083252, 0.3490196168422699, 0.15686275064945221)))


def draw_termite(values_mat, col_labels, row_labels,
                 highlight_cols=None, highlight_colors=None,
                 save=False, pow_x=0.66, pow_y=0.8):
    """
    Make a "termite" plot, typically used for assessing topic models with a tabular
    layout that promotes comparison of terms both within and across topics.

    Args:
        values_mat (``np.ndarray`` or matrix): matrix of values with shape
            (# row labels, # col labels) used to size the dots on the grid
        col_labels (seq[str]): labels used to identify x-axis ticks on the grid
        row_labels (seq[str]): labels used to identify y-axis ticks on the grid
        highlight_cols (int or seq[int], optional): indices for columns
            to visually highlight in the plot with contrasting colors
        highlight_colors (tuple of 2-tuples): each 2-tuple corresponds to a pair
            of (light/dark) matplotlib-friendly colors used to highlight a single
            column; if not specified (default), a good set of 6 pairs is used
        save (str, optional): give the full /path/to/fname on disk to save figure

    Returns:
        ``matplotlib.axes.Axes.axis``: axis on which termite plot is plotted

    Raises:
        ValueError: if more columns are selected for highlighting than colors,
            or if any of the inputs' dimensions don't match

    References:
        .. Chuang, Jason, Christopher D. Manning, and Jeffrey Heer. "Termite:
           Visualization techniques for assessing textual topic models."
           Proceedings of the International Working Conference on Advanced
           Visual Interfaces. ACM, 2012.

    .. seealso:: :func:`TopicModel.termite_plot <textacy.tm.TopicModel.termite_plot>`
    """
    try:
        plt
    except NameError:
        raise ImportError(
            'matplotlib is not installed, so textacy.viz won\'t work; install it \
individually, or along with textacy via `pip install textacy[viz]`')
    n_rows, n_cols = values_mat.shape
    max_val = np.max(values_mat)

    if n_rows != len(row_labels):
        msg = "values_mat and row_labels dimensions don't match: {} vs. {}".format(
            n_rows, len(row_labels))
        raise ValueError(msg)
    if n_cols != len(col_labels):
        msg = "values_mat and col_labels dimensions don't match: {} vs. {}".format(
            n_cols, len(col_labels))
        raise ValueError(msg)

    if highlight_colors is None:
        highlight_colors = COLOR_PAIRS
    if highlight_cols is not None:
        if isinstance(highlight_cols, int):
            highlight_cols = (highlight_cols,)
        elif len(highlight_cols) > len(highlight_colors):
            msg = 'no more than {} columns may be highlighted at once'.format(
                len(highlight_colors))
            raise ValueError(msg)
        highlight_colors = {hc: COLOR_PAIRS[i]
                            for i, hc in enumerate(highlight_cols)}

    with plt.rc_context(RC_PARAMS):
        fig, ax = plt.subplots(figsize=(pow(n_cols, pow_y), pow(n_rows, pow_x)))  # figure size set here

        _ = ax.set_yticks(range(n_rows))
        yticklabels = ax.set_yticklabels(row_labels,
                                         fontsize=14, color='gray')
        if highlight_cols is not None:
            for i, ticklabel in enumerate(yticklabels):
                max_tick_val = max(values_mat[i, hc] for hc in highlight_cols)
                for hc in highlight_cols:
                    if max_tick_val > 0 and values_mat[i, hc] == max_tick_val:
                        ticklabel.set_color(highlight_colors[hc][1])

        ax.get_xaxis().set_ticks_position('top')
        _ = ax.set_xticks(range(n_cols))
        xticklabels = ax.set_xticklabels(col_labels,
                                         fontsize=14, color='gray',
                                         rotation=30, ha='left')
        if highlight_cols is not None:
            gridlines = ax.get_xgridlines()
            for i, ticklabel in enumerate(xticklabels):
                if i in highlight_cols:
                    ticklabel.set_color(highlight_colors[i][1])
                    gridlines[i].set_color(highlight_colors[i][0])
                    gridlines[i].set_alpha(0.5)

        for col_ind in range(n_cols):
            if highlight_cols is not None and col_ind in highlight_cols:
                ax.scatter([col_ind for _ in range(n_rows)],
                           [i for i in range(n_rows)],
                           s=600 * (values_mat[:, col_ind] / max_val),
                           alpha=0.5, linewidth=1,
                           color=highlight_colors[col_ind][0],
                           edgecolor=highlight_colors[col_ind][1])
            else:
                ax.scatter([col_ind for _ in range(n_rows)],
                           [i for i in range(n_rows)],
                           s=600 * (values_mat[:, col_ind] / max_val),
                           alpha=0.5, linewidth=1,
                           color='black', edgecolor='gray')

        _ = ax.set_xlim(left=-1, right=n_cols)
        _ = ax.set_ylim(bottom=-1, top=n_rows)

        ax.invert_yaxis()  # otherwise, values/labels go from bottom to top
        #plt.ylim(ymax=5)

        if save:
            fig.savefig(save, bbox_inches='tight', dpi=100)

    return ax
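For orientation, a minimal sketch of how the new draw_termite helper can be called; the matrix and labels below are invented for illustration and are not part of the commit:

import numpy as np
import draw

# 4 terms (rows) x 3 topics (columns); dot area scales with the cell value
values = np.array([[0.1, 0.7, 0.2],
                   [0.5, 0.1, 0.9],
                   [0.3, 0.3, 0.3],
                   [0.8, 0.0, 0.4]])
ax = draw.draw_termite(values,
                       col_labels=['topic 0', 'topic 1', 'topic 2'],
                       row_labels=['drucker', 'unicard', 'vpn', 'wlan'],
                       highlight_cols=(1,),
                       save='termite_example.png')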
New file: draw1.py (+105 lines; imported as draw1 by topicModeling.py below)

@@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals

import logging

import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.externals import joblib

from textacy import viz
import draw

LOGGER = logging.getLogger(__name__)


def termite_plot(model, doc_term_matrix, id2term,
                 topics=-1, sort_topics_by='index', highlight_topics=None,
                 n_terms=25, rank_terms_by='topic_weight', sort_terms_by='seriation',
                 save=False, pow_x=0.66, pow_y=0.8):

    if highlight_topics is not None:
        if isinstance(highlight_topics, int):
            highlight_topics = (highlight_topics,)
        elif len(highlight_topics) > 6:
            raise ValueError('no more than 6 topics may be highlighted at once')

    # get topic indices
    if topics == -1:
        topic_inds = tuple(range(model.n_topics))
    elif isinstance(topics, int):
        topic_inds = (topics,)
    else:
        topic_inds = tuple(topics)

    # get topic indices in sorted order
    if sort_topics_by == 'index':
        topic_inds = sorted(topic_inds)
    elif sort_topics_by == 'weight':
        topic_inds = tuple(topic_ind for topic_ind
                           in np.argsort(model.topic_weights(model.transform(doc_term_matrix)))[::-1]
                           if topic_ind in topic_inds)
    else:
        msg = 'invalid sort_topics_by value; must be in {}'.format(
            {'index', 'weight'})
        raise ValueError(msg)

    # get column index of any topics to highlight in termite plot
    if highlight_topics is not None:
        highlight_cols = tuple(i for i in range(len(topic_inds))
                               if topic_inds[i] in highlight_topics)
    else:
        highlight_cols = None

    # get top term indices
    if rank_terms_by == 'corpus_weight':
        term_inds = np.argsort(np.ravel(doc_term_matrix.sum(axis=0)))[:-n_terms - 1:-1]
    elif rank_terms_by == 'topic_weight':
        term_inds = np.argsort(model.model.components_.sum(axis=0))[:-n_terms - 1:-1]
    else:
        msg = 'invalid rank_terms_by value; must be in {}'.format(
            {'corpus_weight', 'topic_weight'})
        raise ValueError(msg)

    # get top term indices in sorted order
    if sort_terms_by == 'weight':
        pass
    elif sort_terms_by == 'index':
        term_inds = sorted(term_inds)
    elif sort_terms_by == 'alphabetical':
        term_inds = sorted(term_inds, key=lambda x: id2term[x])
    elif sort_terms_by == 'seriation':
        topic_term_weights_mat = np.array(
            np.array([model.model.components_[topic_ind][term_inds]
                      for topic_ind in topic_inds])).T
        # calculate similarity matrix
        topic_term_weights_sim = np.dot(topic_term_weights_mat, topic_term_weights_mat.T)
        # subtract minimum of sim mat in order to keep sim mat nonnegative
        topic_term_weights_sim = topic_term_weights_sim - topic_term_weights_sim.min()
        # compute Laplacian matrix and its 2nd eigenvector
        L = np.diag(sum(topic_term_weights_sim, 1)) - topic_term_weights_sim
        D, V = np.linalg.eigh(L)
        D = D[np.argsort(D)]
        V = V[:, np.argsort(D)]
        fiedler = V[:, 1]
        # get permutation corresponding to sorting the 2nd eigenvector
        term_inds = [term_inds[i] for i in np.argsort(fiedler)]
    else:
        msg = 'invalid sort_terms_by value; must be in {}'.format(
            {'weight', 'index', 'alphabetical', 'seriation'})
        raise ValueError(msg)

    # get topic and term labels
    topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)
    term_labels = tuple(id2term[term_ind] for term_ind in term_inds)

    # get topic-term weights to size dots
    term_topic_weights = np.array([model.model.components_[topic_ind][term_inds]
                                   for topic_ind in topic_inds]).T

    return draw.draw_termite(
        term_topic_weights, topic_labels, term_labels,
        highlight_cols=highlight_cols, save=save, pow_x=pow_x, pow_y=pow_y)
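A sketch of how this wrapper is meant to be driven, mirroring the call topicModeling.py makes further down; corpus stands in for an already-loaded textacy corpus, and the vectorizer settings here are illustrative, not taken from the commit:

import textacy
from textacy import Vectorizer
import draw1

# build a document-term matrix (placeholder settings)
terms_list = (doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
              for doc in corpus)
vectorizer = Vectorizer(weighting='tf', min_df=1, max_df=0.9)
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.id_to_term

# train any textacy topic model, then hand it to the new plotting wrapper
model = textacy.tm.TopicModel('lda', n_topics=15)
model.fit(doc_term_matrix)
draw1.termite_plot(model, doc_term_matrix, id2term,
                   n_terms=25, sort_terms_by='seriation',
                   save='lda_termite.png')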
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
main.py (31 changed lines)
@@ -14,8 +14,27 @@ from miscellaneous import *

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/printout_main.log &"
start = time.time()

# idea: http://bigartm.org/
# idea: http://wiki.languagetool.org/tips-and-tricks
# idea: https://en.wikipedia.org/wiki/Noisy_text_analytics
# idea: https://gate.ac.uk/family/

# todo: merge llda topics
# idea: train lda so that the term <-> topic assignment does not become too weak, while keeping as many topics as possible
# question: how many tickets per topic?
# todo: test the models
# question: which employees worked on which topics? idea: replace topics with employee numbers
# question: if 155 different kb entries were used, why are there only 139 topics?
# idea: check words against a semantic network first: if too far away, ignore them

# todo FRIDAY: drawing, refactoring

"""
init.main()
logprint("")

@@ -30,24 +49,26 @@ logprint("")

"""
#topicModeling.main(use_cleaned=False,algorithm="lsa")
#topicModeling.main(algorithm="lsa")
logprint("")

#topicModeling.main(use_cleaned=False,algorithm="nmf")
#topicModeling.main(algorithm="nmf")
logprint("")

#topicModeling.main(use_cleaned=False,algorithm="lda")
#topicModeling.main(algorithm="lda")
logprint("")

topicModeling.main(use_cleaned=False,algorithm="llda")
topicModeling.main(algorithm="llda")
logprint("")
"""

end = time.time()
logprint("Total Time Elapsed: {0} min".format((end - start) / 60))

#800*400
@@ -121,6 +121,18 @@ def list_from_files(*paths):

    return list(map(textacy.preprocess.normalize_whitespace, liste))

def debug():
    pass

def normalize(string):
    # replaceRockDots
    string = re.sub(r'[ß]', "ss", string.lower())
    string = re.sub(r'[ö]', "oe", string)
    string = re.sub(r'[ü]', "ue", string)
    string = re.sub(r'[ä]', "ae", string)
    string = textacy.preprocess.normalize_whitespace(string)
    return string

def deprecated(func):
    """This is a decorator which can be used to mark functions

@@ -364,7 +364,7 @@ def main():

        removePOS(["PUNCT", "SPACE", "NUM"]),
        removeWords(DE_STOP_WORDS + custom_words),
        removeWords(DE_STOP_WORDS + custom_words + VORNAMEN),
        #removeWords(DE_STOP_WORDS),
        remove_long_words(),
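The new normalize helper lower-cases its input, transliterates the German umlauts and ß, and collapses whitespace. A quick illustration (outputs shown as comments; not part of the commit):

print(normalize("Störung  im WLAN-Zugang (Gebäude C)"))  # stoerung im wlan-zugang (gebaeude c)
print(normalize("Passwort zurücksetzen"))                # passwort zuruecksetzen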
test.py (181 changed lines)
@@ -1,31 +1,134 @@
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('Agg')
# -*- coding: utf-8 -*-

import re
import time
import json

# import spacy
# import textacy
from functools import reduce

import textacy

start = time.time()

import enchant

from datetime import datetime
import os
import xml.etree.ElementTree as ET

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
from miscellaneous import *
import numpy as np

start = time.time()
import json
import os.path
import subprocess
from textacy import Vectorizer, viz

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"
from miscellaneous import *
import textacy
from scipy import *
import os
import json
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
import draw


# kb2keywords_dict

kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv",
                                          delimiter=";")
next(kb2keywords_gen, None)  # skip first
used_kb=[]
for kb in kb2keywords_gen:
    used_kb.append(kb[1])
print("used_kb: {}".format(len(list(set(used_kb)))))

# out of 260 kb entries in total, 155 were used

#"ArticleID";"Subject";"Keywords";"Solution";"SolutionText";"CreatedOn"
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", #
                                          delimiter=";")
next(kb2keywords_gen, None)  # skip first
cats=[]
subjects=[]
keywords=[]
for kb in kb2keywords_gen:
    cats.append(kb[0])
    subjects.append(kb[1])
    keywords.append(kb[2].split(","))

cats_lst = list(set(cats))
print("cats: {}".format(len(cats_lst)))
print(cats_lst[0:20])

print(len(subjects))
subj_lst = list(set(subjects))  # question: does every kb entry really have a different subject?
print("subjects: {}".format(len(subj_lst)))
print(subj_lst[0:20])

keywords = [item for sublist in keywords for item in sublist]

kys_lst = list(set(keywords))
print("keywords: {}".format(len(kys_lst)))
print(kys_lst[0:20])

used_list = ['bd_unicard_nicht_eingeschrieben', 'sd_vpn_temporaerer fehler ub', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_beschaedigte unicard', 'sd_boss_notenverbuchung', 'd.3 client installation', 'sd_keine rueckantwort kunde', 'sd_asknet_und_dreamspark', 'sd_beantragung_unicard', 'sd_gastaufenthalter', 'sd_internationaloffice', 'sd_outlook anmeldung gestoert', 'unicard_restbetrag_auszahlung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'ub_unicard_unicard mit vollmacht abholen', 'sd_namensaenderung mitarbeiter', 'sd_itmc kurse anmeldebestaetigung', 'sd_zugriff_onlinedienste_rueckmeldung', 'benutzer zum redmine hinzufuegen', 'sd_unicard_gesperrte unicard entsperre', 'lsf freischaltung als mitarbeiter/in', 'sd_mail_als_anhang', 'sd-e-mail_adresse_funktional_beantragen', 'sd_goeke drucker', 'sd_unimail imap_pop3', 'sd_origin_workaround', 'sd_matlab lizenzdatei pc-pools', 'sd_outlook kontakte automatische aktualisierung', 'sd_sap konteneinsicht antrag', 'ohne betreff', 'sd_telefonantrag_änderung_neuantrag', 'sd_sophos download', 'sd_geraeteausleihe', 'studierendenausweis', 'sd_citavi', 'sd_laufzeit unimail account', 'sd_login_unibib ub-it', 'sd_tu_app_keine internetverbindung', 'sd_unicard_max_laufzeit', 'ub_unicard_zusendung der karte moeglich?', 'sd_telefonbuch-eintrag_änderung', 'ub_drucker kopierer', 'windows 10', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'sd_tu-app feedback standard', 'sd_spam e-mail bekannt meldung', 'sd_spss_online_bestellung', 'sd_apple-on-campus', 'sd_studisek', 'sd_office 365 plus support', 'sd_sap_initialkennwort_englisch', 'sd_office365_asknet', 're: elektroarbeiten fuer leitsystem 2. und 3. obergeschoss', 'sd_login tu portale', 'ungueltiges ticket siehe journal', 'sd_sap_freischaltung ohne passwortaenderung', 'bd_unicard_geldkarte_laden', 'sd_verlust/antrag unicard', 'sd_unimail zu exchange', 'citavi_lizenzschluessel_nicht bekommen', 'sd_plotauftrag_zv', 'sd_citavi_support', 'sd_antworten_korrekt', 'sd_wlan-gastkonto', 'sd_antwort_phishingmail', 'bd_unicard_freigabe_beantragung', 'sd_origin nur noch eine seriennummer', 'cm_asiexception', 'sd_login_tu_portale', 'sd_webmailer_thread-anzeige', 'apps_dms-passwort d.3', 'apps_redmine_repository', 'sd_uniaccount_passwortaenderung', 'sd_phishing', 'sd_sap_firefox_esr', 'vpn verbindung fuer unitymedia kunden', 'sd_kurs-angebote anmeldung', 'sd_unicard fehlerhafte geldbuchung', 'sd_uniaccount_ehemalige_passwortaenderung', 'sd_sap_dienstreise', 'cm_lsf-boss_freischaltung', 'wlan', 'uni card', 'sd_webmailer einrichtung weiterleitung', 'spam ohne tu bezug', 'sd_outlook_in_exchange_einbinden', 'sd_wlan_beratung', 'sd_uniaccount_dauer freischaltung', 'sd_sap_konteneinsicht_ workaround', 'sd_vpn anleitungen', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_unicard_abholung', 'sd_vpn_probleme_mit_unitymedia', 'sd_diensthandy beschaffung', 'sd_unicard_defekt', 'sd_freischaltung uniaccount verzoegert', 'sd_kurs-angebote itmc', 'bd_goeke_allgemein', 'sd_uniaccount_ehemalige_studierende', 'sd_stellenausschreibung schwarzes brett', 'freischaltung uniaccount', 'sd_unicard_workaround_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'ub_geldchip-problem bei uc', 'sd_semesterticket', 'problem mit der beantragung von der unicard', 'sd_citavi bestellung', 'sd_immatrikulationsbescheigung_druckfehler', 'sd_vpn_aktualisierung', 'vpn_ipsec_stoerung', 'sd_dreamspark', 'ub_namensaenderung', 'sd_immatrikulationsbescheinigung_portal', 'ub_prod_neue unicard bei beschaedigung', 
'sd_vpn_webvpn', 'sd_telefonbuch_prof_eintragung', 'sd_kontakt_asknet', 'probleme mit unicard', 'sd_office 356 plus bestellung', 'sd_gmx_web.de', 'fehlender eintrag im elektronischen telefonbuch', 'ub_prod_namenskorrektur_student', 'einrichtung des eduroam netzwerks', 'sd_sap_initialkennwort', 'sd_boss-bescheinigung', 'sd_wlan passwort setzen', 'sd_aktivierung uniaccount', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_heirat_namensaenderung_student', 'ub_unicard_spaetere abholung moeglich?', 'unicard nochmal beantragen', 'sd_studisek_buchung_semesterbeitrag', 'sd_pruefungsamt', 'unicard vergessen abzuholen und nicht mehr da', 'sd_antrag funktionale mailadresse', 'sd_email_namensaenderung', 'sd_telefonbuch, neues system', 'sd_account_abmelden', 'ub_unicard_abholungszeiten']
labellist = ['sd_antworten_korrekt', 'sd_kurs-angebote anmeldung', 'sd_semesterticket', 'apps_dms-passwort d.3', 'freischaltung uniaccount', 'sd_heirat_namensaenderung_student', 'bd_unicard_freigabe_beantragung', 'sd_uniaccount_ehemalige_studierende', 'sd_sap_dienstreise', 'sd_origin_workaround', 'sd_uniaccount_ehemalige_passwortaenderung', 'fehlender eintrag im elektronischen telefonbuch', 'wlan', 'sd_tu-app feedback standard', 'sd_wlan_beratung', 'sd_uniaccount_passwortaenderung', 're: elektroarbeiten fuer leitsystem 2. und 3. obergeschoss', 'sd_webmailer_threadanzeige und weiterleitung', 'ub_unicard_spaetere abholung moeglich?', 'sd_citavi_support', 'sd_outlook kontakte automatische aktualisierung', 'sd_origin nur noch eine seriennummer', 'lsf freischaltung als mitarbeiter/in', 'cm_asiexception', 'sd_freischaltung uniaccount verzoegert', 'ub_unicard_zusendung der karte moeglich?', 'sd_login_unibib ub-it', 'uni card', 'sd_outlook anmeldung gestoert', 'd.3 client installation', 'ub_unicard_abholungszeiten', 'sd_antwort_phishingmail', 'sd_matlab lizenzdatei pc-pools', 'sd_sap_initialkennwort', 'sd_sap_freischaltung ohne passwortaenderung', 'sd_spss_online_bestellung', 'probleme mit der namensaenderung/ neue unicard', 'sd_keine rueckantwort kunde', 'sd_unimail imap_pop3', 'sd_beantragung_unicard', 'sd_unicard_gesperrte unicard entsperre', 'sd_internationaloffice', 'unicard nochmal beantragen', 'sd_stellenausschreibung schwarzes brett', 'sd_sophos download', 'cm_lsf-boss_freischaltung', 'sd_verlust/antrag unicard', 'vpn_ipsec_stoerung', 'sd_account_abmelden', 'sd_outlook_in_exchange_einbinden', 'ub_namensaenderung', 'sd_telefon (antrag: neuanschluss, umzug, änderung erledigt)', 'unicard vergessen abzuholen und nicht mehr da', 'apps_redmine_repository', 'einrichtung des eduroam netzwerks', 'sd_unicard_max_laufzeit', 'sd_gmx_web.de', 'sd_unicard fehlerhafte geldbuchung', 'sd_geraeteausleihe', 'spam ohne tu bezug', 'sd_uniaccount_dauer freischaltung', 'apps_dms_d.3 client installation/login d.3 funktioniert nicht', 'sd_office 365 plus support', 'sd_unicard_defekt', 'sd_phishing', 'sd_goeke drucker', 'ub_unicard_unicard mit vollmacht abholen', 'sd_gleitzeitanlage_dez3_stoerung', 'sd_pruefungsamt', 'sd_aktivierung uniaccount', 'sd_boss-bescheinigung', 'sd_sap_initialkennwort_englisch', 'bd_unicard_geldkarte_laden', 'sd_telefonbuch-eintrag_änderung', 'vpn verbindung fuer unitymedia kunden', 'sd_studisek', 'sd_antrag funktionale mailadresse', 'sd_asknet_und_dreamspark', 'sd_unicard_workaround_bestellung', 'sd_sap_firefox_esr', 'sd_vpn anleitungen', 'sd_office365_asknet', 'citavi_lizenzschluessel_nicht bekommen', 'sd_sap konteneinsicht antrag', 'sd_spam e-mail bekannt meldung', 'ub_prod_namenskorrektur_student', 'ub_beschaedigte unicard', 'sd_namensaenderung mitarbeiter', 'sd_mail_als_anhang', 'benutzer zum redmine hinzufuegen', 'sd_login_tu_portale', 'sd_email_namensaenderung', 'windows 10', 'ungueltiges ticket siehe journal', 'sd_vpn_temporaerer fehler ub', 'ub_prod_neue unicard bei beschaedigung', 'sd_dreamspark', 'sd_webmailer einrichtung weiterleitung', 'sd_asknet_mitarbeiter_softwarebestellung', 'sd_studisek_buchung_semesterbeitrag', 'sd_immatrikulationsbescheinigung_portal', 'sd_vpn_probleme_mit_unitymedia', 'sd-e-mail_adresse_funktional_beantragen', 'sd_diensthandy beschaffung', 'sd_vpn_webvpn', 'sd_laufzeit unimail account', 'sd_citavi', 'problem mit der beantragung von der unicard', 'sd_kurs-angebote itmc', 'sd_telefonbuch, neues system', 'sd_login tu portale', 'sd_wlan passwort setzen', 
'sd_zugriff_onlinedienste_rueckmeldung', 'unicard_restbetrag_auszahlung', 'sd_immatrikulationsbescheigung_druckfehler', 'bd_unicard_nicht_eingeschrieben', 'sd_unimail zu exchange', 'sd_wlan-gastkonto', 'probleme mit unicard', 'sd_telefonbuch_prof_eintragung', 'sd_vpn_aktualisierung', 'sd_apple-on-campus', 'bd_goeke_allgemein', 'studierendenausweis', 'ub_drucker kopierer', 'sd_unicard_abholung', 'sd_office 356 plus bestellung', 'ohne betreff', 'sd_tu_app_keine internetverbindung', 'sd_boss_notenverbuchung', 'ub_geldchip-problem bei uc', 'sd_itmc kurse anmeldebestaetigung', 'sd_citavi bestellung', 'sd_telefonantrag_änderung_neuantrag', 'sd_sap_konteneinsicht_ workaround', 'sd_kontakt_asknet', 'sd_plotauftrag_zv', 'sd_webmailer_thread-anzeige', 'sd_gastaufenthalter']

for l in used_list:
    if l not in labellist:
        print(l)

print(len(used_list))
print(len(labellist))

# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
preCorpus_name = "de" + "_pre_ticket"
corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(corpus.lang))
#
#todo randomize

split_index = int(float(len(corpus)) * 0.8)
corpus_train = corpus[0:split_index]
corpus_test = corpus[split_index:len(corpus)-1]
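# The "#todo randomize" above could be addressed by shuffling document indices before the
# 80/20 split. Minimal sketch, not part of the commit; assumes corpus supports integer
# indexing as it is used above:
#   shuffled = np.random.permutation(len(corpus))
#   split_index = int(len(corpus) * 0.8)
#   corpus_train = [corpus[int(i)] for i in shuffled[:split_index]]
#   corpus_test = [corpus[int(i)] for i in shuffled[split_index:]]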

# lda: darken the image
# close-ups / excerpts

import numpy as np
matplotlib.use('Agg')

import matplotlib.pyplot as plt


end = time.time()
print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))

"""
maybe split categories into subcategories

general:
fix utf encoding, split words at special characters
remove names, addresses after greetings

remove emails, urls, numbers
maybe even remove everything that contains those (or ends in a .topleveldomain, contains special characters, or contains an @)

separate meaningful words from garbage: 8203;verfügung

remove abbreviations: m.a, o.ä.

meaningless examples: nr54065467 455a33c5 tvt?= ------problem--------

"\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
"""

"""
# load config

@@ -81,17 +184,6 @@ print(len(set(bla)))

print()
"""

x = [[1,2,3],[3,4,5]]
arr = np.array(x)
print(arr)

"""
#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name)

@@ -104,6 +196,7 @@ list = [(key,value) for key,value in dict.items()]

list.sort(key=lambda tup : tup[1])
"""

"""
from spacy.tokens.doc import Doc as SpacyDoc

@@ -137,7 +230,6 @@ textacy.fileio.write_file_lines(readCorpus(filepath),"/home/jannis.grundmann/Pyc

# printRandomDoc(raw_corpus)

"""
spacy_doc = PARSER("test")
save_obj(spacy_doc, "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")

@@ -155,6 +247,7 @@ laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_h

with open(LLDA_filepath, 'w') as file:
    file.write(json.dumps(laveldict))
"""

"""
def load_corpus(corpus_path, corpus_name, lang="de"):
    from pathlib import Path

@@ -609,25 +702,3 @@ textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")

"""

end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))

"""
maybe split categories into subcategories

general:
fix utf encoding, split words at special characters
remove names, addresses after greetings

remove emails, urls, numbers
maybe even remove everything that contains those (or ends in a .topleveldomain, contains special characters, or contains an @)

separate meaningful words from garbage: 8203;verfügung

remove abbreviations: m.a, o.ä.

meaningless examples: nr54065467 455a33c5 tvt?= ------problem--------

"\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
"""
topicModeling.py (364 changed lines)
@@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-

from datetime import datetime
import draw
import draw1
import time
import numpy as np

@@ -30,38 +31,9 @@ with open(config_ini) as f:
    config.read_file(f)


def label2ID(label, labeldict):
    return labeldict.get(label, len(labeldict))


def generate_lablelID_lines(textacyCorpus, labeldict):
    for doc in textacyCorpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text

"""
def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
    logprint(str("ngrams: {0}".format(ngrams)))
    logprint(str("min_df: {0}".format(min_df)))
    logprint(str("max_df: {0}".format(max_df)))
    logprint(str("named_entities: {0}".format(named_entities)))

    # printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.__getattribute__("id_to_term")

    for t in terms_list:
        print(t)
    logprint("doc_term_matrix: {0}".format(doc_term_matrix))
    logprint("id2term: {0}".format(id2term))
"""

def textacyTopicModeling(corpus,
                         n_topics = 15, top_topic_words = 7, top_document_labels_per_topic = 5,
                         ngrams = 1, min_df=1, max_df=1.0,
                         ngrams = 1, min_df=1, max_df=0.9,
                         topicModel='lda'):

@@ -73,10 +45,7 @@ def textacyTopicModeling(corpus,

    logprint("#### Topic Modeling {0}".format(topicModel))
    logprint(
        "############### Topic Modeling {0} ###########################".format(
            topicModel))
    logprint(str("ngrams: {0}".format(ngrams)))
    logprint(str("min_df: {0}".format(min_df)))
    logprint(str("max_df: {0}".format(max_df)))

@@ -93,7 +62,7 @@ def textacyTopicModeling(corpus,

    #################### vectorize corpi ####################
    ###### vectorize corpi
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

@@ -101,14 +70,9 @@ def textacyTopicModeling(corpus,
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.__getattribute__("id_to_term")

    # printlog("terms_list: {0}".format(list(terms_list)))
    # printlog("doc_term_matrix: {0}".format(doc_term_matrix))

    ####### Initialize and train a topic model
    ##################### Initialize and train a topic model ##############################################
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)

@@ -118,7 +82,7 @@ def textacyTopicModeling(corpus,

    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
        logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
        logprint('{0}: {1}'.format(topic_idx, " ".join(top_terms)))
    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
        logprint(topic_idx)

@@ -132,66 +96,44 @@ def textacyTopicModeling(corpus,

    grams_label = "uni" if ngrams == 1 else "bi"

    model.termite_plot(doc_term_matrix, id2term,
    draw1.termite_plot(model,doc_term_matrix, id2term,
                       n_terms=n_terms,
                       sort_terms_by=sort_terms_by,
                       rank_terms_by=rank_terms_by+'_weight',
                       rank_terms_by=rank_terms_by + '_weight',
                       save= FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label,topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by))
                       save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label, topicModel, n_topics,
                                                                              n_terms, sort_terms_by, rank_terms_by))

    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))

def jgibbsLLDA(labeldict,line_gen,path2save_results, top_topic_words=7):
def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
    labeldict_rev = {v: k for k, v in labeldict.items()}
    start = time.time()

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

    # build dictionary of ticketcategories
    textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)
    labelist = []
    for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
        labelist.append(texdoc.metadata["categoryName"])

    labeldict = {k: v for v, k in enumerate(labelist)}
    reverse_labeldict = {v: k for k, v in labeldict.items()}

    #and save
    labeldict_path = FILEPATH + "results/labeldict.txt"
    with open(labeldict_path, 'w') as file:
        file.write(json.dumps(labeldict))

    n_topics = len(labeldict) #+1 #default-topic

    # create file with label_IDs (input for llda)
    textacy.fileio.write_file_lines(generate_lablelID_lines(corpus, labeldict), filepath=LLDA_filepath)

    # wait for file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    logprint("")
    logprint("start LLDA:")

    # run JGibbsLLDA file

    n_topics = len(labeldict) #+1 #default-topic

    FNULL = open(os.devnull, 'w')  # suppress output
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(

@@ -218,7 +160,7 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
        if len(findall) != 0:
            try:
                index = int(findall[0].split()[1])
                result.append("Topic {} {}:".format(index, reverse_labeldict[index]))
                result.append("Topic {} {}:".format(index, labeldict_rev[index]))
            except:
                result.append(line)

@@ -243,37 +185,15 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
            index = int(findall[0].split()[1])
            res_dict = {index : str(reverse_labeldict[index]) }
            res_dict = {index : str(labeldict_rev[index]) }
        else:
            splitted = line.split()
            res_dict[splitted[0]] = float(splitted[1])

            ### print terms that are topics
            for s in list(res_dict.values()):
                if isinstance(s,str) and splitted[0] in s:
                    vals = list(res_dict.values())
                    keys = list(res_dict.keys())
                    for v in vals:
                        if not isinstance(v,float):
                            print("{}".format(v))
                    print("{}".format(splitted[0]))
                    count +=1
                    print()
            ###

    if len(res_dict) != 0:
        results.append(res_dict)  # append the last one to the list

    print(count)
    print(float(count)/float(len(labelist)))

    # {0: 'betrieb', 'service': 0.24162679425837305, 'support': 0.24162679425837305, 'browser': 0.24162679425837305, 'unicard': 0.24162679425837305, 'telefon': 0.0023923444976076593}

    # every term in the results to a list

@@ -286,20 +206,12 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
    term2id = {t:i for i,t in enumerate(terms)}  #and to dict

    ################# termite plot #####################################################################
    topic_labels = list(range(len(labeldict)))
    #term_topic_weights.shape = (len(term_ids),len(topic_ids)

    #topic_labels = tuple(labelist)
    topic_labels = list(range(len(labelist)))
    term_labels = list(range(len(term2id)))  #tuple([key for key in term2id.keys()])

    term_topic_weights = np.zeros((len(term2id),len(topic_labels)))

    for i,res in enumerate(results):
        for key,value in res.items():

@@ -308,77 +220,223 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
            term_topic_weights[term2id[key]][i] = value
            term_labels[term2id[key]] = key
        else:
            topic_labels[i] = reverse_labeldict[key]
            topic_labels[i] = labeldict_rev[key]

    draw.draw_termite(
    viz.draw_termite_plot(
        term_topic_weights, topic_labels, term_labels, save=path2save_results+".png")

    draw.draw_termite(
        term_topic_weights, topic_labels, term_labels, save=path2save_results+"_spaced.png",pow_x=0.78,pow_y=0.87)

    # save labeldict
    labeldict_path = path2save_results + "_labeldict.json"
    with open(labeldict_path, 'w') as file:
        file.write(json.dumps(labeldict))


def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):

    start = time.time()
    logprint("")
    logprint("start Category-LLDA:")

    # build dictionary of ticketcategories
    labelist = []
    for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
        labelist.append(texdoc.metadata["categoryName"])

    labelist = list(set(labelist))
    print("len(labelist): {}".format(len(labelist)))

    labeldict = {k: v for v, k in enumerate(labelist)}

    def gen_cat_lines(textacyCorpus, labeldict):
        """ generates [topic1, topic2....] tok1 tok2 tok3 out of corpi"""
        for doc in textacyCorpus:
            yield "[" + str(labeldict.get(doc.metadata["categoryName"], len(labeldict))) + "] " + doc.text
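    # Note: each line yielded here (and written to tickets.gz by jgibbsLLDA above) follows the
    # JGibbsLabeledLDA input format -- the numeric label ID in brackets, then the document tokens,
    # e.g. (ticket text invented for illustration):
    #   [0] benutzer kann sich nicht am webmailer anmelden passwort zuruecksetzen
    #   [3] unicard defekt austausch beantragen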
    line_gen = gen_cat_lines(corpus, labeldict)

    path2save_results = path2save_results + "_kb_cat_llda_{}".format("top" + str(top_topic_words))

    jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words)

    end = time.time()
    logprint("Time Elapsed Topic Modeling JGibbsLLDA:{0} min\n".format((end - start) / 60))
    logprint("\n\n\nTime Elapsed Category-LLDA :{0} min\n\n".format((end - start) / 60))


def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=False):
    """ticket_ID -> KB_ID -> keywords / subject -> llda"""

    start = time.time()
    logprint("")
    logprint("start {}-LLDA:".format("Keyword" if kb_keywords else "Subject"))

    # ticket2kb_dict
    kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")

    ticket2kb_dict = {}
    for line in kb2ticket_gen:
        ticket_id = line[0]
        kb_id = line[1]
        ticket2kb_dict[ticket_id] = kb_id
    # {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...}

    kb_entries_used = len(list(set(ticket2kb_dict.values())))
    print("kb_entries_used: {}".format(kb_entries_used))

    # kb2keywords_dict
    kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
    next(kb2keywords_gen,None)  #skip first line("ArticleID";"Subject";"Keywords";...)

    kb2keywords_dict = {}
    for line in kb2keywords_gen:
        kb_id = line[0]
        subject = line[1]
        keywords = line[2]
        keywords_list = [normalize(x) for x in str(keywords).split(",")]

        if kb_id not in kb2keywords_dict.keys():
            kb2keywords_dict[kb_id] = []

        if kb_keywords:
            for item in keywords_list:
                if item != "":
                    kb2keywords_dict[kb_id].append(item)
        else:
            kb2keywords_dict[kb_id].append(subject)

    #remove all empty items
    kb2keywords_dict = { k : v for k,v in kb2keywords_dict.items() if len(v) != 0}
    # {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...}

    #keywords2kb_dict
def main(use_cleaned=False, algorithm="llda"):
    keywords2kb_dict = {}
    for kb_id, lst in kb2keywords_dict.items():
        for l in lst:
            if l not in keywords2kb_dict.keys():
                keywords2kb_dict[l] = [kb_id]
            else:
                keywords2kb_dict[l].append(kb_id)
    # {'unicard namensaenderung': ['KBA10276'], 'vpn': ['KBA10063'], 'outlook_exchange': ['KBA10181'], ...}

    # idea: http://bigartm.org/
    # idea: http://wiki.languagetool.org/tips-and-tricks
    # idea: https://en.wikipedia.org/wiki/Noisy_text_analytics
    # idea: https://gate.ac.uk/family/

    # Look for actually used keywords
    used_keywords = []
    for doc in corpus:
        ticket_number = doc.metadata["TicketNumber"]
        kb_id = ticket2kb_dict.get(ticket_number, None)
        keywords = kb2keywords_dict.get(kb_id, None)
        if keywords and kb_id:
            used_keywords.append(list(map(normalize,keywords)))

    labelist = [item for sublist in used_keywords for item in sublist]  #flatten list
    labelist = list(set(labelist))
    print("len(labelist): {}".format(len(labelist)))

    labeldict = {k: v for v, k in enumerate(labelist)}

    def gen_KB_lines(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict):
        for doc in corpus:
            ticket_number = doc.metadata["TicketNumber"]
            kb_number = ticket2kb_dict.get(ticket_number, None)
            keywords = kb2keywords_dict.get(kb_number, None)
            if keywords:
                label = ""
                for kw in keywords:
                    label = label + str(labeldict.get(normalize(str(kw)), len(labeldict))) + " "
                yield "[ " + label + "] " + doc.text

    line_gen = gen_KB_lines(corpus, labeldict, ticket2kb_dict, kb2keywords_dict)

    path2save_results = path2save_results + "_kb_{}_llda_{}".format("keys" if kb_keywords else "subs",
                                                                    "top" + str(top_topic_words))

    jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words)

    end = time.time()
    logprint("\n\n\nTime Elapsed {1}-LLDA :{0} min\n\n".format((end - start) / 60,"Keyword" if kb_keywords else "Subject"))


def main( algorithm="llda"):

    logprint("Topic Modeling: {0}".format(datetime.now()))

    corpus_de_path = FILEPATH + config.get("de_corpus", "path")
    corpus_en_path = FILEPATH + config.get("en_corpus", "path")

    if use_cleaned:
    preCorpus_name = "de" + "_pre_ticket"
        preCorpus_name = "de" + "_clean_ticket"
    resultspath = FILEPATH + "results/pre"
        resultspath = FILEPATH + "results/clean"
    else:
        preCorpus_name = "de" + "_pre_ticket"
        resultspath = FILEPATH + "results/pre"

    # load corpus
    # load cleaned corpus
    de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
    logprint("Corpus loaded: {0}".format(de_corpus.lang))

    # todo: merge llda topics
    # idea: train lda so that the term <-> topic assignment does not become too weak, while keeping as many topics as possible
    # question: how many tickets per topic?

    """
    ngrams = 1
    min_df = 1
    max_df = 1.0
    weighting = 'tf'
    # weighting ='tfidf'
    named_entities = False

    printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
    printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
    printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting)

    printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting)
    printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
    printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
    """

    if algorithm == "llda":
        top_topic_words = 5
        path2save_results = resultspath + "_{}_{}".format(algorithm,"top"+str(top_topic_words))
        jgibbsLLDA_category(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        kb_keywords = False
        jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)

        kb_keywords = True
        jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)

        """
        top_topic_words = 10

@@ -399,10 +457,7 @@ def main(use_cleaned=False, algorithm="llda"):

    textacyTopicModeling(ngrams = 1,
                         min_df = 1,
                         max_df = 0.9,
                         topicModel = algorithm,
                         n_topics =15,
                         corpus=de_corpus)
    """
    textacyTopicModeling(ngrams=1,

@@ -411,7 +466,7 @@ def main(use_cleaned=False, algorithm="llda"):
                         topicModel=algorithm,
                         n_topics=20,
                         corpus=de_corpus)

    textacyTopicModeling(ngrams=1,
                         min_df=1,
                         max_df=0.9,

@@ -430,10 +485,7 @@ def main(use_cleaned=False, algorithm="llda"):

    textacyTopicModeling(ngrams=(1, 2),
                         min_df=1,
                         max_df=0.9,
                         topicModel=algorithm,
                         n_topics=15,
                         corpus=de_corpus)
    """
    textacyTopicModeling(ngrams = (1,2),

@@ -442,7 +494,7 @@ def main(use_cleaned=False, algorithm="llda"):
                         topicModel = algorithm,
                         n_topics =20,
                         corpus=de_corpus)

    textacyTopicModeling(ngrams = (1,2),
                         min_df = 1,
                         max_df = 0.9,