start auswertung ("start evaluation")

This commit is contained in:
parent 7214911606
commit 873e9ff7d2
@@ -90,6 +90,7 @@ def autocorrectWord(word):

 def clean(stringstream,autocorrect=False):

     for string in stringstream:
@@ -165,6 +166,8 @@ autocorrect = config.getboolean("preprocessing", "autocorrect")

 def cleanCorpus(corpus_path, clean_in_meta, lang="de", printrandom=10,autocorrect=False):

+    autocorrect = False  #todo STELLSCHRAUBE (tuning knob)

     logprint("Clean {0}_corpus at {1}".format(lang, datetime.now()))

     rawCorpus_name = lang + "_raw_ticket"
6 binary files changed (not shown).
main.py (26 changed lines)

@@ -3,7 +3,7 @@ import matplotlib

 matplotlib.use('Agg')
 import time
 import init
+from datetime import datetime
 import corporization
 import preprocessing
 import topicModeling
@@ -21,6 +21,8 @@ start = time.time()

+# idea: drop frequent n-grams (e.g. "damen und herren" / "ladies and gentlemen")
 # idea: merge LLDA topics
 # idea: train LDA so that the term <-> topic assignment does not get too weak, while keeping as many topics as possible
 # question: which employees handled which topics? idea: replace topics with employee numbers
@@ -29,9 +31,10 @@ start = time.time()

 # todo: test the models

+logprint("main.py started at {}".format(datetime.now()))

 """
 init.main()
 logprint("")
@@ -41,13 +44,13 @@ logprint("")

 cleaning.main()
 logprint("")

-preprocessing.main() # ~5h
+preprocessing.main()
 logprint("")

 """

 #topicModeling.main(algorithm="lsa")
 logprint("")
@@ -56,16 +59,17 @@ logprint("")

 logprint("")

-topicModeling.main(algorithm="llda")
-logprint("")

 #topicModeling.main(algorithm="llda")
 logprint("")

+topicModeling.main(algorithm="lda")
+logprint("")

 end = time.time()
+logprint("main.py finished at {}".format(datetime.now()))
 logprint("Total Time Elapsed: {0} min".format((end - start) / 60))

 #800*400
@@ -49,6 +49,10 @@ def filterTokens(tokens, funclist):

     for f in funclist:
         tokens = list(filter(f, tokens))

+    for tok in tokens:
+        if tok.pos_ =="NOUN":
+            x=0

     return tokens
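Note: filterTokens above just chains the predicates in funclist with filter(). A minimal standalone sketch of that composition, using plain strings instead of spaCy tokens and two made-up predicates:

    def filterTokens(tokens, funclist):
        for f in funclist:
            tokens = list(filter(f, tokens))
        return tokens

    is_long_enough = lambda tok: len(tok) > 2
    has_no_digit = lambda tok: not any(ch.isdigit() for ch in tok)

    print(filterTokens(["Uni", "ab", "Raum42", "Ticket"], [is_long_enough, has_no_digit]))
    # -> ['Uni', 'Ticket']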
@@ -57,7 +61,9 @@ def keepPOS(pos_list):

 def keepNouns(noun_list=NOUNS):
-    return lambda tok: tok.lower_ in noun_list
+    #return lambda tok: tok.lower_ in noun_list
+    return lambda tok: tok.lower_ in noun_list or tok.pos_ == "NOUN"

 def removePOS(pos_list):
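Note: the new keepNouns filter is wider than before: a token now passes if its lowercase form is in the noun whitelist or if spaCy tagged it as NOUN. A minimal sketch of that behaviour, using a stand-in namedtuple instead of real spaCy tokens (only the lower_ and pos_ attributes are assumed):

    from collections import namedtuple

    Tok = namedtuple("Tok", ["lower_", "pos_"])

    def keepNouns(noun_list):
        return lambda tok: tok.lower_ in noun_list or tok.pos_ == "NOUN"

    keep = keepNouns({"unicard", "drucker"})
    toks = [Tok("unicard", "X"), Tok("laufen", "VERB"), Tok("antrag", "NOUN")]
    print([t.lower_ for t in toks if keep(t)])
    # -> ['unicard', 'antrag']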
@@ -204,8 +210,8 @@ def processContentstream(textstream, parser, token_filterlist=None):

     tokens = filterTokens(tokens, token_filterlist)

     # post parse
-    tokens = [postparse(tok) for tok in tokens] #todo: loss of pos, tag information etc.!
+    #todo STELLSCHRAUBE tokens = [postparse(tok) for tok in tokens] #todo: loss of pos, tag information etc.!
+    tokens = [tok.lower_ for tok in tokens]
     yield " ".join(tokens)

 def preparse(stringstream):
@@ -360,16 +366,13 @@ def main():

         keepNouns(NOUNS),

-        remove_words_containing_Numbers(),
+        removeWords(DE_STOP_WORDS + custom_words + VORNAMEN),

         removePOS(["PUNCT", "SPACE", "NUM"]),

-        removeWords(DE_STOP_WORDS + custom_words + VORNAMEN),
-        #removeWords(DE_STOP_WORDS),
-        remove_long_words(),
-        remove_short_words(),
-        remove_first_names()
+        #todo STELLSCHRAUBE remove_words_containing_Numbers(),
+        #todo STELLSCHRAUBE remove_long_words(),
+        #todo STELLSCHRAUBE remove_short_words()

     ]
test.py (200 changed lines)
@@ -27,16 +27,189 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")

 preCorpus_name = "de" + "_pre_ticket"
 corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
 logprint("Corpus loaded: {0}".format(corpus.lang))
-#

 #todo randomize

-split_index = int(float(len(corpus)) * 0.8)
+split = 0.8
+weighting = "tf"
+min_df = 0
+max_df = 1
+ngrams = 1
+n_topics = 3
+top_n = 7
+
+split_index = int(float(len(corpus)) * split)
 corpus_train = corpus[0:split_index]
 corpus_test = corpus[split_index:len(corpus)-1]

+###### Initialize and train a topic model
+vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
+terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus_train)
+doc_term_matrix = vectorizer.fit_transform(terms_list)
+id2term = vectorizer.__getattribute__("id_to_term")
+model = textacy.tm.TopicModel("lda", n_topics=n_topics)
+model.fit(doc_term_matrix)
+######
+
+compenents = model.model.components_
+
+"""
+components_ : array, [n_components, n_features]
+
+Variational parameters for topic word distribution.
+
+Since the complete conditional for topic word distribution is a Dirichlet,
+components_[i, j] can be viewed as pseudocount that represents
+the number of times word j was assigned to topic i.
+
+It can also be viewed as distribution over the words for each topic after normalization:
+model.components_ / model.components_.sum(axis=1)[:, np.newaxis].
+"""
+
+test_doc = corpus_test[0]
+
+end = time.time()
+print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))
+
+"""
 # question: how many tickets per topic?

+ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/de_tickets.csv", delimiter=";")
+
+cat_dict = {}
+cat2id_dict = {}
+for line in ticket_gen:
+    tick_id = line[0]
+    cat = normalize(line[3])
+
+    cat2id_dict[cat] = tick_id
+
+    if cat not in cat_dict.keys():
+        cat_dict[cat] = 1
+    else:
+        cat_dict[cat] += 1
+
+import operator
+
+sorted_dict = sorted(cat_dict.items(), key=operator.itemgetter(1))
+
+for k, v in sorted_dict:
+    if k == "sd":
+        print(cat2id_dict[k])
+    print(k, v)
+
+print(len(sorted_dict))

 kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")

 ticket2kb_dict = {}
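Note: the components_ description quoted in the new test.py block comes from scikit-learn's LatentDirichletAllocation documentation. A minimal numpy sketch of the row normalization it describes (made-up pseudocounts, 2 topics over 3 words):

    import numpy as np

    components = np.array([[2.0, 6.0, 2.0],   # pseudocounts of topic 0
                           [1.0, 1.0, 8.0]])  # pseudocounts of topic 1

    topic_word = components / components.sum(axis=1)[:, np.newaxis]
    print(topic_word)              # each row is a word distribution for one topic
    print(topic_word.sum(axis=1))  # -> [1. 1.]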
@@ -118,14 +291,7 @@ for k,v in kb2keywords_dict.items(): #str,list

 import operator
-"""
-sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
-
-for k,v in sorted_dict:
-    print(k,v)
-
-print(len(sorted_dict))
-"""
@@ -152,6 +318,7 @@ for kb_entry in kb2keywords_gen:

     else:
         count_dict[entry_] += 1

+import operator

 sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
@@ -159,20 +326,7 @@ sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))

 # print(k,v)

 #print(len(sorted_dict))
+"""

-end = time.time()
-print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))
topicModeling.py (100 changed lines)
@@ -5,6 +5,7 @@ import draw

 import draw1
 import time
 import numpy as np
+import operator
 import csv
 import sys
@@ -80,9 +81,8 @@ def textacyTopicModeling(corpus,

     doc_topic_matrix = model.transform(doc_term_matrix)

-    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
-        logprint('{0}: {1}'.format(topic_idx, " ".join(top_terms)))
+    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words, weights=True):
+        logprint('{0}: {1}'.format(topic_idx, str(top_terms)))

     for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
         logprint(topic_idx)
@@ -96,7 +96,7 @@ def textacyTopicModeling(corpus,

     grams_label = "uni" if ngrams == 1 else "bi"

-    draw1.termite_plot(model, doc_term_matrix, id2term,
+    draw1.termite_plot(model, doc_term_matrix, vectorizer.id_to_term,
                        n_terms=n_terms,
                        sort_terms_by=sort_terms_by,
@@ -117,8 +117,6 @@ def jgibbsLLDA(labeldict,line_gen,path2save_results, top_topic_words=7):

     LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

     textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)
@@ -241,24 +239,31 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):

     logprint("start Category-LLDA:")

     # build dictionary of ticket categories
     labelist = []
     for doc in corpus:
-        labelist.append(normalize(doc.metadata["categoryName"]))
+        category = normalize(doc.metadata["categoryName"])
+        labelist.append(category)

     labelist = list(set(labelist))
-    print("len(labelist): {}".format(len(labelist)))
+    #print("len(labelist): {}".format(len(labelist)))

     labeldict = {k: v for v, k in enumerate(labelist)}
+    labeldict.update({'DEFAULT' : len(labeldict)})

     def gen_cat_lines(textacyCorpus, labeldict):
         """ generates [topic1, topic2....] tok1 tok2 tok3 out of corpi"""

         for doc in textacyCorpus:
-            yield "[" + str(labeldict.get(doc.metadata["categoryName"], len(labeldict))) + "] " + doc.text
+            label = labeldict.get(normalize(doc.metadata["categoryName"]), labeldict['DEFAULT'])
+
+            # question: use only the x most frequent labels, drop the rest?
+
+            yield "[ " + str(label) + " ] " + doc.text

     line_gen = gen_cat_lines(corpus, labeldict)
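Note: gen_cat_lines produces the labelled-line format that jgibbsLLDA later writes out to tickets.gz: one line per ticket, starting with the numeric label id in brackets, followed by the preprocessed text. A standalone sketch with made-up category names and texts:

    labelist = ["sd_unicard", "sd_drucker"]
    labeldict = {k: v for v, k in enumerate(labelist)}
    labeldict.update({'DEFAULT': len(labeldict)})      # {'sd_unicard': 0, 'sd_drucker': 1, 'DEFAULT': 2}

    docs = [("sd_unicard", "unicard verloren sperren"),
            ("sd_vpn", "vpn verbindung bricht ab")]    # 'sd_vpn' is not in labeldict

    for category, text in docs:
        label = labeldict.get(category, labeldict['DEFAULT'])
        print("[ " + str(label) + " ] " + text)
    # -> [ 0 ] unicard verloren sperren
    # -> [ 2 ] vpn verbindung bricht ab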
@@ -274,6 +279,7 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):

     logprint("\n\n\nTime Elapsed Category-LLDA :{0} min\n\n".format((end - start) / 60))

+@deprecated
 def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=False):
     """ticket_ID -> KB_ID -> keywords / subject -> llda"""
@@ -420,7 +426,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

-    # kb2keywords_dict / kb2subj_dict {str : [str]}
+    # kb2keywords_dict / kb2subjects_dict --> {str : [str]}

     kb2keywords_dict = {}
     kb2subjects_dict = {}
@@ -458,7 +464,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

-    # ticket2kbs_dict
+    # ticket2kbs_dict --> {str : [str]}
     ticket2kbs_dict = {}
     kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
     next(kb2ticket_gen, None)  # skip first line "TicketNumber";"ArticleID"
@@ -479,8 +485,8 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

-    # ticket2keywords
-    ticket2keywords_dict = {} # {str:[str]}
+    # ticket2keywords --> {str:[str]}
+    ticket2keywords_dict = {}

     for ticket_id, kb_ids in ticket2kbs_dict.items():
@@ -496,8 +502,8 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

-    # ticket2subjects
-    ticket2subjects_dict = {} # {str:[str]}
+    # ticket2subjects --> {str:[str]}
+    ticket2subjects_dict = {}

     for ticket_id, kb_ids in ticket2kbs_dict.items():
@@ -513,13 +519,12 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

     # kb2keywords_dict     {'KBA10230': ['DEFAULT'], 'KBA10129': ['DEFAULT'], 'KBA10287': ['sd_ansys_informationen'], } len = 260
-    #kb2subjects_dict {'KBA10230': ['unicard nochmal beantragen'], 'KBA10129': ['sd_entsperrung unicard nach verlust/wiederfinden'], } len = 260
+    # kb2subjects_dict     {'KBA10230': ['unicard nochmal beantragen'], 'KBA10129': ['sd_entsperrung unicard nach verlust/wiederfinden'], } len = 260
     # ticket2kbs_dict      {'INC44526': ['KBA10056'], 'INC67205': ['KBA10056'], } len = 4832
     # ticket2keywords_dict {'INC44526': ['DEFAULT'], 'INC67205': ['DEFAULT'], 'INC71863': ['DEFAULT'], 'INC44392': ['asknet'] } len=4832
-    #ticket2subjects_dioct {'INC44526': ['sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)'], len=4832
+    # ticket2subjects_dict {'INC44526': ['sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)'], len=4832

-    # question: how many tickets per topic?
     count_dict = {}
     for v in ticket2kbs_dict.values():
         for kb in v:
@@ -527,18 +532,17 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

             count_dict[kb] += 1
         else:
             count_dict[kb] = 1
-    import operator

     sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
-    print("kb_entrys used: {}".format(len(sorted_dict)))

     for k, v in sorted_dict:
-        print(k, kb2subjects_dict[k], v)  #todo the same with keywords
+        subs = kb2subjects_dict[k]
+        keys = kb2keywords_dict[k]
+        print(subs, keys, v)  # question: how many tickets per topic?

+    print("kb_entrys used: {}".format(len(sorted_dict)))  # question: how many kb entries were used in total?: 155

-    #todo continue here
-
-    # todo question: how many kb entries were used in total?

     labelist = ticket2keywords_dict.values()
     labelist = flatten(labelist)
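Note: the mappings sketched in the comments above (KB2Ticket export rows inverted into ticket2kbs_dict, then counted per KB entry) follow a simple invert-and-count pattern. A standalone sketch, using the two ticket/KB ids from the example comments plus one invented id:

    rows = [("INC44526", "KBA10056"),
            ("INC67205", "KBA10056"),
            ("INC44526", "KBA10123")]        # (ticket_id, kb_id) pairs as in the KB2Ticket export

    ticket2kbs_dict = {}
    for ticket_id, kb_id in rows:
        ticket2kbs_dict.setdefault(ticket_id, []).append(kb_id)

    count_dict = {}
    for kbs in ticket2kbs_dict.values():
        for kb in kbs:
            count_dict[kb] = count_dict.get(kb, 0) + 1

    print(ticket2kbs_dict)  # {'INC44526': ['KBA10056', 'KBA10123'], 'INC67205': ['KBA10056']}
    print(count_dict)       # {'KBA10056': 2, 'KBA10123': 1}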
@@ -564,6 +568,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

             yield "[ " + label + "] " + doc.text

     keys_line_gen = gen_key_lines(corpus, labeldict, ticket2keywords_dict)

     path2save_keys_results = path2save_results + "_kb_keys_llda_{}".format("top" + str(top_topic_words))
@@ -574,28 +579,13 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

-    """
-    def gen_subj_lines(textacyCorpus, labeldict, ticket2subjects_dict):
-
-        for doc in corpus:
-
-            ticket_number = doc.metadata["TicketNumber"]
-            keywords = ticket2subjects_dict.get(ticket_number, ['DEFAULT'])
-
-            if keywords != ['DEFAULT']:
-
-                label = ""
-                for kw in keywords:
-                    label = label + str(labeldict.get(normalize(str(kw)), len(labeldict))) + " "
-
-                yield "[ " + label + "] " + doc.text
-    """

     labelist = ticket2subjects_dict.values()
     labelist = flatten(labelist)
     labelist = list(set(labelist))
     labeldict = {k: v for v, k in enumerate(labelist)}
     labeldict.update({'DEFAULT' : len(labeldict)})

     subj_line_gen = gen_key_lines(corpus, labeldict, ticket2subjects_dict)
@@ -616,19 +606,13 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):

 def main( algorithm="llda"):

     logprint("Topic Modeling: {0}".format(datetime.now()))

     corpus_de_path = FILEPATH + config.get("de_corpus", "path")
     corpus_en_path = FILEPATH + config.get("en_corpus", "path")
+    preCorpus_name = "de" + "_pre_ticket_old"
     preCorpus_name = "de" + "_pre_ticket"

     resultspath = FILEPATH + "results/pre"

     # load corpus
     de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
     logprint("Corpus loaded: {0}".format(de_corpus.lang))
@@ -643,15 +627,15 @@ def main( algorithm="llda"):

     jgibbsLLDA_KB_v2(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)

-    kb_keywords = False
-    #jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
-
-    kb_keywords = True
-    #jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)

     """
+    kb_keywords = False
+    jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
+
+    kb_keywords = True
+    jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)

     top_topic_words = 10
     path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
     jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)