# topicModelingTickets/topicModeling_1711_0846.py
# -*- coding: utf-8 -*-
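"""
Topic modeling of the (preprocessed) ticket corpus.

Two approaches are used here:
  * textacy topic models (lda / nmf / lsa) with a termite plot of the top terms per topic
  * Labeled LDA via the external JGibbsLLDA Java implementation, labeled either by
    ticket category or by knowledge-base (KB) keywords / subjects

Note: `draw`, `draw1` and `miscellaneous` are project-local modules (plotting helpers,
`logprint`, `load_corpus`, `normalize_str`, ...) and are assumed to be importable.
"""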
from datetime import datetime
import draw
import draw1
import time
import re
import numpy as np
import csv
import sys
import json
import os.path
import subprocess
from textacy import Vectorizer, viz
from miscellaneous import *
import textacy
from scipy import *
import os
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# load config (ConfigParser is assumed to be provided by the wildcard import from miscellaneous)
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
def label2ID(label, labeldict):
    """Map a category label to its numeric ID; unknown labels get a new ID (= current dict size)."""
    return labeldict.get(label, len(labeldict))
def generate_lablelID_lines(textacyCorpus, labeldict):
    for doc in textacyCorpus:
        # generate "[labelID] tok1 tok2 tok3 ..." lines out of the corpus
        yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text
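# Example of a line produced by generate_lablelID_lines (single label ID in brackets,
# followed by the document text; tokens are illustrative only):
#   [3] betreff beschreibung loesung ...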
"""
def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
    logprint(str("ngrams: {0}".format(ngrams)))
    logprint(str("min_df: {0}".format(min_df)))
    logprint(str("max_df: {0}".format(max_df)))
    logprint(str("named_entities: {0}".format(named_entities)))

    # printlog("vectorize corpus...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.id_to_term

    for t in terms_list:
        print(t)

    logprint("doc_term_matrix: {0}".format(doc_term_matrix))
    logprint("id2term: {0}".format(id2term))
"""
def textacyTopicModeling(corpus,
                         n_topics=15, top_topic_words=7, top_document_labels_per_topic=5,
                         ngrams=1, min_df=1, max_df=1.0,
                         topicModel='lda'):
    """Fit a textacy topic model (lda / nmf / lsa) on the corpus and save a termite plot of the top terms."""

    n_terms = int(n_topics * top_topic_words)
    sort_terms_by = 'seriation'  # 'seriation', 'weight', 'index', 'alphabetical'
    rank_terms_by = 'corpus'     # 'corpus', 'topic'

    logprint(
        "############### Topic Modeling {0} ###########################".format(
            topicModel))
    logprint(str("ngrams: {0}".format(ngrams)))
    logprint(str("min_df: {0}".format(min_df)))
    logprint(str("max_df: {0}".format(max_df)))
    logprint(str("n_topics: {0}".format(n_topics)))
    logprint("\n")

    start = time.time()

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')

    #################### vectorize corpus ####################

    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)

    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.id_to_term

    # printlog("terms_list: {0}".format(list(terms_list)))
    # printlog("doc_term_matrix: {0}".format(doc_term_matrix))

    ##################### Initialize and train a topic model ##############################################

    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)

    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
        logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))

    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
        logprint(topic_idx)
        for j in top_docs:
            logprint(corpus[j].metadata['categoryName'])

    ####################### termite plot ###################################################################

    grams_label = "uni" if ngrams == 1 else "bi"

    """
    model.termite_plot(doc_term_matrix, id2term,
                       n_terms=n_terms,
                       sort_terms_by=sort_terms_by,
                       rank_terms_by=rank_terms_by + '_weight',
                       save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label, topicModel, n_topics, n_terms, sort_terms_by, rank_terms_by))
    """

    draw1.termite_plot(model, doc_term_matrix, id2term,
                       n_terms=n_terms,
                       sort_terms_by=sort_terms_by,
                       rank_terms_by=rank_terms_by + '_weight',
                       save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label, topicModel, n_topics,
                                                                              n_terms, sort_terms_by, rank_terms_by))

    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}: {0} min\n\n".format((end - start) / 60, topicModel))
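# ---------------------------------------------------------------------------------------------------------
# Labeled LDA (JGibbsLLDA) workflow used by the two functions below:
#   1. build a label dictionary (ticket categories, or KB keywords / subjects)
#   2. write one "[labelID(s)] token token ..." line per document to models/tickets/tickets.gz
#   3. run the external jgibblda.LDA estimator via subprocess
#   4. parse the resulting .twords.gz (top words per topic) and render a termite plot
# ---------------------------------------------------------------------------------------------------------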
def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
    """Labeled LDA over the ticket corpus, using the ticket category as (single) label per document."""

    start = time.time()

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

    # build dictionary of ticket categories
    labelist = []
    for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
        labelist.append(texdoc.metadata["categoryName"])

    labeldict = {k: v for v, k in enumerate(labelist)}
    reverse_labeldict = {v: k for k, v in labeldict.items()}

    # and save it
    labeldict_path = FILEPATH + "results/labeldict.txt"
    with open(labeldict_path, 'w') as file:
        file.write(json.dumps(labeldict))

    n_topics = len(labeldict)  # +1 #default-topic

    # create file with label_IDs (input for llda)
    textacy.fileio.write_file_lines(generate_lablelID_lines(corpus, labeldict), filepath=LLDA_filepath)

    # wait for file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    logprint("")
    logprint("start LLDA:")
    # run JGibbsLLDA
    FNULL = open(os.devnull, 'w')  # suppress output
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
                       "tickets.gz",
                       "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
    subprocess.call(cmd_jgibbs_java, stdout=FNULL)

    # NOTE: the output files are hidden; they can be found in models/
    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")

    topic_regex = re.compile(r'Topic [0-9]*')
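    # The decompressed .twords.gz is expected to look roughly like this (terms and weights illustrative):
    #   Topic 0:
    #       drucker 0.2416
    #       treiber 0.0023
    #   Topic 1:
    #       ...
    # i.e. a "Topic <n>" header line followed by "<term> <weight>" lines, which is what the
    # two parsing loops below rely on.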
    #####################################
    # TODO save results in a file based on `results`
    result = []

    for line in output.splitlines():
        findall = topic_regex.findall(line)
        if len(findall) != 0:
            try:
                index = int(findall[0].split()[1])
                result.append("Topic {} {}:".format(index, reverse_labeldict[index]))
            except Exception:
                result.append(line)
        else:
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results + ".txt")
    #####################################

    results = []
    res_dict = {}
    count = 0
    for line in output.splitlines():

        findall = topic_regex.findall(line)

        if len(findall) != 0:
            if len(res_dict) != 0:
                results.append(res_dict)  # append the previous dict to the list (it is complete at this point)

            index = int(findall[0].split()[1])
            res_dict = {index: str(reverse_labeldict[index])}

        else:
            splitted = line.split()
            res_dict[splitted[0]] = float(splitted[1])

            """
            ### print terms that are topics
            for s in list(res_dict.values()):
                if isinstance(s, str) and splitted[0] in s:
                    vals = list(res_dict.values())
                    keys = list(res_dict.keys())
                    for v in vals:
                        if not isinstance(v, float):
                            print("{}".format(v))
                    print("{}".format(splitted[0]))
                    count += 1
                    print()
            ###
            """

    if len(res_dict) != 0:
        results.append(res_dict)  # append the last one to the list

    # print(count)
    # print(float(count)/float(len(labelist)))

    # e.g. {0: 'betrieb', 'service': 0.24162679425837305, 'support': 0.24162679425837305, 'browser': 0.24162679425837305, 'unicard': 0.24162679425837305, 'telefon': 0.0023923444976076593}

    # collect every term in the results into one list
    terms = []
    for res in results:
        for key, value in res.items():
            if not isinstance(key, int) and key not in terms:
                terms.append(key)

    term2id = {t: i for i, t in enumerate(terms)}  # and into a dict

    ################# termite plot #####################################################################
    # term_topic_weights.shape = (len(term_ids), len(topic_ids))
    # topic_labels = tuple(labelist)

    topic_labels = list(range(len(labelist)))
    term_labels = list(range(len(term2id)))  # tuple([key for key in term2id.keys()])

    term_topic_weights = np.zeros((len(term2id), len(topic_labels)))

    for i, res in enumerate(results):
        for key, value in res.items():
            if not isinstance(key, int):
                term_topic_weights[term2id[key]][i] = value
                term_labels[term2id[key]] = key
            else:
                topic_labels[i] = reverse_labeldict[key]

    # viz.draw_termite_plot(term_topic_weights, topic_labels, term_labels, save=path2save_results + ".png")
    draw.draw_termite(
        term_topic_weights, topic_labels, term_labels, save=path2save_results + ".png")

    end = time.time()
    logprint("Time Elapsed Topic Modeling JGibbsLLDA: {0} min\n".format((end - start) / 60))
def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=False):
    """Labeled LDA over the ticket corpus, labeled by KB keywords (kb_keywords=True) or KB subjects."""

    start = time.time()

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

    # ticket2kb_dict
    kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")

    ticket2kb_dict = {}  # {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...}

    for line in kb2ticket_gen:
        ticket_id = line[0]
        kb_id = line[1]

        ticket2kb_dict[ticket_id] = kb_id
    #############
    # kb2keywords_dict
    kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv",
                                              delimiter=";")  # "ArticleID";"Subject";"Keywords";...
    next(kb2keywords_gen, None)  # skip first

    kb2keywords_dict = {}

    for lino in kb2keywords_gen:
        kb_id = lino[0]
        kb2keywords_dict[kb_id] = []
        subject = lino[1]
        keywords = lino[2]
        keywords_list = [x.lower().strip() for x in map(replaceRockDots_lambda(), str(keywords).split(","))]

        if kb_keywords:
            for item in keywords_list:
                if item != "":
                    kb2keywords_dict[kb_id].append(item)
        else:
            kb2keywords_dict[kb_id].append(subject)

    # remove all empty items
    kb2keywords_dict = {k: v for k, v in kb2keywords_dict.items() if len(v) != 0}
    ###############
    # keywords2kb_dict
    keywords2kb_dict = {}

    for kb_id, lst in kb2keywords_dict.items():
        for l in lst:
            if l not in keywords2kb_dict.keys():
                keywords2kb_dict[l] = [kb_id]
            else:
                keywords2kb_dict[l].append(kb_id)
    ############
    # idea: topic_ID -> KB_ID -> keywords / subject -> llda

    # ticket2kb_dict   {'INC65627': 'KBA10044', 'INC66057': 'KBA10009', ...}
    # kb2keywords_dict {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...}
    # keywords2kb_dict {'unicard namensaenderung': ['KBA10276'], 'vpn': ['KBA10063'], 'outlook_exchange': ['KBA10181'], ...}

    # look for the keywords that are actually used
    used_keywords = []
    for doc in corpus:
        ticket_number = doc.metadata["TicketNumber"]
        kb_number = ticket2kb_dict.get(ticket_number, None)
        keywords = kb2keywords_dict.get(kb_number, None)

        if keywords and kb_number:
            used_keywords.append(list(map(normalize_str, keywords)))

    kb_entries_used = len(set(ticket2kb_dict.values()))
    print("kb_entries_used: {}".format(kb_entries_used))

    labelist = [item for sublist in used_keywords for item in sublist]
    labelist = list(set(labelist))
    print("len(labelist): {}".format(len(labelist)))

    labeldict = {k: v for v, k in enumerate(labelist)}
    labeldict_rev = {v: k for k, v in labeldict.items()}
    print("labeldict created")
    def genos_linos(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict):
        """Yield one labeled LLDA input line per document whose ticket has a KB entry with keywords/subject."""
        for doc in textacyCorpus:
            ticket_number = doc.metadata["TicketNumber"]
            kb_number = ticket2kb_dict.get(ticket_number, None)
            keywords = kb2keywords_dict.get(kb_number, None)

            if keywords and kb_number:
                label = ""
                for kw in keywords:
                    label = label + str(labeldict.get(normalize_str(str(kw)), len(labeldict))) + " "

                yield "[ " + label + "] " + doc.text
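    # Example of a generated multi-label line (the IDs of all KB keywords/subjects attached to the ticket
    # in brackets, followed by the document text; IDs and tokens are illustrative only):
    #   [ 12 7 ] betreff beschreibung loesung ...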
    line_gen = genos_linos(corpus, labeldict, ticket2kb_dict, kb2keywords_dict)

    textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)

    # wait for file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    logprint("")
    logprint("start LLDA:")

    # run JGibbsLLDA
    n_topics = len(labeldict)  # +1 #default-topic

    FNULL = open(os.devnull, 'w')  # suppress output
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
                       "tickets.gz",
                       "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
    subprocess.call(cmd_jgibbs_java, stdout=FNULL)

    # NOTE: the output files are hidden; they can be found in models/
    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")

    topic_regex = re.compile(r'Topic [0-9]*')
    #####################################
    # TODO save results in a file based on `results`
    result = []

    for line in output.splitlines():
        findall = topic_regex.findall(line)
        if len(findall) != 0:
            try:
                index = int(findall[0].split()[1])
                result.append("Topic {} {}:".format(index, labeldict_rev[index]))
            except Exception:
                result.append(line)
        else:
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results + ".txt")
    #####################################

    results = []
    res_dict = {}
    count = 0
    for line in output.splitlines():

        findall = topic_regex.findall(line)

        if len(findall) != 0:
            if len(res_dict) != 0:
                results.append(res_dict)  # append the previous dict to the list (it is complete at this point)

            index = int(findall[0].split()[1])
            res_dict = {index: str(labeldict_rev[index])}

        else:
            splitted = line.split()
            res_dict[splitted[0]] = float(splitted[1])

    if len(res_dict) != 0:
        results.append(res_dict)  # append the last one to the list

    # collect every term in the results into one list
    terms = []
    for res in results:
        for key, value in res.items():
            if not isinstance(key, int) and key not in terms:
                terms.append(key)

    term2id = {t: i for i, t in enumerate(terms)}  # and into a dict
    ################# termite plot #####################################################################
    topic_labels = list(range(len(labelist)))
    term_labels = list(range(len(term2id)))  # tuple([key for key in term2id.keys()])

    term_topic_weights = np.zeros((len(term2id), len(topic_labels)))

    for i, res in enumerate(results):
        for key, value in res.items():
            if not isinstance(key, int):
                term_topic_weights[term2id[key]][i] = value
                term_labels[term2id[key]] = key
            else:
                topic_labels[i] = labeldict_rev[key]

    draw.draw_termite(
        term_topic_weights, topic_labels, term_labels, save=path2save_results + ".png")

    end = time.time()
    logprint("Time Elapsed Topic Modeling JGibbsLLDA_KB: {0} min\n".format((end - start) / 60))
def main(use_cleaned=False, algorithm="llda"):
    logprint("Topic Modeling: {0}".format(datetime.now()))

    corpus_de_path = FILEPATH + config.get("de_corpus", "path")
    corpus_en_path = FILEPATH + config.get("en_corpus", "path")

    if use_cleaned:
        preCorpus_name = "de" + "_clean_ticket"
        resultspath = FILEPATH + "results/clean"
    else:
        preCorpus_name = "de" + "_pre_ticket"
        resultspath = FILEPATH + "results/pre"

    # load cleaned corpus
    de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
    logprint("Corpus loaded: {0}".format(de_corpus.lang))

    """
    ngrams = 1
    min_df = 1
    max_df = 1.0
    weighting = 'tf'
    # weighting = 'tfidf'
    named_entities = False

    printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
    printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
    printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=0.8, weighting=weighting)

    printvecotorization(de_corpus, ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting)
    printvecotorization(de_corpus, ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
    printvecotorization(de_corpus, ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
    """

    if algorithm == "llda":
        top_topic_words = 5

        path2save_results = resultspath + "_cat_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA_category(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        kb_keywords = False
        path2save_results = resultspath + "_kb_{}_{}_{}".format("keys" if kb_keywords else "subs", algorithm,
                                                                "top" + str(top_topic_words))
        jgibbsLLDA_KB(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                      kb_keywords=kb_keywords)

        kb_keywords = True
        path2save_results = resultspath + "_kb_{}_{}_{}".format("keys" if kb_keywords else "subs", algorithm,
                                                                "top" + str(top_topic_words))
        jgibbsLLDA_KB(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                      kb_keywords=kb_keywords)

        """
        top_topic_words = 10
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        top_topic_words = 15
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        top_topic_words = 20
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
        """
    else:
        textacyTopicModeling(ngrams=1,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=15,
                             corpus=de_corpus)
        """
        textacyTopicModeling(ngrams=1,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=20,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams=1,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=25,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams=1,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=30,
                             corpus=de_corpus)
        """

        textacyTopicModeling(ngrams=(1, 2),
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=15,
                             corpus=de_corpus)
        """
        textacyTopicModeling(ngrams=(1, 2),
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=20,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams=(1, 2),
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=25,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams=(1, 2),
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=30,
                             corpus=de_corpus)
        """
if __name__ == "__main__":
    main()