# topicModelingTickets/topicModeling.py
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('Agg')
from datetime import datetime
import draw
import draw1
import time
import numpy as np
import operator
import csv
import sys
import json
import re
import os.path
import subprocess
from textacy import Vectorizer, viz
from miscellaneous import *
import textacy
from scipy import *
import os
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# load config
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
config.read_file(f)
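# textacyTopicModeling: fits a textacy topic model ('lda', 'nmf' or 'lsa') on the corpus,
# logs the top terms per topic and the top document categories per topic, and saves a
# termite plot of the n_topics * top_topic_words strongest terms under results/.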
def textacyTopicModeling(corpus,
n_topics = 15, top_topic_words = 7, top_document_labels_per_topic = 5,
ngrams = 1, min_df=1, max_df=0.9,
topicModel='lda'):
n_terms = int(n_topics * top_topic_words)
sort_terms_by = 'seriation' # 'seriation', 'weight', 'index', 'alphabetical'
rank_terms_by = 'corpus' # 'corpus', 'topic'
logprint("#### Topic Modeling {0}".format(topicModel))
logprint("ngrams: {0}".format(ngrams))
logprint("min_df: {0}".format(min_df))
logprint("max_df: {0}".format(max_df))
logprint("n_topics: {0}".format(n_topics))
logprint("\n")
start = time.time()
# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
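# LDA is fit on raw term counts ('tf'); NMF/LSA usually work better on tf-idf weights,
# hence the weighting switch below.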
weighting = ('tf' if topicModel == 'lda' else 'tfidf')
###### vectorize corpus
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
#id2term = vectorizer.__getattribute__("id_to_term")
####### Initialize and train a topic model
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words, weights=True):
logprint('{0}: {1}'.format(topic_idx, str(top_terms)))
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
logprint(topic_idx)
for j in top_docs:
logprint(corpus[j].metadata['categoryName'])
####################### termite plot ###################################################################
grams_label = "uni" if ngrams == 1 else "bi"
draw1.termite_plot(model,doc_term_matrix, vectorizer.id_to_term,
n_terms=n_terms,
sort_terms_by=sort_terms_by,
rank_terms_by=rank_terms_by + '_weight',
save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label, topicModel, n_topics,
n_terms, sort_terms_by, rank_terms_by))
end = time.time()
logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
def textacyTopicModeling_v2(doc_term_matrix, id_to_term,
n_topics = 15, top_topic_words = 3,
topicModel='lda'):
n_terms = int(n_topics * top_topic_words)
sort_terms_by = 'seriation' # 'seriation', 'weight', 'index', 'alphabetical'
rank_terms_by = 'corpus' # 'corpus', 'topic'
logprint("#### Topic Modeling {0}".format(topicModel))
logprint(str("n_topics: {0}".format(n_topics)))
logprint("\n")
start = time.time()
# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
weighting = ('tf' if topicModel == 'lda' else 'tfidf')
####### Initialize and train a topic model
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)
for topic_idx, top_terms in model.top_topic_terms(id_to_term, top_n=top_topic_words, weights=True):
logprint('{0}: {1}'.format(topic_idx, str(top_terms)))
####################### termite plot ###################################################################
draw1.termite_plot(model,doc_term_matrix, id_to_term,
n_terms=n_terms,
sort_terms_by=sort_terms_by,
rank_terms_by=rank_terms_by + '_weight',
save=FILEPATH + "results/{}.png".format(topicModel))
end = time.time()
logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
def create_ticket2label_dict(ticket2chunk_dict,corpus):
"""
Creates a dictionary to map a TicketNumber to a label
:param ticket2chunk_dict: e.g. { TicketNumber : KB_entries }
:param corpus: textacy corpus whose docs carry a "TicketNumber" in their metadata
:return: { TicketNumber : label }
"""
labelist = ticket2chunk_dict.values()
labelist = flatten(labelist)
labeldict = create_labeldict(labelist, min_label_freq=1, add_default_label=True)
ticket2label = {}
for doc in corpus:
ticketID = doc.metadata["TicketNumber"]
keywords = ticket2chunk_dict.get(ticketID, ['DEFAULT'])
label = ""
for kw in keywords:
label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " "
ticket2label.update({ticketID: label})
return ticket2label
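# Illustrative result (the label ids depend on the generated labeldict):
# { 'INC44526' : '0 ', 'INC44392' : '5 2 ' }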
def create_labeldict(labelist, min_label_freq=1, add_default_label=True):
# keep only labels that occur at least min_label_freq times
labelist = [l for l in labelist if labelist.count(l) >= min_label_freq]
in_labelist_ = {k: labelist.count(k) for k in labelist} # { label1 : 3 , label2 : 5, label3 : 1 }
labelist = sort_dictionary(in_labelist_) # [ (label3, 1), (label1, 3), (label2, 5) ]
labelist.reverse() # [ (label2, 5), (label1, 3), (label3, 1) ]
labeldict = {elem[0]: i for i, elem in enumerate(labelist)} # { label2 : 0, label1 : 1 , label3 : 2 }
if add_default_label:
if 'DEFAULT' not in labeldict.keys():
labeldict.update({'DEFAULT': len(labelist)}) # { label2 : 0, label1 : 1 , label3 : 2 , DEFAULT : 3 }
return labeldict
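# Illustrative example (assuming sort_dictionary() sorts by count in ascending order, as the
# reverse() above suggests):
# create_labeldict(['vpn', 'vpn', 'unicard']) -> {'vpn': 0, 'unicard': 1, 'DEFAULT': 2}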
#todo
def jgibbsLLDAv2(labeled_lines_path,ticket2kbs_dict, cleaned_corpus, path2save_results, top_topic_words=7):
ticket2label_dict = create_ticket2label_dict(ticket2kbs_dict, cleaned_corpus)
# reduce ticket2label_dict
labeldict = {}
label_list = list(set(ticket2label_dict.values()))
lbl_dict = {elem : i for i,elem in enumerate(label_list)}
labeldict = {k : lbl_dict[v] for k,v in ticket2label_dict.items()}
labeldict.update({"DEFAULT" : len(labeldict)})
def gen_lines_from_labeled_lines(input,ticket2label_dict):
line_gen = textacy.fileio.read_file_lines(input)
for line in line_gen:
label = re.findall(r'\[(.*?)\]',line)
new_label = "[ "
for lbl in label:
new_label = new_label + str(ticket2label_dict.get(str(lbl),"")).strip() + " "
new_label = new_label + "] "
result = new_label + str(line.rpartition("]")[2])
# new_label = str([ticket2label_dict.get(str(lbl),"") for lbl in label])
# result = "[ " + new_label + " ] " + line.rpartition("]")[2]
#print(result)
yield result
labeldict_rev = {v: k for k, v in labeldict.items()}
#line_gen = gen_lines_from_labeled_lines(labeled_lines_path,ticket2label_dict)
line_gen = gen_lines_from_labeled_lines(labeled_lines_path,labeldict)
jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)
# wait for file to exist
while not os.path.exists(LLDA_filepath):
time.sleep(1)
# run JGibbsLLDA file
n_topics = len(labeldict) #+1 #default-topic
FNULL = open(os.devnull, 'w') # suppress output
cmd_jgibbs_java = ["java", "-cp",
"{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
jgibbsLLDA_root),
"jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
"tickets.gz",
"-twords", str(top_topic_words), "-ntopics", str(n_topics)]
subprocess.call(cmd_jgibbs_java, stdout=FNULL)
# NOTE: the output files are hidden dot-files; they can be found in models/
cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
output = subprocess.check_output(cmd_gzip).decode("utf-8")
topic_regex = re.compile(r'Topic [0-9]*')
#####################################
# todo: save results to a file based on the results list built below
result = []
for line in output.splitlines():
findall = topic_regex.findall(line)
if len(findall) != 0:
try:
index = int(findall[0].split()[1])
result.append("Topic {} {}:".format(index, str(ticket2kbs_dict[labeldict_rev[index]])))
except Exception:
result.append(line)
else:
result.append(line)
textacy.fileio.write_file_lines(result, path2save_results+".txt")
#####################################
results = []
res_dict = {}
count =0
for line in output.splitlines():
findall = topic_regex.findall(line)
if len(findall) != 0:
if len(res_dict) != 0:
results.append(res_dict) # append the previous topic dict (it is complete at this point)
index = int(findall[0].split()[1])
res_dict = {index : str(labeldict_rev[index]) }
else:
splitted = line.split()
res_dict[splitted[0]] = float(splitted[1])
if len(res_dict) != 0:
results.append(res_dict) # append the last topic dict
# collect every term in the results into a list
terms=[]
for res in results:
for key,value in res.items():
if not isinstance(key, int) and key not in terms:
terms.append(key)
term2id = {t:i for i,t in enumerate(terms)} #and to dict
################# termite plot #####################################################################
topic_labels = list(range(len(labeldict)))
term_labels = list(range(len(term2id))) #tuple([key for key in term2id.keys()])
term_topic_weights = np.zeros((len(term2id),len(topic_labels)))
for i,res in enumerate(results):
for key,value in res.items():
if not isinstance(key, int):
term_topic_weights[term2id[key]][i] = value
term_labels[term2id[key]] = key
else:
topic_labels[i] = labeldict_rev[key]
draw.draw_termite(
term_topic_weights, topic_labels, term_labels, save=path2save_results+".png")
draw.draw_termite(
term_topic_weights, topic_labels, term_labels, save=path2save_results+"_spaced.png",pow_x=0.78,pow_y=0.87)
# save labeldict
labeldict_path = path2save_results + "_labeldict.json"
with open(labeldict_path, 'w') as file:
file.write(json.dumps(labeldict))
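# jgibbsLLDA: generic Labeled-LDA runner. Writes the labeled lines to
# java_LabledLDA/models/tickets/tickets.gz, runs the JGibbsLLDA Java implementation via
# subprocess, parses the resulting .twords term/weight output, and saves a textual summary,
# two termite plots and the labeldict (as JSON) next to path2save_results.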
def jgibbsLLDA(labeldict,line_gen,path2save_results, top_topic_words=7):
#labeldict = {k : labelist.count(k) for k in labelist}
#max=0
#for v in labeldict.values():
# max = v if v > max else max
#labelist = sort_dictionary(labeldict)
#labeldict.update({'DEFAULT' : max+1})
labeldict_rev = {v: k for k, v in labeldict.items()}
jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)
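# Each training line is expected in the form "[ <label id(s)> ] tok1 tok2 tok3 ...",
# as produced by the gen_*_lines generators of the calling functions below.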
# wait for file to exist
while not os.path.exists(LLDA_filepath):
time.sleep(1)
# run JGibbsLLDA file
n_topics = len(labeldict) #+1 #default-topic
FNULL = open(os.devnull, 'w') # suppress output
cmd_jgibbs_java = ["java", "-cp",
"{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
jgibbsLLDA_root),
"jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
"tickets.gz",
"-twords", str(top_topic_words), "-ntopics", str(n_topics)]
subprocess.call(cmd_jgibbs_java, stdout=FNULL)
# NOTE: the output files are hidden dot-files; they can be found in models/
cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
output = subprocess.check_output(cmd_gzip).decode("utf-8")
topic_regex = re.compile(r'Topic [0-9]*')
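# The decompressed .twords output is parsed line by line below: a header line containing
# "Topic <N>" starts a new topic, every other line is assumed to be a "<term> <weight>"
# pair, e.g. (illustrative values):
#   Topic 0:
#   vpn 0.0931
#   unicard 0.0712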
#####################################
# todo: save results to a file based on the results list built below
result = []
for line in output.splitlines():
findall = topic_regex.findall(line)
if len(findall) != 0:
try:
index = int(findall[0].split()[1])
result.append("Topic {} {}:".format(index, labeldict_rev[index]))
except Exception:
result.append(line)
else:
result.append(line)
textacy.fileio.write_file_lines(result, path2save_results+".txt")
#####################################
results = []
res_dict = {}
count =0
for line in output.splitlines():
findall = topic_regex.findall(line)
if len(findall) != 0:
if len(res_dict) != 0:
results.append(res_dict) # append the previous topic dict (it is complete at this point)
index = int(findall[0].split()[1])
res_dict = {index : str(labeldict_rev[index]) }
else:
splitted = line.split()
res_dict[splitted[0]] = float(splitted[1])
if len(res_dict) != 0:
results.append(res_dict) # append the last topic dict
# collect every term in the results into a list
terms=[]
for res in results:
for key,value in res.items():
if not isinstance(key, int) and key not in terms:
terms.append(key)
term2id = {t:i for i,t in enumerate(terms)} #and to dict
################# termite plot #####################################################################
topic_labels = list(range(len(labeldict)))
term_labels = list(range(len(term2id))) #tuple([key for key in term2id.keys()])
term_topic_weights = np.zeros((len(term2id),len(topic_labels)))
for i,res in enumerate(results):
for key,value in res.items():
if not isinstance(key, int):
term_topic_weights[term2id[key]][i] = value
term_labels[term2id[key]] = key
else:
topic_labels[i] = labeldict_rev[key]
draw.draw_termite(
term_topic_weights, topic_labels, term_labels, save=path2save_results+".png")
draw.draw_termite(
term_topic_weights, topic_labels, term_labels, save=path2save_results+"_spaced.png",pow_x=0.78,pow_y=0.87)
# save labeldict
labeldict_path = path2save_results + "_labeldict.json"
with open(labeldict_path, 'w') as file:
file.write(json.dumps(labeldict))
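# jgibbsLLDA_category: builds a labeldict from the ticket category names (keeping only
# categories that occur more than 50 times), generates "[ label ] text" lines and runs
# jgibbsLLDA on them.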
def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
start = time.time()
logprint("")
logprint("start Category-LLDA:")
# labeldict ############################################################################################
# build dictionary of ticketcategories
labelist = []
for doc in corpus:
category = normalize_str(doc.metadata["categoryName"])
labelist.append(category)
x = 50 # question: keep only labels that occur more than x times and drop the rest?
labelist = [l for l in labelist if labelist.count(l) > x ]
in_labelist_ = {k: labelist.count(k) for k in labelist}
labelist = sort_dictionary(in_labelist_)
labelist.reverse()
labeldict = {elem[0] : i for i, elem in enumerate(labelist)}
#for elem in labelist:
# l = elem[0]
# c = elem[1]
#labeldict = {elem[0] : len(labelist)-(i+1) for i, elem in enumerate(labelist)}
#labelist = list(set(labelist))
#labeldict = {k: v for v, k in enumerate(labelist)}
labeldict.update({'DEFAULT': len(labelist)})
##############################################################################################
def gen_cat_lines(textacyCorpus, labeldict):
""" generates [topic1, topic2....] tok1 tok2 tok3 out of corpi"""
for doc in textacyCorpus:
label = labeldict.get(normalize_str(doc.metadata["categoryName"]), labeldict['DEFAULT'])
if label != labeldict['DEFAULT']: # skip documents whose category only maps to the DEFAULT label
yield "[ " + str(label) + " ] " + doc.text
line_gen = gen_cat_lines(corpus, labeldict)
path2save_results = path2save_results + "_kb_cat_llda_{}".format("top" + str(top_topic_words))
jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words)
end = time.time()
logprint("\n\n\nTime Elapsed Category-LLDA :{0} min\n\n".format((end - start) / 60))
@deprecated
def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=False):
"""ticket_ID -> KB_ID -> keywords / subject -> llda"""
start = time.time()
logprint("")
logprint("start {}-LLDA:".format("Keyword" if kb_keywords else "Subject"))
# labeldict ############################################################################################
# ticket2kb_dict
kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
ticket2kb_dict = {}
for line in kb2ticket_gen:
ticket_id = line[0]
kb_id = line[1]
ticket2kb_dict[ticket_id] = kb_id
# {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...}
kb_entries_used = len(list(set(ticket2kb_dict.values())))
print("kb_entries_used: {}".format(kb_entries_used))
# kb2keywords_dict
kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
next(kb2keywords_gen,None) #skip first line("ArticleID";"Subject";"Keywords";...)
kb2keywords_dict = {}
for line in kb2keywords_gen:
kb_id = line[0]
subject = line[1]
keywords = line[2]
keywords_list = [normalize_str(x) for x in str(keywords).split(",")]
if kb_id not in kb2keywords_dict.keys():
kb2keywords_dict[kb_id] = []
if kb_keywords:
for item in keywords_list:
if item != "":
kb2keywords_dict[kb_id].append(item)
else:
kb2keywords_dict[kb_id].append(subject)
#remove all empty items
kb2keywords_dict = { k : v for k,v in kb2keywords_dict.items() if len(v) != 0}
# {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...}
#keywords2kb_dict
keywords2kb_dict = {}
for kb_id, lst in kb2keywords_dict.items():
for l in lst:
if l not in keywords2kb_dict.keys():
keywords2kb_dict[l] = [kb_id]
else:
keywords2kb_dict[l].append(kb_id)
# {'unicard namensaenderung': ['KBA10276'], 'vpn': ['KBA10063'], 'outlook_exchange': ['KBA10181'], ...}
# Look for actually used keywords
used_keywords = []
for doc in corpus:
ticket_number = doc.metadata["TicketNumber"]
kb_id = ticket2kb_dict.get(ticket_number, None)
keywords = kb2keywords_dict.get(kb_id, None)
if keywords and kb_id:
used_keywords.append(list(map(normalize_str, keywords)))
labelist = [item for sublist in used_keywords for item in sublist] #flatten list
labelist = list(set(labelist))
print("len(labelist): {}".format(len(labelist)))
labeldict = {k: v for v, k in enumerate(labelist)}
##############################################################################################
def gen_KB_lines(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict):
for doc in corpus:
ticket_number = doc.metadata["TicketNumber"]
kb_number = ticket2kb_dict.get(ticket_number, None)
keywords = kb2keywords_dict.get(kb_number, None)
if keywords:
label = ""
for kw in keywords:
label = label + str(labeldict.get(normalize_str(str(kw)), len(labeldict))) + " "
yield "[ " + label + "] " + doc.text
line_gen = gen_KB_lines(corpus, labeldict, ticket2kb_dict, kb2keywords_dict)
path2save_results = path2save_results + "_kb_{}_llda_{}".format("keys" if kb_keywords else "subs",
"top" + str(top_topic_words))
jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words)
end = time.time()
logprint("\n\n\nTime Elapsed {1}-LLDA :{0} min\n\n".format((end - start) / 60,"Keyword" if kb_keywords else "Subject"))
def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
start = time.time()
logprint("")
logprint("start LLDA:")
# labeldict ############################################################################################
# kb2keywords_dict / kb2subjects_dict --> {str : [str]}
kb2keywords_dict = {}
kb2subjects_dict = {}
kb_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
next(kb_gen, None) # skip first line "ArticleID";"Subject";"Keywords";...
for line in kb_gen:
kb_id = line[0]
subject = normalize_str(line[1])
keywords = [normalize_str(x) for x in str(line[2]).split(",")]
if kb_id not in kb2keywords_dict.keys():
kb2keywords_dict[kb_id] = keywords if keywords != [''] else ["DEFAULT"]
else:
kb2keywords_dict[kb_id] = kb2keywords_dict[kb_id] + keywords
if kb_id not in kb2subjects_dict.keys():
kb2subjects_dict[kb_id] = [subject if subject != '' else "DEFAULT"] # subject is already normalized above
else:
kb2subjects_dict[kb_id].append(normalize_str(subject))
# ticket2kbs_dict --> {str : [str]}
ticket2kbs_dict = {}
kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
next(kb2ticket_gen, None) # skip first line"TicketNumber";"ArticleID"
for line in kb2ticket_gen:
ticket_id = line[0]
kb_id = line[1]
if ticket_id not in ticket2kbs_dict.keys():
ticket2kbs_dict[ticket_id] = [kb_id]
else:
ticket2kbs_dict[ticket_id].append(kb_id)
# ticket2keywords --> {str:[str]}
ticket2keywords_dict = {}
for ticket_id, kb_ids in ticket2kbs_dict.items():
if ticket_id not in ticket2keywords_dict.keys():
ticket2keywords_dict[ticket_id] = []
for kb_id in kb_ids:
ticket2keywords_dict[ticket_id].append(kb2keywords_dict[kb_id])
ticket2keywords_dict[ticket_id] = flatten(ticket2keywords_dict[ticket_id])
# ticket2subjects --> {str:[str]}
ticket2subjects_dict = {}
for ticket_id, kb_ids in ticket2kbs_dict.items():
if ticket_id not in ticket2subjects_dict.keys():
ticket2subjects_dict[ticket_id] = []
for kb_id in kb_ids:
ticket2subjects_dict[ticket_id].append(kb2subjects_dict[kb_id])
ticket2subjects_dict[ticket_id] = flatten(ticket2subjects_dict[ticket_id])
# kb2keywords_dict {'KBA10230': ['DEFAULT'], 'KBA10129': ['DEFAULT'], 'KBA10287': ['sd_ansys_informationen'], } len = 260
# kb2subjects_dict {'KBA10230': ['unicard nochmal beantragen'], 'KBA10129': ['sd_entsperrung unicard nach verlust/wiederfinden'], } len = 260
# ticket2kbs_dict {'INC44526': ['KBA10056'], 'INC67205': ['KBA10056'], } len = 4832
# ticket2keywords_dict {'INC44526': ['DEFAULT'], 'INC67205': ['DEFAULT'], 'INC71863': ['DEFAULT'], 'INC44392': ['asknet'] } len=4832
# ticket2subjects_dict {'INC44526': ['sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)'], len=4832
count_dict = {}
for v in ticket2kbs_dict.values():
for kb in v:
if kb in count_dict.keys():
count_dict[kb] +=1
else:
count_dict[kb] = 1
sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
"""
for k,v in sorted_dict:
subs = kb2subjects_dict[k]
keys = kb2keywords_dict[k]
print(subs, keys, v) # question: how many tickets per topic?
print("kb_entries used: {}".format(len(sorted_dict))) # question: how many KB entries are used in total?: 155
"""
labelist = ticket2keywords_dict.values()
labelist = flatten(labelist)
labelist = list(set(labelist))
labeldict = {k: v for v, k in enumerate(labelist)}
##############################################################################################
def gen_key_lines(textacyCorpus, labeldict, ticket2keywords_dict):
for doc in corpus:
ticket_number = doc.metadata["TicketNumber"]
keywords = ticket2keywords_dict.get(ticket_number, ['DEFAULT'])
if keywords != ['DEFAULT']:
label = ""
for kw in keywords:
label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " "
yield "[ " + label + "] " + doc.text
keys_line_gen = gen_key_lines(corpus, labeldict, ticket2keywords_dict)
path2save_keys_results = path2save_results + "_kb_keys_llda_{}".format("top" + str(top_topic_words))
jgibbsLLDA(labeldict, keys_line_gen, path2save_keys_results, top_topic_words=top_topic_words)
labelist = ticket2subjects_dict.values()
labelist = flatten(labelist)
labelist = list(set(labelist))
labeldict = {k: v for v, k in enumerate(labelist)}
labeldict.update({'DEFAULT' : len(labeldict)})
subj_line_gen = gen_key_lines(corpus, labeldict, ticket2subjects_dict)
path2save_subj_results = path2save_results + "_kb_subj_llda_{}".format("top" + str(top_topic_words))
jgibbsLLDA(labeldict, subj_line_gen, path2save_subj_results, top_topic_words=top_topic_words)
end = time.time()
logprint("\n\n\nTime Elapsed LLDA :{0} min\n\n".format((end - start) / 60))
def load_from_labled_lines(path):
path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/pre_labled_lines_wo_lemma_061217.txt"
#idea
# load the cleaned corpus and the pre-processed corpus
# unigrams and number/word bigrams in the doc-term matrix # question: how does LLDA handle bigrams? idea: join bigrams with "_" # only keep ngrams where at least one token occurs in the pre corpus
def main(cleaned_corpus, pre_corpus, algorithm="llda"):
logprint("Topic Modeling: {0}".format(datetime.now()))
#todo load from labeled_lines ??
#idea apply the thesaurus before id2term
#todo keep acronyms & abbreviations
#todo build bigrams on cleaned, not on pre
#todo keep numbers; bigrams: NUM-word combinations
#todo Levenshtein/Hamming distance instead of autocorrect #idea or word2vec
#todo also include the ticket subject
resultspath = FILEPATH + "results/pre"
de_corpus = pre_corpus
if algorithm == "llda":
top_topic_words = 3
jgibbsLLDA_category(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)
jgibbsLLDA_KB_v2(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)
"""
kb_keywords = False
jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
kb_keywords = True
jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)
top_topic_words = 10
path2save_results = resultspath + "_{}_{}".format(algorithm,"top"+str(top_topic_words))
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
top_topic_words = 15
path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
top_topic_words = 20
path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
"""
else:
textacyTopicModeling(ngrams = 1,
topicModel = algorithm,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams=1,
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=20,
corpus=de_corpus)
textacyTopicModeling(ngrams=1,
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=25,
corpus=de_corpus)
textacyTopicModeling(ngrams=1,
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=30,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams=(1, 2),
topicModel=algorithm,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.9,
topicModel = algorithm,
n_topics =20,
corpus=de_corpus)
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.9,
topicModel = algorithm,
n_topics =25,
corpus=de_corpus)
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.9,
topicModel = algorithm,
n_topics =30,
corpus=de_corpus)
"""
if __name__ == "__main__":
# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
pre_corpus_name = "de" + "_pre"
pre_corpus, parser = load_corpus(corpus_name=pre_corpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(pre_corpus_name))
cleaned_corpus_name = "de" + "_raw"
#cleaned_corpus, parser = load_corpus(corpus_name=cleaned_corpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(cleaned_corpus_name))
cleaned_corpus = None
main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="llda")
main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="lda")