# -*- coding: utf-8 -*-
# topicModelingTickets/topicModeling.py
from datetime import datetime
import time
import csv
import sys
import json
import os
import os.path
import re
import subprocess

from textacy import Vectorizer, viz

from miscellaneous import *
import textacy
from scipy import *

# ConfigParser may also be re-exported by miscellaneous; the explicit import
# keeps this module self-contained (Python 3 configparser)
import configparser as ConfigParser

csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"


# load config
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
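
# config.ini is expected to provide the corpus locations used in main(), e.g.
# (assumed layout; only the section and option names are taken from the code):
#   [de_corpus]
#   path = corpi/de
#   [en_corpus]
#   path = corpi/en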


def label2ID(label, labeldict):
    return labeldict.get(label, len(labeldict))
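
# label2ID falls back to len(labeldict) for unknown labels, i.e. one ID past the
# known categories, which can serve as the ID of a default topic.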


def generate_labeled_lines(textacyCorpus, labeldict):
    for doc in textacyCorpus:
        # generate "[topicID] tok1 tok2 tok3" lines out of the corpus
        yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text


def printvectorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
    logprint("ngrams: {0}".format(ngrams))
    logprint("min_df: {0}".format(min_df))
    logprint("max_df: {0}".format(max_df))
    logprint("named_entities: {0}".format(named_entities))

    # vectorize corpus; materialize the per-document term lists, because a
    # generator would be exhausted by fit_transform and the print loop below
    # would then be skipped
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
    terms_list = [list(doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True))
                  for doc in de_corpus]
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.id_to_term
    for t in terms_list:
        print(t)

    logprint("doc_term_matrix: {0}".format(doc_term_matrix))
    logprint("id2term: {0}".format(id2term))


def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda', named_entities=False):
    logprint(
        "############################################ Topic Modeling {0} #############################################".format(
            topicModel))
    print("\n\n")
    logprint("ngrams: {0}".format(ngrams))
    logprint("min_df: {0}".format(min_df))
    logprint("max_df: {0}".format(max_df))
    logprint("n_topics: {0}".format(n_topics))
    logprint("named_entities: {0}".format(named_entities))

    start = time.time()

    top_topic_words = 7
    top_document_labels_per_topic = 5

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')
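    # (LDA models raw term counts, while LSA and NMF are usually fit on
    # tf-idf-weighted matrices - hence plain 'tf' only for 'lda')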

    ########################################
    # vectorize corpus
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.id_to_term

    ##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################

    # initialize and train a topic model
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)

    # transform the corpus and interpret the model
    doc_topic_matrix = model.transform(doc_term_matrix)
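    # doc_topic_matrix: shape (n_docs, n_topics); each row is one document's
    # topic distribution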

    print()
    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
        logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))

    print()
    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
        logprint(topic_idx)
        for j in top_docs:
            logprint(corpus[j].metadata['categoryName'])

    print()
    #####################################################################################################################
    print()
    print()

    # termite plot
    n_terms = int(n_topics * top_topic_words)
    sort_terms_by = 'seriation'  # 'seriation', 'weight', 'index', 'alphabetical'
    rank_terms_by = 'corpus'  # 'corpus', 'topic'

    model.termite_plot(doc_term_matrix, id2term,
                       n_terms=n_terms,
                       sort_terms_by=sort_terms_by,
                       rank_terms_by=rank_terms_by + '_weight',
                       save=FILEPATH + "results/{}_{}_{}_{}_{}.png".format(
                           topicModel, n_topics, n_terms, sort_terms_by, rank_terms_by))

    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}: {0} min\n\n".format((end - start) / 60, topicModel))


def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=False):
    ##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################

    start = time.time()

    # build dictionary of ticket categories
    labelist = []
    for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
        labelist.append(texdoc.metadata["categoryName"])

    labeldict = {k: v for v, k in enumerate(labelist)}
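    # e.g. (hypothetical categories) labelist = ["Drucker", "Mail"]
    #      -> labeldict = {"Drucker": 0, "Mail": 1}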

    if add_default_topic:
        n_topics = len(labeldict) + 1  # + 1 for a default topic
    else:
        n_topics = len(labeldict)

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"

    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
    dict_path = FILEPATH + "results/labeldict.txt"

    with open(dict_path, 'w') as file:
        file.write(json.dumps(labeldict))

    # create the input file for JGibbsLabeledLDA
    textacy.fileio.write_file_lines(generate_labeled_lines(corpus, labeldict), filepath=LLDA_filepath)

    # wait for the file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    logprint("")
    logprint("start LLDA:")

    # run the JGibbsLabeledLDA jar
    FNULL = open(os.devnull, 'w')  # suppress output
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
                       "tickets.gz",
                       "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
    subprocess.call(cmd_jgibbs_java, stdout=FNULL)
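    # the assembled command is equivalent to (with <root> = jgibbsLLDA_root):
    #   java -cp <root>lib/trove-3.0.3.jar:<root>lib/args4j-2.0.6.jar:<root>out/production/LabledLDA/ \
    #        jgibblda.LDA -est -dir <root>models/tickets -dfile tickets.gz \
    #        -twords <top_topic_words> -ntopics <n_topics>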

    # NOTE: the output files are hidden; they can be found in models/

    # read back the topic words ("twords") written by the java program
    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")

    reverse_labeldict = {v: k for k, v in labeldict.items()}
    result = []
    regex = re.compile(r'Topic [0-9]*')
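    # .twords lists each topic as a "Topic <n>:" header followed by its top words;
    # the loop below rewrites each header to include the original category name,
    # e.g. (hypothetical) "Topic 3:" -> "Topic 3 Drucker:"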
    for line in output.splitlines():
        findall = regex.findall(line)
        if findall:
            try:
                index = int(findall[0].split()[1])
                result.append("Topic {} {}:".format(index, reverse_labeldict[index]))
            except (IndexError, ValueError, KeyError):
                result.append(line)
        else:
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results)
    #####################################################################################################################

    # todo: llda termite plot; rough sketch of the pieces needed:
    """
    topic_inds = []  # e.g. [0, 1, 2, ..., n_topics - 1]

    # get topic and term labels
    topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)
    term_labels = tuple(id2term[term_ind] for term_ind in term_inds)

    # get topic-term weights to size the dots
    term_topic_weights = np.array([self.model.components_[topic_ind][term_inds]
                                   for topic_ind in topic_inds]).T
    viz.draw_termite_plot(
        term_topic_weights, topic_labels, term_labels, save=path2save_results)
    """

    logprint("")
    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA: {0} min\n\n".format((end - start) / 60))
def main(use_raw=False, algorithm="llda"):
2017-10-25 09:46:44 +02:00
logprint("Topic Modeling: {0}".format(datetime.now()))
2017-10-18 17:37:20 +02:00
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
2017-10-30 12:56:52 +01:00
2017-10-25 09:46:44 +02:00
if use_raw:
2017-10-30 12:56:52 +01:00
# fehler Unknown document label ( X ) for document 352.
2017-10-25 09:46:44 +02:00
preCorpus_name = "de" + "_raw_ticket"
2017-10-30 12:56:52 +01:00
resultspath = FILEPATH + "results/raw"
2017-10-25 09:46:44 +02:00
else:
preCorpus_name = "de" + "_pre_ticket"
2017-10-30 12:56:52 +01:00
resultspath = FILEPATH + "results/pre"
2017-10-18 17:37:20 +02:00
2017-10-25 09:46:44 +02:00
# load raw corpus and create new one
2017-10-18 17:37:20 +02:00
de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
2017-10-25 09:46:44 +02:00
logprint("Corpus loaded: {0}".format(de_corpus.lang))
2017-10-18 17:37:20 +02:00
2017-10-25 09:46:44 +02:00
# idee http://bigartm.org/
# idee http://wiki.languagetool.org/tips-and-tricks
# idee https://en.wikipedia.org/wiki/Noisy_text_analytics
# idee https://gate.ac.uk/family/
2017-10-18 17:37:20 +02:00
2017-11-03 11:49:26 +01:00
# todo llda topics zusammenfassen
# idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics
# frage lda wieviele tickets pro topic?
2017-10-25 09:46:44 +02:00
2017-10-30 12:56:52 +01:00
"""
2017-10-18 17:37:20 +02:00
ngrams = 1
min_df = 1
max_df = 1.0
weighting = 'tf'
# weighting ='tfidf'
named_entities = False
2017-10-30 12:56:52 +01:00
2017-10-18 17:37:20 +02:00
printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting)
2017-10-25 09:46:44 +02:00
2017-10-18 17:37:20 +02:00
printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
"""

    if algorithm == "llda":
        # sweep over the number of topic words and the optional default topic
        for top_topic_words, add_default_topic in [(5, False), (5, True), (10, False), (10, True)]:
            path2save_results = resultspath + "_{}_{}.txt".format("top" + str(top_topic_words),
                                                                  "wdef" if add_default_topic else "")
            jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                       add_default_topic=add_default_topic)
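        # e.g. top_topic_words = 5, add_default_topic = False and
        # resultspath = FILEPATH + "results/pre" write to "results/pre_top5_.txt"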

        # no_below = 20
        # no_above = 0.5
        # n_topics = len(LABELDICT)  # len(set(ticketcorpus[0].metadata.keys())) + 1  # + 1 for a default topic
    else:
        # build dictionary of ticket categories
        labelist = []
        for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
            labelist.append(texdoc.metadata["categoryName"])

        labeldict = {k: v for v, k in enumerate(labelist)}

        # sweep over unigrams/bigrams and the number of topics
        for ngrams in [1, (1, 2)]:
            for n_topics in [15, 20, 25, 30]:
                textacyTopicModeling(ngrams=ngrams,
                                     min_df=1,
                                     max_df=0.9,
                                     topicModel=algorithm,
                                     n_topics=n_topics,
                                     corpus=de_corpus)

        # further parameter combinations to try (n_topics is a placeholder here):
        """
        textacyTopicModeling(ngrams=(1, 2), min_df=1, max_df=0.8,
                             topicModel=algorithm, n_topics=n_topics, corpus=de_corpus)

        textacyTopicModeling(ngrams=1, min_df=0.1, max_df=0.6,
                             topicModel=algorithm, n_topics=n_topics, corpus=de_corpus)

        textacyTopicModeling(ngrams=(1, 2), min_df=1, max_df=1.0,
                             topicModel=algorithm, n_topics=n_topics, corpus=de_corpus)

        textacyTopicModeling(ngrams=(1, 2), min_df=0.1, max_df=0.6,
                             topicModel=algorithm, n_topics=n_topics, corpus=de_corpus)

        textacyTopicModeling(ngrams=(1, 2), min_df=0.2, max_df=0.8,
                             topicModel=algorithm, n_topics=20, corpus=de_corpus)
        """


if __name__ == "__main__":
    main()