commit before refactoring

jannis.grundmann 2017-11-03 11:49:26 +01:00
parent 6ea03b2f65
commit ecc8c0c54a
10 changed files with 148 additions and 97 deletions


@@ -41,8 +41,8 @@ filename=topicModelTickets.log
 [de_corpus]
-#input=M42-Export/Tickets_small.csv
+input=M42-Export/Tickets_small.csv
-input=M42-Export/de_tickets.csv
+#input=M42-Export/de_tickets.csv
 path=corpi/
@@ -64,7 +64,7 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI
 #ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC
-custom_words=geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
+custom_words=eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok

main.py

@@ -2,18 +2,21 @@
 import matplotlib
 matplotlib.use('Agg')
 import time
-import init
 import corporization
 import preprocessing
 import topicModeling
 import cleaning
-from miscellaneous import *
+from miscellaneous import *
 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
 start = time.time()
+import init
-init.main()
+#init.main()
 logprint("")
 corporization.main()
@@ -25,14 +28,32 @@ logprint("")
 preprocessing.main() # ~5h
 logprint("")
-#topicModeling.main(use_raw=False,algorithm="llda")
+"""
+topicModeling.main(use_raw=False,algorithm="lsa")
 logprint("")
-#topicModeling.main(use_raw=True)
 topicModeling.main(use_raw=False,algorithm="lda")
 logprint("")
+topicModeling.main(use_raw=False,algorithm="nmf")
+logprint("")
+topicModeling.main(use_raw=False,algorithm="llda")
+logprint("")
+"""
 logprint("")
 end = time.time()


@@ -154,18 +154,6 @@ def printRandomDoc(textacyCorpus):
 print()
-def corpus2Text(corpus):
-for doc in corpus:
-yield doc.text
-def corpus2Meta(corpus):
-for doc in corpus:
-yield doc.metadata
-def saveplaincorpustext(corpus,path):
-textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )
 def save_corpus(corpus, corpus_path, corpus_name):
 """
@@ -175,42 +163,22 @@ def save_corpus(corpus, corpus_path, corpus_name):
 :param corpus_name: str (should content the language like "_de_")
 """
-"""
-# save stringstore
-stringstore_path = corpus_path + corpus_name + '_strings.json'
-with open(stringstore_path, "w") as file:
-parser.vocab.strings.dump(file)
-#todo save vocab?
-"""
 # save parser
 parser = corpus.spacy_lang
 parserpath = corpus_path + str(parser.lang) + '_parser'
 parser.save_to_directory(parserpath)
-##
+# save plain content + meta
-# save content
-contentpath = corpus_path + corpus_name + "_content.bin"
-textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
-#save plain content
 plainpath = corpus_path + corpus_name + "_content.json"
-textacy.fileio.write_json_lines(({"index" : doc.corpus_index, "content" : doc.text} for doc in corpus), plainpath)
+textacy.fileio.write_json_lines(gen_dicts(corpus), plainpath)
-# save meta
-metapath = corpus_path + corpus_name + "_meta.json"
-#meta_gen = (doc.metadata.update({"index": doc.corpus_index}) for doc in corpus)
-meta_gen = gen_meta(corpus)
-textacy.fileio.write_json_lines(meta_gen, metapath)
-def gen_meta(corpus):
+def gen_dicts(corpus):
 for doc in corpus:
-meta = doc.metadata
+dict = {"index" : doc.corpus_index, "content" : doc.text}
-meta.update({"index": doc.corpus_index})
+dict.update(doc.metadata)
-yield meta
+yield dict
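Note on the save-side change above: content and metadata are no longer written to two files; each document becomes one flat JSON-lines record. A minimal sketch of that record layout, with purely hypothetical field values and plain json standing in for textacy.fileio.write_json_lines:

import json

def gen_dicts(corpus):
    # one flat record per document: corpus index, raw text, then all metadata keys
    for doc in corpus:
        record = {"index": doc.corpus_index, "content": doc.text}
        record.update(doc.metadata)
        yield record

# hypothetical stand-in objects, only to show the resulting JSON lines
class FakeDoc:
    def __init__(self, i, text, meta):
        self.corpus_index, self.text, self.metadata = i, text, meta

corpus = [FakeDoc(0, "Drucker im Raum 123 druckt nicht.",
                  {"TicketNumber": "T-0001", "categoryName": "drucker"})]
for record in gen_dicts(corpus):
    print(json.dumps(record))
    # {"index": 0, "content": "Drucker im Raum 123 druckt nicht.", "TicketNumber": "T-0001", "categoryName": "drucker"}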
@@ -233,7 +201,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
 # load parser
 parser = spacy.load(lang)
 stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
 with open(stringstorepath) as file:
 parser.vocab.strings.load(file)
@@ -244,46 +211,35 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
 #load corpus
 corpus = textacy.Corpus(parser)
-contentpath = corpus_path + corpus_name + "_content.bin"
 plainpath = corpus_path + corpus_name + "_content.json"
-metapath = corpus_path + corpus_name + "_meta.json"
-"""
-try:
-spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
-metadata_stream = textacy.fileio.read_json_lines(metapath)
-for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
-corpus.add_doc(
-textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
-except:
-"""
-# neu init!!
-#corpus = textacy.Corpus(parser)
 plain_stream = textacy.fileio.read_json_lines(plainpath) # yields {int : str}
-metadata_stream = textacy.fileio.read_json_lines(metapath)
-for plain, metadata in zip(plain_stream, metadata_stream):
-if plain["index"] == metadata["index"]:
-corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata))
-else:
-raise IndexError
+for plain in plain_stream:
+meta = {}
+for key,value in plain.items():
+if key != "content" and key != "index":
+meta[key] = value
+corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))
 return corpus, corpus.spacy_lang
+"""
+def corpus2Text(corpus):
+for doc in corpus:
+yield doc.text
+def corpus2Meta(corpus):
+for doc in corpus:
+yield doc.metadata
+def saveplaincorpustext(corpus,path):
+textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )
 def save_corpusV2(corpus, corpus_path, corpus_name):
-"""
-saves a textacy-corpus including spacy-parser
-:param corpus: textacy-Corpus
-:param corpus_path: str
-:param corpus_name: str (should content the language like "_de_")
-"""
 # save parser
 parser = corpus.spacy_lang
@@ -302,13 +258,7 @@ def save_corpusV2(corpus, corpus_path, corpus_name):
 file.write(json.dumps(doc.metadata))
 def load_corpusV2(corpus_path, corpus_name, lang="de"):
-"""
-Load textacy-Corpus including spacy-parser out from file
-:param corpus_path: str
-:param corpus_name: str (should content the language like "_de_")
-:param lang: str (language code) ir spacy.Language
-:return: texracy.Corpus, spacy.language
-"""
 # ckeck for language
 if "de_" in corpus_name:
@@ -359,5 +309,5 @@ def yield_fromdir(path,spacy_vocab=None,type=".pkl"):
 else:
 for filename in filelist:
 yield load_obj(path+filename)
+"""


@@ -9,7 +9,7 @@ import sys
 import json
 import os.path
 import subprocess
-from textacy import Vectorizer
+from textacy import Vectorizer, viz
 from miscellaneous import *
 import textacy
@@ -163,7 +163,8 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
 jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/"
 LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
-dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
+#dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
+dict_path = FILEPATH +"results/labeldict.txt".format(jgibbsLLDA_root)
 # printlog(str("LABELDICT: {0}".format(labeldict)))
 #logprint(str("LABELDICT-length: {0}".format(len(labeldict))))
@@ -243,6 +244,30 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
 textacy.fileio.write_file_lines(result, path2save_results)
 #####################################################################################################################
+#todo llda termite plot
+"""
+topic_inds=[] #<class 'list'>: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
+# get topic and term labels
+# <class 'tuple'>: ('topic 0', 'topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5', 'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10', 'topic 11', 'topic 12', 'topic 13', 'topic 14')
+topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)
+# <class 'tuple'>: ('hardware', 'raum', 'adresse', 'gebaeude', 'tu', 'uni', 'ticket', 'email', 'account', 'nummer', 'mail', 'outlook', 'karte', 'drucker', 'server', 'service', 'antwort', 'verbindung', 'herzliche', 'einrichten', 'vergessen', 'wenden', 'ews', 'anhang', 'form', 'konto', 'nachricht', 'unterstuetzung', 'passwort', 'unicard', 'semester', 'system', 'aenderung', 'rueckmeldung', 'meldung', 'zugreifen', 'login', 'adressat', 'sender', 'kurs', 'made', 'mittwoch', 'note', 'our', 'korrespondenz', 'unbeschadet', 'boss', 'unterrichten', 'telefax', 'zugang', 'probleme', 'zugriff', 'mitarbeiterin', 'internet', 'daten', 'anmeldung', 'aendern', 'unterschrift', 'loeschen', 'anmelden', 'datei', 'message', 'laptop', 'benoetigt', 'link', 'montag', 'programm', 'ordner', 'personal', 'rechner', 'veranstaltung', 'august', 'lizenz', 'anschluss', 'mitarbeiter', 'erwuenscht', 'umzug', 'pc', 'uniaccount', 'amt', 'fax', 'it', 'institut', 'nutzer', 'bild', 'type', 'prof', 'verantwortlicher', 'bemerkung', 'antragsteller', 'element', 'hahn', 'eintrag', 'telefonbuch', 'ansprechpartner', 'universitaet', 'physik', 'abteilung', 'fakultaet', 'software', 'dezernat', 'einrichtung', 'telefon', 'lehrstuhl', 'buero')
+term_labels = tuple(id2term[term_ind] for term_ind in term_inds)
+# get topic-term weights to size dots
+#[[ 0.02721858 -0.03898025 0.00047936 ..., 0.05862538 -0.07742336 0.04761928]
+# [ 0.14977875 -0.24192522 -0.00620335 ..., -0.0497216 0.08269951 -0.05715901]
+# [ 0.04977951 0.02296709 0.01214562 ..., 0.11444371 -0.15212482 0.21481788]
+# ...,
+# [
+term_topic_weights = np.array([self.model.components_[topic_ind][term_inds]
+for topic_ind in topic_inds]).T
+viz.draw_termite_plot(
+term_topic_weights, topic_labels, term_labels, save=path2save_results)
+"""
 logprint("")
 end = time.time()
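The fenced-off block above is a placeholder for the planned LLDA termite plot. A minimal sketch of how those pieces could be wired together, using the viz import added at the top of this file; the model object, its components_ matrix, id2term and term_inds in the commented snippet are assumptions, so the labels and weights below are synthetic, only to make the example self-contained.

import numpy as np
from textacy import viz

# hypothetical labels; in the real code they would come from the fitted topic model and id2term
topic_labels = tuple("topic {}".format(i) for i in range(3))
term_labels = ("drucker", "account", "mail", "raum", "server")

# synthetic topic-term weights, shape (n_terms, n_topics); the commented block builds this
# from model.components_[topic_ind][term_inds] for each topic and transposes it
term_topic_weights = np.random.rand(len(term_labels), len(topic_labels))

# same call shape as in the commented block: values matrix, column labels, row labels
viz.draw_termite_plot(term_topic_weights, topic_labels, term_labels, save="termite_sketch.png")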
@@ -275,9 +300,9 @@ def main(use_raw=False, algorithm="llda"):
 # idee https://en.wikipedia.org/wiki/Noisy_text_analytics
 # idee https://gate.ac.uk/family/
-# todo gescheites tf(-idf) maß finden
+# todo llda topics zusammenfassen
-# todo topics zusammenfassen
+# idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics
-# frage wieviele tickets pro topic?
+# frage lda wieviele tickets pro topic?
 """
 ngrams = 1
@@ -300,25 +325,25 @@ def main(use_raw=False, algorithm="llda"):
 if algorithm == "llda":
 top_topic_words = 5
 add_default_topic = False
-path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
 jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
 add_default_topic=add_default_topic)
 top_topic_words = 5
 add_default_topic = True
-path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
 jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
 add_default_topic=add_default_topic)
 top_topic_words = 10
 add_default_topic = False
-path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
 jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
 add_default_topic=add_default_topic)
 top_topic_words = 10
 add_default_topic = True
-path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
 jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
 add_default_topic=add_default_topic)
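The four near-identical LLDA runs above differ only in top_topic_words and add_default_topic; given the commit title ("commit before refactoring"), one possible consolidation is a plain sweep over those two parameters. This is only a sketch built from the calls shown above, not code from the repository:

# sketch: sweep over the two varying parameters instead of four copy-pasted blocks
for top_topic_words in (5, 10):
    for add_default_topic in (False, True):
        path2save_results = resultspath + "_{}_{}.txt".format(
            "top" + str(top_topic_words), "wdef" if add_default_topic else "")
        jgibbsLLDA(de_corpus, path2save_results=path2save_results,
                   top_topic_words=top_topic_words,
                   add_default_topic=add_default_topic)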
@@ -339,15 +364,70 @@ def main(use_raw=False, algorithm="llda"):
 labeldict = {k: v for v, k in enumerate(labelist)}
-n_topics = 15
 textacyTopicModeling(ngrams = 1,
 min_df = 1,
-max_df = 0.8,
+max_df = 0.9,
 topicModel = algorithm,
-n_topics =n_topics,
+n_topics =15,
 corpus=de_corpus)
+textacyTopicModeling(ngrams=1,
+min_df=1,
+max_df=0.9,
+topicModel=algorithm,
+n_topics=20,
+corpus=de_corpus)
+textacyTopicModeling(ngrams=1,
+min_df=1,
+max_df=0.9,
+topicModel=algorithm,
+n_topics=25,
+corpus=de_corpus)
+textacyTopicModeling(ngrams=1,
+min_df=1,
+max_df=0.9,
+topicModel=algorithm,
+n_topics=30,
+corpus=de_corpus)
+textacyTopicModeling(ngrams=(1, 2),
+min_df=1,
+max_df=0.9,
+topicModel=algorithm,
+n_topics=15,
+corpus=de_corpus)
+textacyTopicModeling(ngrams = (1,2),
+min_df = 1,
+max_df = 0.9,
+topicModel = algorithm,
+n_topics =20,
+corpus=de_corpus)
+textacyTopicModeling(ngrams = (1,2),
+min_df = 1,
+max_df = 0.9,
+topicModel = algorithm,
+n_topics =25,
+corpus=de_corpus)
+textacyTopicModeling(ngrams = (1,2),
+min_df = 1,
+max_df = 0.9,
+topicModel = algorithm,
+n_topics =30,
+corpus=de_corpus)
 """
 textacyTopicModeling(ngrams = (1,2),
 min_df = 1,