commit before refactoring

jannis.grundmann 2017-11-03 11:49:26 +01:00
parent 6ea03b2f65
commit ecc8c0c54a
10 changed files with 148 additions and 97 deletions

@ -41,8 +41,8 @@ filename=topicModelTickets.log
[de_corpus]
#input=M42-Export/Tickets_small.csv
input=M42-Export/de_tickets.csv
input=M42-Export/Tickets_small.csv
#input=M42-Export/de_tickets.csv
path=corpi/
@ -64,7 +64,7 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI
#ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC
custom_words=geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
custom_words=eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
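# A minimal sketch (not the project's actual loader) of how the [de_corpus]
# options above might be consumed, assuming Python's standard configparser;
# the filename "config.ini" is hypothetical:
from configparser import ConfigParser

config = ConfigParser()
config.read("config.ini")

input_csv = config.get("de_corpus", "input")           # e.g. M42-Export/Tickets_small.csv
corpus_path = config.get("de_corpus", "path")          # e.g. corpi/
custom_words = config.get("de_corpus", "custom_words").split(",")  # extra stop words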

main.py
@ -2,18 +2,21 @@
import matplotlib
matplotlib.use('Agg')
import time
import init
import corporization
import preprocessing
import topicModeling
import cleaning
from miscellaneous import *
from miscellaneous import *
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
start = time.time()
import init
init.main()
#init.main()
logprint("")
corporization.main()
@ -25,14 +28,32 @@ logprint("")
preprocessing.main() # ~5h
logprint("")
#topicModeling.main(use_raw=False,algorithm="llda")
"""
topicModeling.main(use_raw=False,algorithm="lsa")
logprint("")
#topicModeling.main(use_raw=True)
topicModeling.main(use_raw=False,algorithm="lda")
logprint("")
topicModeling.main(use_raw=False,algorithm="nmf")
logprint("")
topicModeling.main(use_raw=False,algorithm="llda")
logprint("")
"""
logprint("")
end = time.time()
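# Sketch, not part of the diff: the start/end timestamps above are presumably
# used to log the total pipeline runtime, roughly like this:
logprint("Total runtime: {} minutes".format(round((end - start) / 60, 2)))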

@ -154,18 +154,6 @@ def printRandomDoc(textacyCorpus):
print()
def corpus2Text(corpus):
for doc in corpus:
yield doc.text
def corpus2Meta(corpus):
for doc in corpus:
yield doc.metadata
def saveplaincorpustext(corpus,path):
textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )
def save_corpus(corpus, corpus_path, corpus_name):
"""
@ -175,42 +163,22 @@ def save_corpus(corpus, corpus_path, corpus_name):
:param corpus_name: str (should contain the language like "_de_")
"""
"""
# save stringstore
stringstore_path = corpus_path + corpus_name + '_strings.json'
with open(stringstore_path, "w") as file:
parser.vocab.strings.dump(file)
#todo save vocab?
"""
# save parser
parser = corpus.spacy_lang
parserpath = corpus_path + str(parser.lang) + '_parser'
parser.save_to_directory(parserpath)
##
# save content
contentpath = corpus_path + corpus_name + "_content.bin"
textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
#save plain content
# save plain content + meta
plainpath = corpus_path + corpus_name + "_content.json"
textacy.fileio.write_json_lines(({"index" : doc.corpus_index, "content" : doc.text} for doc in corpus), plainpath)
# save meta
metapath = corpus_path + corpus_name + "_meta.json"
#meta_gen = (doc.metadata.update({"index": doc.corpus_index}) for doc in corpus)
meta_gen = gen_meta(corpus)
textacy.fileio.write_json_lines(meta_gen, metapath)
textacy.fileio.write_json_lines(gen_dicts(corpus), plainpath)
def gen_meta(corpus):
def gen_dicts(corpus):
for doc in corpus:
meta = doc.metadata
meta.update({"index": doc.corpus_index})
yield meta
dict = {"index" : doc.corpus_index, "content" : doc.text}
dict.update(doc.metadata)
yield dict
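# Sketch of one record yielded by gen_dicts() and written via
# write_json_lines(gen_dicts(corpus), plainpath): content and metadata now share
# a single JSON-lines file instead of separate _content.json / _meta.json files.
# Values are placeholders; the metadata keys follow the configured metaliste.
example_record = {
    "index": 0,              # corpus position of the doc
    "content": "...",        # plain text of the ticket
    "TicketNumber": "...",   # metadata keys merged in via dict.update(doc.metadata)
    "Subject": "...",
    "categoryName": "...",
}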
@ -233,7 +201,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
# load parser
parser = spacy.load(lang)
stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
with open(stringstorepath) as file:
parser.vocab.strings.load(file)
@ -244,46 +211,35 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
#load corpus
corpus = textacy.Corpus(parser)
contentpath = corpus_path + corpus_name + "_content.bin"
plainpath = corpus_path + corpus_name + "_content.json"
metapath = corpus_path + corpus_name + "_meta.json"
"""
try:
spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
metadata_stream = textacy.fileio.read_json_lines(metapath)
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
corpus.add_doc(
textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
except:
"""
# re-initialize!!
#corpus = textacy.Corpus(parser)
plain_stream = textacy.fileio.read_json_lines(plainpath) # yields {int : str}
metadata_stream = textacy.fileio.read_json_lines(metapath)
for plain, metadata in zip(plain_stream, metadata_stream):
if plain["index"] == metadata["index"]:
corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata))
else:
raise IndexError
for plain in plain_stream:
meta = {}
for key,value in plain.items():
if key != "content" and key != "index":
meta[key] = value
corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))
return corpus, corpus.spacy_lang
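# Sketch of a save/load round trip with the two functions above; the path and
# corpus name are hypothetical (per the docstring the name should contain the
# language, e.g. "_de_"):
save_corpus(corpus, corpus_path="corpi/", corpus_name="de_corpus")
corpus, parser = load_corpus(corpus_path="corpi/", corpus_name="de_corpus", lang="de")
printRandomDoc(corpus)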
"""
def corpus2Text(corpus):
for doc in corpus:
yield doc.text
def corpus2Meta(corpus):
for doc in corpus:
yield doc.metadata
def saveplaincorpustext(corpus,path):
textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )
def save_corpusV2(corpus, corpus_path, corpus_name):
"""
saves a textacy-corpus including spacy-parser
:param corpus: textacy-Corpus
:param corpus_path: str
:param corpus_name: str (should contain the language like "_de_")
"""
# save parser
parser = corpus.spacy_lang
@ -302,13 +258,7 @@ def save_corpusV2(corpus, corpus_path, corpus_name):
file.write(json.dumps(doc.metadata))
def load_corpusV2(corpus_path, corpus_name, lang="de"):
"""
Load a textacy corpus, including its spacy parser, from file
:param corpus_path: str
:param corpus_name: str (should contain the language like "_de_")
:param lang: str (language code) or spacy.Language
:return: textacy.Corpus, spacy.language
"""
# check for language
if "de_" in corpus_name:
@ -359,5 +309,5 @@ def yield_fromdir(path,spacy_vocab=None,type=".pkl"):
else:
for filename in filelist:
yield load_obj(path+filename)
"""

@ -9,7 +9,7 @@ import sys
import json
import os.path
import subprocess
from textacy import Vectorizer
from textacy import Vectorizer, viz
from miscellaneous import *
import textacy
@ -163,7 +163,8 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
#dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
dict_path = FILEPATH +"results/labeldict.txt".format(jgibbsLLDA_root)
# printlog(str("LABELDICT: {0}".format(labeldict)))
#logprint(str("LABELDICT-length: {0}".format(len(labeldict))))
@ -243,6 +244,30 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
textacy.fileio.write_file_lines(result, path2save_results)
#####################################################################################################################
#todo llda termite plot
"""
topic_inds=[] #<class 'list'>: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
# get topic and term labels
# <class 'tuple'>: ('topic 0', 'topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5', 'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10', 'topic 11', 'topic 12', 'topic 13', 'topic 14')
topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)
# <class 'tuple'>: ('hardware', 'raum', 'adresse', 'gebaeude', 'tu', 'uni', 'ticket', 'email', 'account', 'nummer', 'mail', 'outlook', 'karte', 'drucker', 'server', 'service', 'antwort', 'verbindung', 'herzliche', 'einrichten', 'vergessen', 'wenden', 'ews', 'anhang', 'form', 'konto', 'nachricht', 'unterstuetzung', 'passwort', 'unicard', 'semester', 'system', 'aenderung', 'rueckmeldung', 'meldung', 'zugreifen', 'login', 'adressat', 'sender', 'kurs', 'made', 'mittwoch', 'note', 'our', 'korrespondenz', 'unbeschadet', 'boss', 'unterrichten', 'telefax', 'zugang', 'probleme', 'zugriff', 'mitarbeiterin', 'internet', 'daten', 'anmeldung', 'aendern', 'unterschrift', 'loeschen', 'anmelden', 'datei', 'message', 'laptop', 'benoetigt', 'link', 'montag', 'programm', 'ordner', 'personal', 'rechner', 'veranstaltung', 'august', 'lizenz', 'anschluss', 'mitarbeiter', 'erwuenscht', 'umzug', 'pc', 'uniaccount', 'amt', 'fax', 'it', 'institut', 'nutzer', 'bild', 'type', 'prof', 'verantwortlicher', 'bemerkung', 'antragsteller', 'element', 'hahn', 'eintrag', 'telefonbuch', 'ansprechpartner', 'universitaet', 'physik', 'abteilung', 'fakultaet', 'software', 'dezernat', 'einrichtung', 'telefon', 'lehrstuhl', 'buero')
term_labels = tuple(id2term[term_ind] for term_ind in term_inds)
# get topic-term weights to size dots
#[[ 0.02721858 -0.03898025 0.00047936 ..., 0.05862538 -0.07742336 0.04761928]
# [ 0.14977875 -0.24192522 -0.00620335 ..., -0.0497216 0.08269951 -0.05715901]
# [ 0.04977951 0.02296709 0.01214562 ..., 0.11444371 -0.15212482 0.21481788]
# ...,
# [
term_topic_weights = np.array([self.model.components_[topic_ind][term_inds]
for topic_ind in topic_inds]).T
viz.draw_termite_plot(
term_topic_weights, topic_labels, term_labels, save=path2save_results)
"""
logprint("")
end = time.time()
@ -275,9 +300,9 @@ def main(use_raw=False, algorithm="llda"):
# idea https://en.wikipedia.org/wiki/Noisy_text_analytics
# idea https://gate.ac.uk/family/
# todo find a sensible tf(-idf) measure
# todo merge topics
# question: how many tickets per topic?
# todo merge llda topics
# idea: train lda so that the term <-> topic assignment does not get too weak, while keeping as many topics as possible
# question: lda - how many tickets per topic?
"""
ngrams = 1
@ -300,25 +325,25 @@ def main(use_raw=False, algorithm="llda"):
if algorithm == "llda":
top_topic_words = 5
add_default_topic = False
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
top_topic_words = 5
add_default_topic = True
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
top_topic_words = 10
add_default_topic = False
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
top_topic_words = 10
add_default_topic = True
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
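# The four blocks above differ only in top_topic_words and add_default_topic;
# a sketch of how they could be collapsed into a loop (presumably one goal of
# the upcoming refactoring, not part of this commit):
for top_topic_words in (5, 10):
    for add_default_topic in (False, True):
        path2save_results = resultspath + "_{}_{}.txt".format(
            "top" + str(top_topic_words), "wdef" if add_default_topic else "")
        jgibbsLLDA(de_corpus, path2save_results=path2save_results,
                   top_topic_words=top_topic_words,
                   add_default_topic=add_default_topic)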
@ -339,15 +364,70 @@ def main(use_raw=False, algorithm="llda"):
labeldict = {k: v for v, k in enumerate(labelist)}
n_topics = 15
textacyTopicModeling(ngrams = 1,
min_df = 1,
max_df = 0.8,
max_df = 0.9,
topicModel = algorithm,
n_topics =n_topics,
n_topics =15,
corpus=de_corpus)
textacyTopicModeling(ngrams=1,
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=20,
corpus=de_corpus)
textacyTopicModeling(ngrams=1,
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=25,
corpus=de_corpus)
textacyTopicModeling(ngrams=1,
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=30,
corpus=de_corpus)
textacyTopicModeling(ngrams=(1, 2),
min_df=1,
max_df=0.9,
topicModel=algorithm,
n_topics=15,
corpus=de_corpus)
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.9,
topicModel = algorithm,
n_topics =20,
corpus=de_corpus)
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.9,
topicModel = algorithm,
n_topics =25,
corpus=de_corpus)
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.9,
topicModel = algorithm,
n_topics =30,
corpus=de_corpus)
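# Sketch: the repeated calls above vary only ngrams and n_topics and could be
# written as a loop (not part of this commit):
for ngrams in (1, (1, 2)):
    for n_topics in (15, 20, 25, 30):
        textacyTopicModeling(ngrams=ngrams,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=n_topics,
                             corpus=de_corpus)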
"""
textacyTopicModeling(ngrams = (1,2),
min_df = 1,