commit before refactoring

This commit is contained in:
parent 6ea03b2f65
commit ecc8c0c54a
@@ -41,8 +41,8 @@ filename=topicModelTickets.log
 [de_corpus]
-#input=M42-Export/Tickets_small.csv
-input=M42-Export/de_tickets.csv
+input=M42-Export/Tickets_small.csv
+#input=M42-Export/de_tickets.csv

 path=corpi/

@@ -64,7 +64,7 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI

 #ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC

-custom_words=geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
+custom_words=eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok

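A minimal sketch of how the [de_corpus] section above could be read, assuming the pipeline loads this INI-style file with Python's configparser; the helper name and the "config.ini" path are assumptions, only the section and key names come from the diff:

import configparser

def load_de_corpus_settings(path="config.ini"):
    # read the INI file and return the active input CSV and the corpus directory
    config = configparser.ConfigParser()
    config.read(path)
    section = config["de_corpus"]
    return section["input"], section["path"]

# with the new values above: input_csv == "M42-Export/Tickets_small.csv", corpus_dir == "corpi/"
input_csv, corpus_dir = load_de_corpus_settings()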
Binary files not shown.
main.py (31)
@@ -2,18 +2,21 @@
 import matplotlib
 matplotlib.use('Agg')
 import time
+import init

 import corporization
 import preprocessing
 import topicModeling
 import cleaning
-from miscellaneous import *

+from miscellaneous import *

 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
 start = time.time()

-import init
-init.main()
+#init.main()
 logprint("")

 corporization.main()

@@ -25,14 +28,32 @@ logprint("")
 preprocessing.main() # ~5h
 logprint("")

-#topicModeling.main(use_raw=False,algorithm="llda")
+"""
+topicModeling.main(use_raw=False,algorithm="lsa")
 logprint("")

-#topicModeling.main(use_raw=True)

 topicModeling.main(use_raw=False,algorithm="lda")
 logprint("")


+topicModeling.main(use_raw=False,algorithm="nmf")
+logprint("")


+topicModeling.main(use_raw=False,algorithm="llda")
+logprint("")
+"""


 logprint("")

 end = time.time()

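After this change only corporization.main() and preprocessing.main() still run as pipeline steps: init.main() is commented out and every topicModeling.main() call now sits inside a module-level triple-quoted string. A hedged sketch of an equivalent, easier-to-toggle layout; the RUN_TOPICS flag and the loop are illustrative, not part of this commit:

RUN_TOPICS = False  # assumed flag; the commit disables these calls via a string literal instead

if RUN_TOPICS:
    # same algorithms as in the disabled block above
    for algo in ("lsa", "lda", "nmf", "llda"):
        topicModeling.main(use_raw=False, algorithm=algo)
        logprint("")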
miscellaneous.py (104)
@@ -154,18 +154,6 @@ def printRandomDoc(textacyCorpus):
     print()


-def corpus2Text(corpus):
-    for doc in corpus:
-        yield doc.text
-
-def corpus2Meta(corpus):
-    for doc in corpus:
-        yield doc.metadata
-
-def saveplaincorpustext(corpus,path):
-    textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )
-
-
 def save_corpus(corpus, corpus_path, corpus_name):
     """

@@ -175,42 +163,22 @@ def save_corpus(corpus, corpus_path, corpus_name):
     :param corpus_name: str (should content the language like "_de_")
     """

-    """
-    # save stringstore
-    stringstore_path = corpus_path + corpus_name + '_strings.json'
-    with open(stringstore_path, "w") as file:
-        parser.vocab.strings.dump(file)
-
-    #todo save vocab?
-    """
-
     # save parser
     parser = corpus.spacy_lang
     parserpath = corpus_path + str(parser.lang) + '_parser'
     parser.save_to_directory(parserpath)

-    ##
+    # save plain content + meta

-    # save content
-    contentpath = corpus_path + corpus_name + "_content.bin"
-    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
-
-    #save plain content
     plainpath = corpus_path + corpus_name + "_content.json"
-    textacy.fileio.write_json_lines(({"index" : doc.corpus_index, "content" : doc.text} for doc in corpus), plainpath)
+    textacy.fileio.write_json_lines(gen_dicts(corpus), plainpath)

-    # save meta
-    metapath = corpus_path + corpus_name + "_meta.json"
-    #meta_gen = (doc.metadata.update({"index": doc.corpus_index}) for doc in corpus)
-    meta_gen = gen_meta(corpus)
-    textacy.fileio.write_json_lines(meta_gen, metapath)
-
-
-def gen_meta(corpus):
+
+def gen_dicts(corpus):
     for doc in corpus:
-        meta = doc.metadata
-        meta.update({"index": doc.corpus_index})
-        yield meta
+        dict = {"index" : doc.corpus_index, "content" : doc.text}
+        dict.update(doc.metadata)
+        yield dict

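save_corpus now writes a single JSON-lines file: gen_dicts emits one dict per document carrying the corpus index, the plain text and all metadata fields, where previously content and metadata went to separate *_content.json and *_meta.json files. A hedged example of one such line with invented field values; only the "index" and "content" keys and the metadata field names from the config are taken from this commit:

# hypothetical record produced by gen_dicts (values invented):
# {"index": 0, "content": "Drucker im Gebaeude A druckt nicht", "TicketNumber": "INC-4711", "Subject": "Druckerproblem", "categoryName": "drucker"}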
@@ -233,7 +201,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
     # load parser
     parser = spacy.load(lang)

-
     stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
     with open(stringstorepath) as file:
         parser.vocab.strings.load(file)

@@ -244,46 +211,35 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
     #load corpus
     corpus = textacy.Corpus(parser)

-    contentpath = corpus_path + corpus_name + "_content.bin"
     plainpath = corpus_path + corpus_name + "_content.json"
-    metapath = corpus_path + corpus_name + "_meta.json"
-    """
-    try:
-        spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
-        metadata_stream = textacy.fileio.read_json_lines(metapath)
-
-        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
-            corpus.add_doc(
-                textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
-    except:
-    """
-    # neu init!!
-    #corpus = textacy.Corpus(parser)

     plain_stream = textacy.fileio.read_json_lines(plainpath) # yields {int : str}
-    metadata_stream = textacy.fileio.read_json_lines(metapath)
-
-    for plain, metadata in zip(plain_stream, metadata_stream):
-        if plain["index"] == metadata["index"]:
-            corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata))
-        else:
-            raise IndexError
+    for plain in plain_stream:
+        meta = {}
+        for key,value in plain.items():
+            if key != "content" and key != "index":
+                meta[key] = value
+        corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))

     return corpus, corpus.spacy_lang


+"""
+def corpus2Text(corpus):
+    for doc in corpus:
+        yield doc.text
+
+def corpus2Meta(corpus):
+    for doc in corpus:
+        yield doc.metadata
+
+def saveplaincorpustext(corpus,path):
+    textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )
+
 def save_corpusV2(corpus, corpus_path, corpus_name):
-    """
-    saves a textacy-corpus including spacy-parser
-    :param corpus: textacy-Corpus
-    :param corpus_path: str
-    :param corpus_name: str (should content the language like "_de_")
-    """
-
     # save parser
     parser = corpus.spacy_lang

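In the rewritten load_corpus the metadata is recovered per JSON line by copying every key except "content" and "index"; the inner loop is equivalent to the dict comprehension below (a restatement for clarity, not part of the commit):

meta = {key: value for key, value in plain.items() if key not in ("content", "index")}
corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))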
@@ -302,13 +258,7 @@ def save_corpusV2(corpus, corpus_path, corpus_name):
         file.write(json.dumps(doc.metadata))

 def load_corpusV2(corpus_path, corpus_name, lang="de"):
-    """
-    Load textacy-Corpus including spacy-parser out from file
-    :param corpus_path: str
-    :param corpus_name: str (should content the language like "_de_")
-    :param lang: str (language code) ir spacy.Language
-    :return: texracy.Corpus, spacy.language
-    """
-
     # ckeck for language
     if "de_" in corpus_name:

@@ -359,5 +309,5 @@ def yield_fromdir(path,spacy_vocab=None,type=".pkl"):
     else:
         for filename in filelist:
             yield load_obj(path+filename)
+"""

topicModeling.py (104)
@@ -9,7 +9,7 @@ import sys
 import json
 import os.path
 import subprocess
-from textacy import Vectorizer
+from textacy import Vectorizer, viz

 from miscellaneous import *
 import textacy

@@ -163,7 +163,8 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
     jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/"

     LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
-    dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
+    #dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
+    dict_path = FILEPATH +"results/labeldict.txt".format(jgibbsLLDA_root)

     # printlog(str("LABELDICT: {0}".format(labeldict)))
     #logprint(str("LABELDICT-length: {0}".format(len(labeldict))))

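The label dictionary is now read from FILEPATH + "results/labeldict.txt" instead of the java_LabledLDA model directory. Since "results/labeldict.txt" contains no placeholder, the trailing .format(jgibbsLLDA_root) is a no-op and the new line is equivalent to the simpler form below (a restatement, not part of the commit):

dict_path = FILEPATH + "results/labeldict.txt"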
@@ -243,6 +244,30 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F

     textacy.fileio.write_file_lines(result, path2save_results)
     #####################################################################################################################

+    #todo llda termite plot
+    """
+    topic_inds=[] #<class 'list'>: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
+
+    # get topic and term labels
+    # <class 'tuple'>: ('topic 0', 'topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5', 'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10', 'topic 11', 'topic 12', 'topic 13', 'topic 14')
+    topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)
+
+    # <class 'tuple'>: ('hardware', 'raum', 'adresse', 'gebaeude', 'tu', 'uni', 'ticket', 'email', 'account', 'nummer', 'mail', 'outlook', 'karte', 'drucker', 'server', 'service', 'antwort', 'verbindung', 'herzliche', 'einrichten', 'vergessen', 'wenden', 'ews', 'anhang', 'form', 'konto', 'nachricht', 'unterstuetzung', 'passwort', 'unicard', 'semester', 'system', 'aenderung', 'rueckmeldung', 'meldung', 'zugreifen', 'login', 'adressat', 'sender', 'kurs', 'made', 'mittwoch', 'note', 'our', 'korrespondenz', 'unbeschadet', 'boss', 'unterrichten', 'telefax', 'zugang', 'probleme', 'zugriff', 'mitarbeiterin', 'internet', 'daten', 'anmeldung', 'aendern', 'unterschrift', 'loeschen', 'anmelden', 'datei', 'message', 'laptop', 'benoetigt', 'link', 'montag', 'programm', 'ordner', 'personal', 'rechner', 'veranstaltung', 'august', 'lizenz', 'anschluss', 'mitarbeiter', 'erwuenscht', 'umzug', 'pc', 'uniaccount', 'amt', 'fax', 'it', 'institut', 'nutzer', 'bild', 'type', 'prof', 'verantwortlicher', 'bemerkung', 'antragsteller', 'element', 'hahn', 'eintrag', 'telefonbuch', 'ansprechpartner', 'universitaet', 'physik', 'abteilung', 'fakultaet', 'software', 'dezernat', 'einrichtung', 'telefon', 'lehrstuhl', 'buero')
+    term_labels = tuple(id2term[term_ind] for term_ind in term_inds)
+
+    # get topic-term weights to size dots
+    #[[ 0.02721858 -0.03898025 0.00047936 ..., 0.05862538 -0.07742336 0.04761928]
+    # [ 0.14977875 -0.24192522 -0.00620335 ..., -0.0497216 0.08269951 -0.05715901]
+    # [ 0.04977951 0.02296709 0.01214562 ..., 0.11444371 -0.15212482 0.21481788]
+    # ...,
+    # [
+    term_topic_weights = np.array([self.model.components_[topic_ind][term_inds]
+                                   for topic_ind in topic_inds]).T
+
+    viz.draw_termite_plot(
+        term_topic_weights, topic_labels, term_labels, save=path2save_results)
+    """
     logprint("")

     end = time.time()

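The block added above is a commented-out draft of an LLDA termite plot (hence the new viz import at the top of the file); topic_inds, term_inds, id2term and self.model are not defined in this function, which is why it is left as a todo. A hedged sketch of how the missing pieces might be filled in, assuming a fitted sklearn-style model exposing components_ and an id2term mapping from the vectorizer; every name below is an assumption:

import numpy as np

n_terms_shown = 25  # assumed display size
topic_inds = list(range(model.components_.shape[0]))   # one index per topic
# one possible choice: show the terms with the largest total weight across topics
term_inds = np.argsort(model.components_.sum(axis=0))[::-1][:n_terms_shown]
topic_labels = tuple('topic {}'.format(i) for i in topic_inds)
term_labels = tuple(id2term[i] for i in term_inds)
term_topic_weights = np.array([model.components_[i][term_inds] for i in topic_inds]).T
viz.draw_termite_plot(term_topic_weights, topic_labels, term_labels, save=path2save_results)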
@@ -275,9 +300,9 @@ def main(use_raw=False, algorithm="llda"):
     # idee https://en.wikipedia.org/wiki/Noisy_text_analytics
     # idee https://gate.ac.uk/family/

-    # todo gescheites tf(-idf) maß finden
-    # todo topics zusammenfassen
-    # frage wieviele tickets pro topic?
+    # todo llda topics zusammenfassen
+    # idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics
+    # frage lda wieviele tickets pro topic?

     """
     ngrams = 1

@@ -300,25 +325,25 @@ def main(use_raw=False, algorithm="llda"):
     if algorithm == "llda":
         top_topic_words = 5
         add_default_topic = False
-        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
         jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                    add_default_topic=add_default_topic)

         top_topic_words = 5
         add_default_topic = True
-        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
         jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                    add_default_topic=add_default_topic)

         top_topic_words = 10
         add_default_topic = False
-        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
         jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                    add_default_topic=add_default_topic)

         top_topic_words = 10
         add_default_topic = True
-        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
         jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                    add_default_topic=add_default_topic)

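The result-file suffix changes from "wdefault" to "wdef". With the settings above this yields names like the examples below; resultspath itself is set elsewhere and not shown in this diff:

# add_default_topic = True  ->  <resultspath>_top5_wdef.txt
# add_default_topic = False ->  <resultspath>_top5_.txt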
@@ -339,15 +364,70 @@ def main(use_raw=False, algorithm="llda"):

         labeldict = {k: v for v, k in enumerate(labelist)}

-        n_topics = 15
-
         textacyTopicModeling(ngrams = 1,
                              min_df = 1,
-                             max_df = 0.8,
+                             max_df = 0.9,
                              topicModel = algorithm,
-                             n_topics =n_topics,
+                             n_topics =15,
                              corpus=de_corpus)

+        textacyTopicModeling(ngrams=1,
+                             min_df=1,
+                             max_df=0.9,
+                             topicModel=algorithm,
+                             n_topics=20,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams=1,
+                             min_df=1,
+                             max_df=0.9,
+                             topicModel=algorithm,
+                             n_topics=25,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams=1,
+                             min_df=1,
+                             max_df=0.9,
+                             topicModel=algorithm,
+                             n_topics=30,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams=(1, 2),
+                             min_df=1,
+                             max_df=0.9,
+                             topicModel=algorithm,
+                             n_topics=15,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams = (1,2),
+                             min_df = 1,
+                             max_df = 0.9,
+                             topicModel = algorithm,
+                             n_topics =20,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams = (1,2),
+                             min_df = 1,
+                             max_df = 0.9,
+                             topicModel = algorithm,
+                             n_topics =25,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams = (1,2),
+                             min_df = 1,
+                             max_df = 0.9,
+                             topicModel = algorithm,
+                             n_topics =30,
+                             corpus=de_corpus)
+
         """
         textacyTopicModeling(ngrams = (1,2),
                              min_df = 1,

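The single n_topics = 15 run is replaced by a sweep over 15, 20, 25 and 30 topics, for unigrams and for (1,2)-grams, all with max_df = 0.9. A hedged restatement of the same sweep as two loops; not part of the commit, just a compact equivalent of the calls above:

for ngrams in (1, (1, 2)):
    for n_topics in (15, 20, 25, 30):
        textacyTopicModeling(ngrams=ngrams,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=n_topics,
                             corpus=de_corpus)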