Unsupervised- und Supervised-Topic-Training eingebaut. Sollte man jetzt auf den Datensatz loslassen können.

2017-09-11 13:00:03 +02:00 · 2017-09-11 13:00:03 +02:00 · 991353b1bb
parent 4dbb07ae3f
commit 991353b1bb
1 changed files with 148 additions and 29 deletions
--- a/preprocessing.py
+++ b/preprocessing.py
@ -1,12 +1,17 @@
 # -*- coding: utf-8 -*-
 import csv
 import functools
+import os.path
 import re
-import spacy
-import sys
-import textacy
+import subprocess
+import time
 import xml.etree.ElementTree as ET
-import io
+
+import spacy
+import textacy
+from scipy import *
+from textacy import Vectorizer
+
 csv.field_size_limit(sys.maxsize)


@ -18,12 +23,16 @@ with open("config.ini") as f:
    config.read_file(f)


+
+path2xml = config.get("default","path2xml")
+
 PARSER = spacy.load(config.get("default","language"))
 corpus = textacy.Corpus(PARSER)

 thesauruspath = config.get("default","thesauruspath")
 THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))

+stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",")


 def compose(*functions):
@ -32,7 +41,6 @@ def compose(*functions):
    return functools.reduce(compose2, functions, lambda x: x)


-################ generate Content and Metadata ########################

 def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
    """
@ -73,8 +81,6 @@ def printRandomDoc(textacyCorpus):

    print()

-################ Preprocess#########################
-
 def processDictstream(dictstream, funcdict, parser=PARSER):
    for dic in dictstream:
        result = {}
@ -82,7 +88,7 @@ def processDictstream(dictstream, funcdict, parser=PARSER):
            if key in funcdict:
                result[key] = funcdict[key](parser(value))
            else:
-                result[key] = key
+                result[key] = value
        yield result

 def processTextstream(textstream, func, parser=PARSER):
@ -109,7 +115,6 @@ def removeAllENT(ent_list, parser=PARSER):



-
 doc2Set = lambda doc: str(set([tok.text for tok in doc]))
 doc2String = lambda doc : doc.text

@ -137,13 +142,9 @@ def replacePhonenumbers(replace_with="PHONE",parser=PARSER):



-
-
-
 def resolveAbbreviations(parser=PARSER):
    pass #todo

-
 def removeWords(words, keep=None,parser=PARSER):
    if hasattr(keep, '__iter__'):
        for k in keep:
@ -155,7 +156,6 @@ def removeWords(words, keep=None,parser=PARSER):



-
 def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
    #return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
    return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))
@ -196,23 +196,27 @@ def getHauptform(syn_block, word, default_return_first_Syn=False):
                return w
    return word  # zur Not, das ursrpüngliche Wort zurückgeben

+def label2ID(label):
+    return {
+    'Neuanschluss' : 0,
+    'LSF' : 1,
+    'Video' : 2,
+    }.get(label,3)

-
-stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",")
-
-path2xml = config.get("default","path2xml")
+def generate_labled_lines(textacyCorpus):
+    for doc in textacyCorpus:
+        # emit one line per document in the form "[<label id>] <document text>"
+        yield "[" + str(label2ID(doc.metadata["Kategorie"])) + "] " + doc.text



-content_generator = generateMainTextfromTicketXML(path2xml)
-metadata_generator = generateMetadatafromTicketXML(path2xml)
+
+####################'####################'####################'####################'####################'##############



 ents = config.get("preprocessing","ents").split(",")

-
-
 clean_in_content=compose(

    doc2String,
@ -232,19 +236,134 @@ clean_in_meta = {
 }


-contentStream = processTextstream(content_generator, func=clean_in_content)
-metaStream = processDictstream(metadata_generator, funcdict=clean_in_meta)


-corpus.add_texts(contentStream,metaStream)
-print(corpus[0].text)
+
+
+
+## add files to textacy-corpus,
+print("add texts to textacy-corpus...")
+corpus.add_texts(
+    processTextstream(generateMainTextfromTicketXML(path2xml), func=clean_in_content),
+    processDictstream(generateMetadatafromTicketXML(path2xml), funcdict=clean_in_meta)
+)
+
 printRandomDoc(corpus)



-
-
-
+####################'####################'      Variables (TODO: move all of them into config)
+
+ngrams = (1,2)
+
+min_df = 0
+max_df = 1.0
+no_below = 20
+no_above = 0.5
+
+topicModel = 'lda'
+# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
+weighting = ('tf' if topicModel == 'lda' else 'tfidf')
+
+top_topic_words = 5
+top_document_labels_per_topic = 2
+
+n_topics = 4
+
+
+
+
+
+
+####################'####################
+
+
+
+
+print("vectorize corpus...")
+vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
+
+terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
+doc_term_matrix = vectorizer.fit_transform(terms_list)
+id2term = vectorizer.__getattribute__("id_to_term")
+
+
+
+
+
+
+
+
+
+
+
+#####################     LSA, LDA, NMF         Topic Modeling via Textacy         ##############################################
+
+# Initialize and train a topic model
+print("Initialize and train a topic model")
+model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
+model.fit(doc_term_matrix)
+
+#Transform the corpus and interpret our model:
+print("Transform the corpus and interpret our model")
+doc_topic_matrix = model.transform(doc_term_matrix)
+print()
+
+
+for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
+    print('topic', topic_idx, ':', '   '.join(top_terms))
+
+
+print()
+for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
+     print(topic_idx)
+     for j in top_docs:
+        print(corpus[j].metadata['Kategorie'])
+
+#####################################################################################################################
+print()
+print()
+
+
+
+
+
+#####################   LLDA           Topic Modeling via JGibbsLabledLDA     ##############################################
+
+
+
+jgibbsLLDA_root = "java_LabledLDA/"
+filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
+
+
+#create file
+textacy.fileio.write_file_lines(generate_labled_lines(corpus),filepath=filepath)
+
+
+# wait for file to exist
+while not os.path.exists(filepath):
+    time.sleep(1)
+
+print("start LLDA..")
+# run the JGibbLDA Java program (class jgibblda.LDA) on the written file
+FNULL = open(os.devnull, 'w') # supress output
+subprocess.call(["java",
+                 "-cp", "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root),
+                 "jgibblda.LDA",
+                 "-est",
+                 "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
+                 "-dfile","tickets.gz",
+                 "-ntopics", str(n_topics)], stdout = FNULL)
+
+# NOTE: the output files are hidden (dot-prefixed, e.g. .twords.gz); they are located in models/
+
+# print the contents of the gzip-compressed .twords.gz output file to stdout
+subprocess.call(["gzip",
+                 "-dc",
+                 "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
+#####################################################################################################################
+print()
+print()