Added unsupervised and supervised topic training. Should now be ready to run on the dataset.
parent 4dbb07ae3f
commit 991353b1bb

preprocessing.py (177 lines changed)
@@ -1,12 +1,17 @@
 # -*- coding: utf-8 -*-
 import csv
 import functools
+import os.path
 import re
-import spacy
-import sys
-import textacy
+import subprocess
+import time
 import xml.etree.ElementTree as ET
-import io
+
+import spacy
+import textacy
+
+from scipy import *
+from textacy import Vectorizer


 csv.field_size_limit(sys.maxsize)
@@ -18,12 +23,16 @@ with open("config.ini") as f:
     config.read_file(f)


+path2xml = config.get("default","path2xml")
+
 PARSER = spacy.load(config.get("default","language"))
 corpus = textacy.Corpus(PARSER)

 thesauruspath = config.get("default","thesauruspath")
 THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))

+stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",")
+

 def compose(*functions):
@@ -32,7 +41,6 @@ def compose(*functions):
     return functools.reduce(compose2, functions, lambda x: x)


-################ generate Content and Metadata ########################

 def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
     """
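The compose helper above is what later builds clean_in_content: functools.reduce chains the per-doc cleaning functions into a single callable. A minimal sketch of the semantics, assuming the conventional compose2 = lambda f, g: lambda x: f(g(x)) (compose2 itself is outside this hunk):

    import functools

    def compose(*functions):
        # right-to-left composition: the last-listed function runs first
        compose2 = lambda f, g: lambda x: f(g(x))
        return functools.reduce(compose2, functions, lambda x: x)

    double = lambda x: 2 * x
    increment = lambda x: x + 1
    assert compose(double, increment)(3) == 8  # double(increment(3))
    assert compose(increment, double)(3) == 7  # increment(double(3))

Under that reading, listing doc2String first in clean_in_content means it runs last, so the pipeline ends with a plain string, which is what corpus.add_texts expects.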
@@ -73,8 +81,6 @@ def printRandomDoc(textacyCorpus):

     print()

-################ Preprocess#########################
-
 def processDictstream(dictstream, funcdict, parser=PARSER):
     for dic in dictstream:
         result = {}
@@ -82,7 +88,7 @@ def processDictstream(dictstream, funcdict, parser=PARSER):
             if key in funcdict:
                 result[key] = funcdict[key](parser(value))
             else:
-                result[key] = key
+                result[key] = value
         yield result

 def processTextstream(textstream, func, parser=PARSER):
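The one-line change in this hunk is a real fix, not cosmetics: metadata fields without an entry in funcdict used to be overwritten with their own key name, so every ticket's Kategorie came out as the literal string "Kategorie" and the labeled-LDA labels further down would all have collapsed to the fallback ID. A minimal sketch with hypothetical field names:

    # hypothetical ticket metadata and cleaning rules
    meta = {"Kategorie": "Neuanschluss", "Beschreibung": "VPN geht nicht"}
    funcdict = {"Beschreibung": lambda doc: doc.text.lower()}

    cleaned = next(processDictstream([meta], funcdict))
    assert cleaned["Kategorie"] == "Neuanschluss"  # before the fix: "Kategorie"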
@@ -109,7 +115,6 @@ def removeAllENT(ent_list, parser=PARSER):



-

 doc2Set = lambda doc: str(set([tok.text for tok in doc]))
 doc2String = lambda doc : doc.text

@@ -137,13 +142,9 @@ def replacePhonenumbers(replace_with="PHONE",parser=PARSER):


-
-
-
-
 def resolveAbbreviations(parser=PARSER):
     pass #todo


 def removeWords(words, keep=None,parser=PARSER):
     if hasattr(keep, '__iter__'):
         for k in keep:
@@ -155,7 +156,6 @@ def removeWords(words, keep=None,parser=PARSER):


-
 def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
     #return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
     return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))

@@ -196,23 +196,27 @@ def getHauptform(syn_block, word, default_return_first_Syn=False):
                 return w
     return word  # if all else fails, return the original word


-stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",")
-path2xml = config.get("default","path2xml")
-
-content_generator = generateMainTextfromTicketXML(path2xml)
-metadata_generator = generateMetadatafromTicketXML(path2xml)
+def label2ID(label):
+    return {
+        'Neuanschluss' : 0,
+        'LSF' : 1,
+        'Video' : 2,
+    }.get(label,3)
+
+def generate_labled_lines(textacyCorpus):
+    for doc in textacyCorpus:
+        # generate "[topic1, topic2....] tok1 tok2 tok3" out of corpus
+        yield "[" + str(label2ID(doc.metadata["Kategorie"])) + "] " + doc.text
+
+####################'####################'####################'####################'####################'##############


 ents = config.get("preprocessing","ents").split(",")


 clean_in_content=compose(

     doc2String,
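generate_labled_lines emits one line per document in the input format JGibbsLabeledLDA expects: the numeric label in square brackets, followed by the tokens. Illustrative only (ticket texts invented):

    # doc.metadata["Kategorie"] == "LSF"   -> "[1] anmeldung schlaegt fehl ..."
    # doc.metadata["Kategorie"] == "Mail"  -> "[3] ..."  (unmapped labels fall back to 3)
    for line in generate_labled_lines(corpus):
        print(line)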
@@ -232,19 +236,134 @@ clean_in_meta = {
 }


-contentStream = processTextstream(content_generator, func=clean_in_content)
-metaStream = processDictstream(metadata_generator, funcdict=clean_in_meta)
-
-corpus.add_texts(contentStream,metaStream)
-print(corpus[0].text)
+## add files to textacy-corpus,
+print("add texts to textacy-corpus...")
+corpus.add_texts(
+    processTextstream(generateMainTextfromTicketXML(path2xml), func=clean_in_content),
+    processDictstream(generateMetadatafromTicketXML(path2xml), funcdict=clean_in_meta)
+)

 printRandomDoc(corpus)

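The rewritten add_texts call streams both generators in lockstep, so texts and their metadata stay paired without materializing the whole ticket dump. For a small smoke test the same call can be made with lists; a sketch, assuming both generators walk the XML in the same order:

    texts = list(processTextstream(generateMainTextfromTicketXML(path2xml), func=clean_in_content))
    metas = list(processDictstream(generateMetadatafromTicketXML(path2xml), funcdict=clean_in_meta))
    assert len(texts) == len(metas)  # the two streams must stay aligned
    corpus.add_texts(texts, metas)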
+####################'####################' variables, TODO: move everything into config
+
+ngrams = (1,2)
+
+min_df = 0
+max_df = 1.0
+no_below = 20
+no_above = 0.5
+
+topicModel = 'lda'
+# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
+weighting = ('tf' if topicModel == 'lda' else 'tfidf')
+
+top_topic_words = 5
+top_document_labels_per_topic = 2
+
+n_topics = 4

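For the TODO above, the values could move into the existing config.ini and be read back through the config object that is already loaded at the top of the script; a sketch with a hypothetical [topicmodeling] section:

    # config.ini (hypothetical):
    # [topicmodeling]
    # ngrams = 1,2
    # min_df = 0
    # max_df = 1.0
    # n_topics = 4
    # model = lda
    ngrams = tuple(int(n) for n in config.get("topicmodeling", "ngrams").split(","))
    min_df = config.getint("topicmodeling", "min_df")
    max_df = config.getfloat("topicmodeling", "max_df")
    n_topics = config.getint("topicmodeling", "n_topics")
    topicModel = config.get("topicmodeling", "model")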
+####################'####################
+
+print("vectorize corpus...")
+vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
+
+terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
+doc_term_matrix = vectorizer.fit_transform(terms_list)
+id2term = vectorizer.__getattribute__("id_to_term")

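vectorizer.__getattribute__("id_to_term") is just reflective spelling for ordinary attribute access; the direct form is equivalent and is what the topic-term loop below uses:

    id2term = vectorizer.id_to_term  # same object as __getattribute__("id_to_term")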
+##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
+
+# Initialize and train a topic model
+print("Initialize and train a topic model")
+model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
+model.fit(doc_term_matrix)
+
+# Transform the corpus and interpret our model:
+print("Transform the corpus and interpret our model")
+doc_topic_matrix = model.transform(doc_term_matrix)
+print()
+
+for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
+    print('topic', topic_idx, ':', ' '.join(top_terms))
+
+print()
+for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
+    print(topic_idx)
+    for j in top_docs:
+        print(corpus[j].metadata['Kategorie'])
+
+#####################################################################################################################
+print()
+print()

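For orientation, the two loops above print something of the following shape; the terms and labels here are invented, the real output depends on the ticket data:

    # topic 0 : vpn tunnel verbindung client zugang
    # topic 1 : drucker toner papier treiber stau
    # ...
    # 0               <- topic index from top_topic_docs
    # Neuanschluss    <- Kategorie of its top-ranked documents
    # LSF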
+##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################
+
+jgibbsLLDA_root = "java_LabledLDA/"
+filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
+
+# create file
+textacy.fileio.write_file_lines(generate_labled_lines(corpus), filepath=filepath)
+
+# wait for file to exist
+while not os.path.exists(filepath):
+    time.sleep(1)
+
+print("start LLDA..")
+# run JGibbsLDA
+FNULL = open(os.devnull, 'w')  # suppress output
+subprocess.call(["java",
+                 "-cp", "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root),
+                 "jgibblda.LDA",
+                 "-est",
+                 "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
+                 "-dfile", "tickets.gz",
+                 "-ntopics", str(n_topics)], stdout = FNULL)
+
+# NOTE: the output files are hidden; they can be found in models/
+
+# print twords
+subprocess.call(["gzip",
+                 "-dc",
+                 "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
+#####################################################################################################################
+print()
+print()

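If the decompressed top-word lists are needed inside Python rather than just dumped to stdout, the second subprocess.call could be swapped for check_output; a sketch (same gzip invocation, output captured instead of printed):

    twords = subprocess.check_output(
        ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    ).decode("utf-8")
    for line in twords.splitlines()[:10]:  # first few lines only
        print(line)

Note also that the ':'-separated classpath in the java call assumes Linux/macOS; on Windows the separator would be ';'.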