diff --git a/preprocessing.py b/preprocessing.py
index 70cb127..61c1709 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -1,12 +1,17 @@
 # -*- coding: utf-8 -*-
 import csv
 import functools
+import os.path
 import re
-import spacy
-import sys
-import textacy
+import subprocess
+import sys
+import time
 import xml.etree.ElementTree as ET
-import io
+
+import spacy
+import textacy
+from textacy import Vectorizer
+

 csv.field_size_limit(sys.maxsize)
@@ -18,12 +23,16 @@
 with open("config.ini") as f:
     config.read_file(f)

+
+path2xml = config.get("default","path2xml")
+
 PARSER = spacy.load(config.get("default","language"))
 corpus = textacy.Corpus(PARSER)

 thesauruspath = config.get("default","thesauruspath")
 THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))

+stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",")

 def compose(*functions):
@@ -32,7 +41,6 @@ def compose(*functions):
     return functools.reduce(compose2, functions, lambda x: x)


-################ generate Content and Metadata ########################

 def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
     """
@@ -73,8 +81,6 @@ def printRandomDoc(textacyCorpus):
     print()


-################ Preprocess#########################
-
 def processDictstream(dictstream, funcdict, parser=PARSER):
     for dic in dictstream:
         result = {}
@@ -82,7 +88,7 @@ def processDictstream(dictstream, funcdict, parser=PARSER):
             if key in funcdict:
                 result[key] = funcdict[key](parser(value))
             else:
-                result[key] = key
+                result[key] = value
         yield result

 def processTextstream(textstream, func, parser=PARSER):
@@ -109,7 +115,6 @@ def removeAllENT(ent_list, parser=PARSER):



-
 doc2Set = lambda doc: str(set([tok.text for tok in doc]))
 doc2String = lambda doc : doc.text

@@ -137,13 +142,9 @@ def replacePhonenumbers(replace_with="PHONE",parser=PARSER):



-
-
-
 def resolveAbbreviations(parser=PARSER):
     pass #todo

-
 def removeWords(words, keep=None,parser=PARSER):
     if hasattr(keep, '__iter__'):
         for k in keep:
@@ -155,7 +156,6 @@ def removeWords(words, keep=None,parser=PARSER):



-
 def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
     #return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
     return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))
@@ -196,23 +196,27 @@ def getHauptform(syn_block, word, default_return_first_Syn=False):
                 return w
     return word  # as a last resort, return the original word

+def label2ID(label):
+    return {
+        'Neuanschluss' : 0,
+        'LSF' : 1,
+        'Video' : 2,
+    }.get(label,3)

-
-stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",")
-
-path2xml = config.get("default","path2xml")
+def generate_labled_lines(textacyCorpus):
+    for doc in textacyCorpus:
+        # one line per document in JGibbsLabeledLDA format: "[label1, label2, ...] tok1 tok2 tok3" (here: a single category ID)
+        yield "[" + str(label2ID(doc.metadata["Kategorie"])) + "] " + doc.text

-content_generator = generateMainTextfromTicketXML(path2xml)
-metadata_generator = generateMetadatafromTicketXML(path2xml)
+
+#######################################################################################################################

 ents = config.get("preprocessing","ents").split(",")

-
 clean_in_content=compose(
     doc2String,
@@ -232,19 +236,134 @@ clean_in_meta = {
 }

-contentStream = processTextstream(content_generator, func=clean_in_content)
-metaStream = processDictstream(metadata_generator, funcdict=clean_in_meta)
-corpus.add_texts(contentStream,metaStream)
-print(corpus[0].text)
+
+
+# add texts to the textacy corpus
+print("add texts to textacy-corpus...")
+corpus.add_texts(
+    processTextstream(generateMainTextfromTicketXML(path2xml), func=clean_in_content),
+    processDictstream(generateMetadatafromTicketXML(path2xml), funcdict=clean_in_meta)
+)
+
 printRandomDoc(corpus)

-
-
-
+#################### parameters (TODO: move all of these into config.ini) ####################
+
+ngrams = (1,2)
+
+min_df = 0
+max_df = 1.0
+no_below = 20
+no_above = 0.5
+
+topicModel = 'lda'
+# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
+weighting = ('tf' if topicModel == 'lda' else 'tfidf')
+
+top_topic_words = 5
+top_document_labels_per_topic = 2
+
+n_topics = 4
+
+
+##############################################################################################
+
+
+print("vectorize corpus...")
+vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
+
+terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
+doc_term_matrix = vectorizer.fit_transform(terms_list)
+id2term = vectorizer.id_to_term
+
+
+##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
+
+# initialize and train a topic model
+print("Initialize and train a topic model")
+model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
+model.fit(doc_term_matrix)
+
+# transform the corpus and interpret the model
+print("Transform the corpus and interpret our model")
+doc_topic_matrix = model.transform(doc_term_matrix)
+print()
+
+for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
+    print('topic', topic_idx, ':', ' '.join(top_terms))
+
+print()
+for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
+    print(topic_idx)
+    for j in top_docs:
+        print(corpus[j].metadata['Kategorie'])
+
+#####################################################################################################################
+print()
+print()
+
+
+##################### Labeled LDA (LLDA) Topic Modeling via JGibbsLabeledLDA ########################################
+
+jgibbsLLDA_root = "java_LabledLDA/"
+filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
+
+# write the labeled-line training file for JGibbsLabeledLDA
+textacy.fileio.write_file_lines(generate_labled_lines(corpus), filepath=filepath)
+
+# wait for the file to exist
+while not os.path.exists(filepath):
+    time.sleep(1)
+
+print("start LLDA..")
+# run the JGibbsLabeledLDA estimator
+FNULL = open(os.devnull, 'w')  # suppress output
+subprocess.call(["java",
+                 "-cp", "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root),
+                 "jgibblda.LDA",
+                 "-est",
+                 "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
+                 "-dfile", "tickets.gz",
+                 "-ntopics",
+                 str(n_topics)], stdout=FNULL)
+
+# NOTE: JGibbsLabeledLDA writes its output as hidden (dot-prefixed) files under models/tickets/
+
+# print the top words per topic (twords)
+subprocess.call(["gzip",
+                 "-dc",
+                 "{0}models/tickets/.twords.gz".format(jgibbsLLDA_root)])
+#####################################################################################################################
+print()
+print()