Added unsupervised and supervised topic training. This should now be ready to run on the dataset.

This commit is contained in:
jannis.grundmann 2017-09-11 13:00:03 +02:00
parent 4dbb07ae3f
commit 991353b1bb
1 changed file with 148 additions and 29 deletions


@@ -1,12 +1,17 @@
 # -*- coding: utf-8 -*-
 import csv
 import functools
+import os.path
 import re
-import spacy
 import sys
-import textacy
+import subprocess
+import time
 import xml.etree.ElementTree as ET
+import io
+import spacy
+import textacy
+from scipy import *
+from textacy import Vectorizer
 
 csv.field_size_limit(sys.maxsize)
@@ -18,12 +23,16 @@ with open("config.ini") as f:
     config.read_file(f)
 
+path2xml = config.get("default","path2xml")
+
 PARSER = spacy.load(config.get("default","language"))
 corpus = textacy.Corpus(PARSER)
 
 thesauruspath = config.get("default","thesauruspath")
 THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
+stop_words = list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",")
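+# stop word list = spaCy's stop words for the configured language plus the custom words from config.ini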
 
 def compose(*functions):
@@ -32,7 +41,6 @@ def compose(*functions):
     return functools.reduce(compose2, functions, lambda x: x)
 
-################ generate Content and Metadata ########################
 
 def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
     """
@@ -73,8 +81,6 @@ def printRandomDoc(textacyCorpus):
     print()
 
-################ Preprocess #########################
 
 def processDictstream(dictstream, funcdict, parser=PARSER):
     for dic in dictstream:
         result = {}
@@ -82,7 +88,7 @@ def processDictstream(dictstream, funcdict, parser=PARSER):
             if key in funcdict:
                 result[key] = funcdict[key](parser(value))
             else:
-                result[key] = key
+                result[key] = value
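+                # metadata fields without a cleaning function are passed through unchanged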
         yield result
 
 def processTextstream(textstream, func, parser=PARSER):
@@ -109,7 +115,6 @@ def removeAllENT(ent_list, parser=PARSER):
 
 doc2Set = lambda doc: str(set([tok.text for tok in doc]))
 doc2String = lambda doc : doc.text
@@ -137,13 +142,9 @@ def replacePhonenumbers(replace_with="PHONE",parser=PARSER):
 
 def resolveAbbreviations(parser=PARSER):
     pass  # todo
 
 def removeWords(words, keep=None, parser=PARSER):
     if hasattr(keep, '__iter__'):
         for k in keep:
@@ -155,7 +156,6 @@ def removeWords(words, keep=None,parser=PARSER):
 
 def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
     #return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
     return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))
@@ -196,23 +196,27 @@ def getHauptform(syn_block, word, default_return_first_Syn=False):
             return w
     return word  # if all else fails, return the original word
 
-stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",")
-
-path2xml = config.get("default","path2xml")
-content_generator = generateMainTextfromTicketXML(path2xml)
-metadata_generator = generateMetadatafromTicketXML(path2xml)
+def label2ID(label):
+    return {
+        'Neuanschluss' : 0,
+        'LSF' : 1,
+        'Video' : 2,
+    }.get(label, 3)
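+# any category other than the three above falls back to the catch-all ID 3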
+
+def generate_labled_lines(textacyCorpus):
+    # generate "[topic1, topic2, ...] tok1 tok2 tok3" lines out of the corpus
+    for doc in textacyCorpus:
+        yield "[" + str(label2ID(doc.metadata["Kategorie"])) + "] " + doc.text
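+        # e.g. a doc with Kategorie "Video" yields: "[2] <cleaned ticket text>"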
+
+####################'####################'####################'####################'####################'##############
 
 ents = config.get("preprocessing","ents").split(",")
 
 clean_in_content=compose(
     doc2String,
@@ -232,19 +236,134 @@ clean_in_meta = {
 }
 
-contentStream = processTextstream(content_generator, func=clean_in_content)
-metaStream = processDictstream(metadata_generator, funcdict=clean_in_meta)
-
-corpus.add_texts(contentStream,metaStream)
-
-print(corpus[0].text)
+## add files to textacy-corpus
+print("add texts to textacy-corpus...")
+corpus.add_texts(
+    processTextstream(generateMainTextfromTicketXML(path2xml), func=clean_in_content),
+    processDictstream(generateMetadatafromTicketXML(path2xml), funcdict=clean_in_meta)
+)
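+# add_texts() consumes both generators in step, pairing each cleaned text with its metadata dict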
 
 printRandomDoc(corpus)
+
+####################'#################### variables (todo: move all of these into config)
+
+ngrams = (1,2)
+
+min_df = 0
+max_df = 1.0
+no_below = 20
+no_above = 0.5
+
+topicModel = 'lda'
+
+# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
+weighting = ('tf' if topicModel == 'lda' else 'tfidf')
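+# LDA models raw term counts, so plain 'tf' is used there; the matrix-factorization models (NMF/LSA) get tf-idf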
+
+top_topic_words = 5
+top_document_labels_per_topic = 2
+
+n_topics = 4
+
+####################'####################
+
+print("vectorize corpus...")
+vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
+
+terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
+doc_term_matrix = vectorizer.fit_transform(terms_list)
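+# doc_term_matrix: one row per document, one column per (1,2)-gram term, weighted as configured above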
+id2term = vectorizer.id_to_term
+
+##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
+
+# Initialize and train a topic model
+print("Initialize and train a topic model")
+model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
+model.fit(doc_term_matrix)
+
+# Transform the corpus and interpret our model:
+print("Transform the corpus and interpret our model")
+doc_topic_matrix = model.transform(doc_term_matrix)
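+# doc_topic_matrix: one row per document, one column per topic (per-document topic weights)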
+
+print()
+for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
+    print('topic', topic_idx, ':', ' '.join(top_terms))
+print()
+
+for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
+    print(topic_idx)
+    for j in top_docs:
+        print(corpus[j].metadata['Kategorie'])
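+# the Kategorie labels of each topic's top documents give a quick sanity check of topic/label alignment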
+
+#####################################################################################################################
+print()
+print()
+
+##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################
+
+jgibbsLLDA_root = "java_LabledLDA/"
+filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
+
+# create the input file
+textacy.fileio.write_file_lines(generate_labled_lines(corpus), filepath=filepath)
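+# input for the Labeled-LDA tool: one "[labelID] token token ..." line per document (as generated above)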
+
+# wait for the file to exist
+while not os.path.exists(filepath):
+    time.sleep(1)
print("start LLDA..")
#run JGibsslda file
FNULL = open(os.devnull, 'w') # supress output
subprocess.call(["java",
"-cp", "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root),
"jgibblda.LDA",
"-est",
"-dir", "{0}models/tickets".format(jgibbsLLDA_root),
"-dfile","tickets.gz",
"-ntopics", str(n_topics)], stdout = FNULL)
+
+# NOTE: the output files are hidden; they can be found in models/
+
+# print twords
+subprocess.call(["gzip",
+                 "-dc",
+                 "{0}models/tickets/.twords.gz".format(jgibbsLLDA_root)])
+
+#####################################################################################################################
+print()
+print()