ready for another test run

This commit is contained in:
jannis.grundmann 2017-09-25 13:12:23 +02:00
parent 2ee9937d23
commit 6b8785d987
4 changed files with 809968 additions and 165 deletions

750692
deWordNet.xml Normal file

File diff suppressed because it is too large

58784
nomen.txt Normal file

File diff suppressed because it is too large

525
testo.py

@@ -1,6 +1,9 @@
# -*- coding: utf-8 -*-
from datetime import datetime
print(datetime.now())
import time
import enchant
@@ -31,25 +34,28 @@ from postal.parser import parse_address
csv.field_size_limit(sys.maxsize)
#ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/testo.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout.log &"
# Load the configuration file
# todo configuration file
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
config.read_file(f)
"""
logile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
# config logging
logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
logging.basicConfig(filename=logile, level=logging.INFO)
#logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
thesauruspath = config.get("filepath","thesauruspath")
thesauruspath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv"
#thesauruspath = config.get("filepath","thesauruspath")
THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
from langdetect import detect
@@ -77,21 +83,25 @@ LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))
VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
"""
de_stop_words = list(map(textacy.preprocess.normalize_whitespace,textacy.fileio.read_file_lines("de_stop_words.txt")))
de_stop_words = list(map(textacy.preprocess.normalize_whitespace,textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")))
#en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS))
LEMMAS = list(textacy.fileio.read_file_lines("lemmas.txt"))
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,textacy.fileio.read_file_lines("firstnames.txt")))
LEMMAS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))
NOUNS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt"))
NOUNS = NOUNS +list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
print(de_stop_words[10:30])
print(LEMMAS[10:30])
print(VORNAMEN[10:30])
print(NOUNS[10:30])
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
regex_topLvl = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
@@ -111,6 +121,9 @@ def printlog(string, level="INFO"):
logging.debug(string)
elif level == "WARNING":
logging.warning(string)
printlog(str(datetime.now()))
printlog("Load functions")
def compose(*functions):
@@ -142,14 +155,13 @@ def get_calling_function():
return func
raise AttributeError("func not found")
def printRandomDoc(textacyCorpus):
import random
print()
printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
print()
@@ -173,6 +185,31 @@ def csv_to_contentStream(path2csv: str, content_collumn_name: str):
else:
yield lst[content_collumn]
def csv_to_metaStream(path2csv: str, metalist: [str]):
"""
:param path2csv: string
:param metalist: list of strings
:return: dict-generator
"""
stream = textacy.fileio.read_csv(path2csv, delimiter=";") # ,encoding='utf8')
content_collumn = 0 # default value
metaindices = []
metadata_temp = {}
for i, lst in enumerate(stream):
if i == 0:
for j, col in enumerate(lst): # could surely be done more efficiently, but this only runs once
for key in metalist:
if key == col:
metaindices.append(j)
metadata_temp = dict(
zip(metalist, metaindices)) # e.g. {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}
else:
metadata = metadata_temp.copy()
for key, value in metadata.items():
metadata[key] = lst[value]
yield metadata
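# Hedged usage sketch (illustrative, not part of the original commit): assuming the
# ticket CSV has a ';'-separated header row that contains the requested column names,
# csv_to_metaStream yields one dict per data row, e.g.
#   for metadata in csv_to_metaStream(path2csv, ["Subject", "categoryName", "Solution"]):
#       printlog(metadata["categoryName"])  # value of that row's 'categoryName' column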
############# filter tokens
@@ -180,6 +217,9 @@ def csv_to_contentStream(path2csv: str, content_collumn_name: str):
def keepPOS(pos_list):
return lambda tok : tok.pos_ in pos_list
def keepNouns(noun_list=NOUNS):
return lambda tok : tok.lower_ in noun_list
def removePOS(pos_list):
return lambda tok : tok.pos_ not in pos_list
@@ -210,11 +250,11 @@ def remove_words_containing_Numbers():
"""
def remove_words_containing_topLVL():
return lambda tok: not bool(re.search(regex_topLvl, tok.lower_))
"""
def remove_words_containing_specialCharacters():
return lambda tok: not bool(re.search(regex_specialChars, tok.lower_))
"""
def remove_long_words():
return lambda tok: not len(tok.lower_) < 2
@@ -234,9 +274,12 @@ def remove_first_names():
def remove_addresses(string):
pass #todo
"""
def stringcleaning(stringstream, funclist):
for string in stringstream:
for f in funclist:
string = f(string)
yield string
@@ -267,11 +310,9 @@ def replacePhonenumbers(replace_with="PHONENUMBER"):
def replaceSharpS(replace_with="ss"):
return lambda string: re.sub(r'[ß]',replace_with,string.lower())
def fixUnicode():
return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
"""
"""
def lemmatizeWord(word,filepath=LEMMAS):
@@ -279,7 +320,7 @@ def lemmatizeWord(word,filepath=LEMMAS):
if word.lower() == line.split()[1].strip().lower():
return line.split()[0].strip().lower()
return word.lower() # if nothing was found
"""
def create_lemma_dicts(lemmalist=LEMMAS):
w_dict = {}
@@ -320,10 +361,10 @@ def lemmatizeWord(word,l_dict=lemma_dict,w_dict=word_dict):
except:
print(word)
return word
"""
def lemmatize():
return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])
"""
def lemmatize():
return lambda string: " ".join([lemmatizeWord(s.lower()) for s in string.split()])
@@ -341,78 +382,116 @@ def autocorrectWord(word,spellchecker=DE_SPELLCHECKER):
def autocorrect():
return lambda string: " ".join([autocorrectWord(s.lower()) for s in string.split()])
"""
def processTextstream(textstream, pre_parse=None, on_tokens=None, post_parse=None, parser=DE_PARSER):
def create_lemma_dicts(lemmalist=LEMMAS):
w_dict = {}
lem_dict = {}
for i, line in enumerate(lemmalist):
try:
lem_word_pair = line.split()
if len(lem_word_pair) != 2:
print(line)
lemma = lem_word_pair[0].strip().lower()
word = lem_word_pair[1].strip().lower()
except:
print(line)
if lemma not in lem_dict:
lem_dict[lemma] = i
if word not in w_dict:
w_dict[word] = lem_dict[lemma]
l_dict = {v: k for k, v in lem_dict.items()} # switch key/values
return l_dict, w_dict
lemma_dict, word_dict = create_lemma_dicts()
def lemmatizeWord(word, l_dict=lemma_dict, w_dict=word_dict, n=3):
# apply the lookup several times (n passes)
for i in range(n):
try:
word = l_dict[w_dict[word.lower()]] if word.lower() in w_dict else word.lower()
except:
print(word)
return word
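# Hedged example with hypothetical lemma entries (the real lemmas.txt is not shown here):
# for LEMMAS = ["gehen ging", "gehen gegangen"], create_lemma_dicts() builds
# word_dict = {"ging": 0, "gegangen": 0} and lemma_dict = {0: "gehen"}, so
# lemmatizeWord("ging") returns "gehen"; words missing from word_dict pass through lowercased.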
DE_SPELLCHECKER = enchant.Dict("de_DE")
EN_SPELLCHECKER = enchant.Dict("en_US")
def autocorrectWord(word, spellchecker=DE_SPELLCHECKER):
try:
return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
except:
return word
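# Hedged illustration (result depends on the installed de_DE dictionary): a word that
# the spellchecker accepts is returned unchanged, otherwise enchant's first suggestion
# is used; any enchant error falls back to the original word.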
############# stringcleaning
def stringcleaning(stringstream):
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
regex_topLvl = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
for string in stringstream:
string = string.lower()
# fixUnicode
string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
# remove_words_containing_topLVL
string = " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w)])
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# seperate_words_on_regex:
string = " ".join(re.compile(regex_specialChars).split(string))
# cut_after
word = "gruss"
string = string.rpartition(word)[0] if word in string else string
# lemmatize
string = " ".join([lemmatizeWord(word) for word in string.split()])
# autocorrect
#string = " ".join([autocorrectWord(word) for word in string.split()])
yield string
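# Illustrative walk-through on a made-up ticket text (assumption, not real data):
#   "Die Tür klemmt, siehe wiki.example.de. Mit freundlichem Gruß, Max"
# After lowercasing and the unicode fix, tokens matching the top-level-domain regex are
# dropped ("wiki.example.de."), umlauts and ß are rewritten ("tür" -> "tuer",
# "gruß" -> "gruss"), the string is split on special characters, everything from
# "gruss" onward is cut off, and the remaining words are lemmatized via lemmatizeWord.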
def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
"""
:param textstream: string-gen
:param funclist: [func]
:param parser: spacy-parser
:return: string-gen
"""
#pre_parse
if pre_parse is not None:
textstream = stringcleaning(textstream, pre_parse)
pipe = parser.pipe(textstream)
tokens=[]
for doc in pipe:
tokens = [tok for tok in doc]
# in_parse
if on_tokens is not None:
tokens = processTokens(tokens, on_tokens)
# post_parse
if post_parse is not None:
#todo maybe build one big function instead; this piecing-together is annoying
yield post_parse(parser(" ".join([tok.lower_ for tok in tokens])))
else:
yield " ".join([tok.lower_ for tok in tokens])
def processTokens(tokens, funclist):
# in:tokenlist, funclist
# out: tokenlist
for f in funclist:
tokens = list(filter(f, tokens))
return tokens
pre_parse=[
fixUnicode(),
replaceRockDots(),
remove_words_containing_topLVL(),
seperate_words_on_regex(),
lemmatize(),
cut_after(),
autocorrect()
]
custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","voraus",
"hallo","gerne","freundlich","fragen","fehler","bitten","ehre", "lieb",
"versuchen","unbestimmt","woche","tadelos", "klappen" ,"mittlerweile", "bekommen","erreichbar"
]
on_tokens=[
"""
filter_tokens=[
#removeENT(["PERSON"]),
#idea: remove addresses #so far via cut_after("gruss")
#idea: spelling correction
#idea: thesaurus
#idea: remove addresses #so far via cut_after("gruss") --> postal.parser
#idea: spelling correction --> PyEnchant
#idea: thesaurus --> WordNet, own list
remove_words_containing_Numbers(),
@@ -424,14 +503,122 @@ on_tokens=[
remove_short_words(),
remove_first_names(),
keepPOS(["NOUN"]),
]
"""
#pre_parse
textstream = stringcleaning(textstream)
pipe = parser.pipe(textstream)
tokens=[]
for doc in pipe:
tokens = [tok for tok in doc]
print(" ".join([tok.lower_ for tok in tokens]))
# in_parse
if token_filterlist is not None:
tokens = filterTokens(tokens, token_filterlist)
yield " ".join([tok.lower_ for tok in tokens])
def processDictstream(dictstream, funcdict, parser=DE_PARSER):
"""
:param dictstream: dict-gen
:param funcdict:
clean_in_meta = {
"Solution":funclist,
...
}
:param parser: spacy-parser
:return: dict-gen
"""
for dic in dictstream:
result = {}
for key, value in dic.items():
if key in funcdict:
doc = parser(value)
tokens = [tok for tok in doc]
funclist = funcdict[key]
tokens = filterTokens(tokens, funclist)
result[key] = " ".join([tok.lower_ for tok in tokens])
else:
result[key] = value
yield result
def filterTokens(tokens, funclist):
# in:tokenlist, funclist
# out: tokenlist
for f in funclist:
tokens = list(filter(f, tokens))
return tokens
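# Hedged illustration: filterTokens applies each predicate in order, so e.g.
# filterTokens(tokens, [removePOS(["PUNCT"]), removeWords(de_stop_words)]) first drops
# punctuation tokens and then any remaining stop words.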
custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","voraus",
"hallo","gerne","freundlich","fragen","fehler","bitten","ehre", "lieb",
"versuchen","unbestimmt","woche","tadelos", "klappen" ,"mittlerweile", "bekommen","erreichbar"
]
filter_tokens=[
#removeENT(["PERSON"]),
#idea: remove addresses #so far via cut_after("gruss") --> postal.parser
#idea: spelling correction --> PyEnchant
#idea: thesaurus --> WordNet
keepNouns(),
remove_words_containing_Numbers(),
removePOS(["PUNCT","SPACE","NUM"]),
removeWords(de_stop_words+custom_words),
remove_long_words(),
remove_short_words(),
remove_first_names()
#keepPOS(["NOUN"]),
]
post_parse=None
metaliste = [
"Subject",
"categoryName",
"Solution"
]
clean_in_meta = {
"Solution":[removePOS(["SPACE"])],
"Subject":[removePOS(["SPACE","PUNCT"])],
"categoryName": [removePOS(["SPACE", "PUNCT"])]
}
"""
@@ -476,8 +663,13 @@ pipe=[
path2csv = "M42-Export/Tickets_med.csv"
path2csv = "M42-Export/de_tickets.csv"
#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
de_corpus = textacy.Corpus(DE_PARSER)
#en_corpus = textacy.Corpus(EN_PARSER)
@@ -487,7 +679,8 @@ de_corpus = textacy.Corpus(DE_PARSER)
## add files to textacy-corpus,
printlog("add texts to textacy-corpus")
de_corpus.add_texts(
processTextstream(csv_to_contentStream(path2csv,"Description"), pre_parse=pre_parse, on_tokens=on_tokens, post_parse=post_parse)
processContentstream(csv_to_contentStream(path2csv,"Description"), token_filterlist=filter_tokens),
processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)
)
for i in range(10):
@@ -496,28 +689,6 @@ for i in range(10):
"""
language detection
@@ -540,6 +711,9 @@ correct words
meaningless examples: nr54065467 455a33c5 tvt?= ------problem--------
"""
end = time.time()
printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
@@ -550,36 +724,6 @@ print("\n\n")
start = time.time()
# build dictionary of ticket categories
labelist = []
for texdoc in de_corpus.get(lambda texdoc : texdoc.metadata["categoryName"] not in labelist):
labelist.append(texdoc.metadata["categoryName"])
LABELDICT = {k: v for v, k in enumerate(labelist)}
print(LABELDICT)
def label2ID(label,labeldict=LABELDICT):
return labeldict.get(label,len(labeldict))
def generate_labled_lines(textacyCorpus):
for doc in textacyCorpus:
# generate [topic1, topic2....] tok1 tok2 tok3 out of corpus
yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text
#################### todo: move all of this into config ####################
ngrams = 1
min_df = 0.1
@@ -594,13 +738,10 @@ weighting = ('tf' if topicModel == 'lda' else 'tfidf')
top_topic_words = 10
top_document_labels_per_topic = 5
n_topics = 20 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 because of a default topic
n_topics = 15 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 because of a default topic
end = time.time()
printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
@@ -640,14 +781,14 @@ print()
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
print('topic', topic_idx, ':', ' '.join(top_terms))
printlog('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
print()
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
print(topic_idx)
printlog(topic_idx)
for j in top_docs:
print(de_corpus[j].metadata['categoryName'])
printlog(de_corpus[j].metadata['categoryName'])
#####################################################################################################################
print()
@@ -656,4 +797,92 @@ print()
end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start)/60,topicModel))
"""
##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################
print("\n\n")
start = time.time()
n_topics = len(LABELDICT) #len(set(ticketcorpus[0].metadata.keys()))+1 #+1 because of a default topic
# build dictionary of ticket categories
labelist = []
for texdoc in de_corpus.get(lambda texdoc : texdoc.metadata["categoryName"] not in labelist):
labelist.append(texdoc.metadata["categoryName"])
LABELDICT = {k: v for v, k in enumerate(labelist)}
print(LABELDICT)
def label2ID(label,labeldict=LABELDICT):
return labeldict.get(label,len(labeldict))
def generate_labled_lines(textacyCorpus):
for doc in textacyCorpus:
# generate [topic1, topic2....] tok1 tok2 tok3 out of corpus
yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text
jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
#create file
textacy.fileio.write_file_lines(generate_labled_lines(de_corpus), filepath=LLDA_filepath)
# wait for file to exist
while not os.path.exists(LLDA_filepath):
time.sleep(1)
print("\n\n")
printlog("start LLDA:")
#run the JGibbsLLDA jar
FNULL = open(os.devnull, 'w') # suppress output
subprocess.call(["java",
"-cp", "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root),
"jgibblda.LDA",
"-est",
"-dir", "{0}models/tickets".format(jgibbsLLDA_root),
"-dfile","tickets.gz",
"-twords",str(top_topic_words),
"-ntopics", str(n_topics)], stdout = FNULL)
# NOTE: the output files are hidden; they can be found in models/
#twords
subprocess.call(["gzip",
"-dc",
"{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
#####################################################################################################################
print()
print()
end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start)/60))
"""

132
testra.py

@@ -1,34 +1,132 @@
# -*- coding: utf-8 -*-
import time
start = time.time()
import corenlp as corenlp
import os
import re
import time
import spacy
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser
#from polyglot.text import Text
import hunspell
from postal.parser import parse_address
import langdetect
import enchant
start = time.time()
from datetime import datetime
#todo split ticket.csv into de and en
import xml.etree.ElementTree as ET
print(datetime.now())
path2xml="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"
#print(parse_address(str(textacy.fileio.read_file("teststring.txt"))))
from langdetect import detect
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
nomen=[]
### extract from derewo
#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
for line in raw:
line_list=line.split()
if line_list[2] == "NN":
string = line_list[1].lower()
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
nomen.append(string.lower().strip())
textacy.fileio.write_file_lines(nomen,"nomen2.txt")
"""
### extract from deWordNet.xml
#https://github.com/hdaSprachtechnologie/odenet
for r in root:
for element in r:
if element.tag == "LexicalEntry":
for i,subentry in enumerate(element):
if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
string = (subentry.attrib["writtenForm"])
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# seperate_words_on_regex:
string = " ".join(re.compile(regex_specialChars).split(string))
string_list=string.split()
if len(string_list) == 1:
nomen.append(string.lower().strip())
textacy.fileio.write_file_lines(nomen,"nomen.txt")
"""
"""
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
content_collumn_name = "Description"
content_collumn = 9 # default value
@@ -64,7 +162,7 @@ textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")
textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")
"""