ready for another test run

jannis.grundmann 2017-09-21 12:05:32 +02:00
parent 89ea2a863d
commit 2ee9937d23
6 changed files with 384119 additions and 39 deletions

3020    de_stop_words.txt (Normal file; diff suppressed because it is too large)
20561   firstnames.txt (Normal file; diff suppressed because it is too large)
1855    german_stopwords_full.txt (Normal file; diff suppressed because it is too large)
358474  lemmas.txt (Normal file; diff suppressed because it is too large)

179     testo.py

@@ -2,6 +2,9 @@
import time
+import enchant
start = time.time()
import logging
@@ -49,11 +52,14 @@ logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INF
thesauruspath = config.get("filepath","thesauruspath")
THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))

+from langdetect import detect

DE_PARSER = spacy.load("de") #todo language detection; idea: separate corpora for different languages
+#EN_PARSER = spacy.load("en")

-SPELLCHECKER = hunspell.HunSpell('/usr/share/hunspell/de_DE.dic',
-                                 '/usr/share/hunspell/de_DE.aff')

def replaceRockDots():
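The todo next to DE_PARSER hints at keeping one corpus per language. A minimal sketch of how the newly imported langdetect could drive that routing, assuming the spaCy "de" and "en" models are installed; PARSERS and route_by_language are illustrative names, not part of this commit:

    from langdetect import detect
    import spacy

    # illustrative only: one parser per supported language, German as fallback
    PARSERS = {"de": spacy.load("de"), "en": spacy.load("en")}

    def route_by_language(texts, default="de"):
        # yield (language, text) pairs so each language can feed its own corpus/parser
        for text in texts:
            try:
                lang = detect(text)
            except Exception:
                lang = default
            yield (lang if lang in PARSERS else default), text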
@@ -72,10 +78,15 @@ VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
"""

de_stop_words = list(map(textacy.preprocess.normalize_whitespace,textacy.fileio.read_file_lines("de_stop_words.txt")))
+#en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS))

LEMMAS = list(textacy.fileio.read_file_lines("lemmas.txt"))
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,textacy.fileio.read_file_lines("firstnames.txt")))

print(de_stop_words[10:30])
print(LEMMAS[10:30])
print(VORNAMEN[10:30])
@@ -164,7 +175,7 @@ def csv_to_contentStream(path2csv: str, content_collumn_name: str):
-############# return bool
+############# filter tokens

def keepPOS(pos_list):
    return lambda tok : tok.pos_ in pos_list
@@ -195,10 +206,12 @@ def removeENT(ent_list):
def remove_words_containing_Numbers():
    return lambda tok: not bool(re.search('\d', tok.lower_))

"""
def remove_words_containing_topLVL():
    return lambda tok: not bool(re.search(regex_topLvl, tok.lower_))
"""

def remove_words_containing_specialCharacters():
    return lambda tok: not bool(re.search(regex_specialChars, tok.lower_))
@@ -214,9 +227,6 @@ def remove_first_names():
-#if a word is not in the vocab, first check whether a partial word is known; if so, check whether there is junk before or after it and split if necessary

############# strings
@@ -310,14 +320,6 @@ def lemmatizeWord(word,l_dict=lemma_dict,w_dict=word_dict):
    except:
        print(word)
        return word

-def autocorrectWord(word,spellchecker=SPELLCHECKER):
-    try:
-        return spellchecker.suggest(word)[0]
-    except:
-        return word

"""
def lemmatize():
    return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])
@@ -325,10 +327,22 @@ def lemmatize():
def lemmatize():
    return lambda string: " ".join([lemmatizeWord(s.lower()) for s in string.split()])

+DE_SPELLCHECKER = enchant.Dict("de_DE")
+EN_SPELLCHECKER = enchant.Dict("en_US")
+
+def autocorrectWord(word,spellchecker=DE_SPELLCHECKER):
+    try:
+        return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
+    except:
+        return word

def autocorrect():
    return lambda string: " ".join([autocorrectWord(s.lower()) for s in string.split()])

-def processTextstream(textstream, pre_parse=None, in_parse=None, post_parse=None, parser=DE_PARSER):
+def processTextstream(textstream, pre_parse=None, on_tokens=None, post_parse=None, parser=DE_PARSER):
    """
    :param textstream: string-gen
    :param funclist: [func]
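For reference, a small pyenchant sketch of the check/suggest calls the new autocorrectWord builds on; the exact suggestions depend on the locally installed de_DE dictionary:

    import enchant

    d = enchant.Dict("de_DE")
    print(d.check("Fehler"))    # True: known word, so autocorrectWord returns it unchanged
    print(d.check("Fehlre"))    # False: unknown word, so the first suggestion is used
    print(d.suggest("Fehlre"))  # e.g. ['Fehler', ...]; order and content depend on the installed dictionary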
@@ -348,8 +362,8 @@ def processTextstream(textstream, pre_parse=None, in_parse=None, post_parse=None
    tokens = [tok for tok in doc]

    # in_parse
-    if in_parse is not None:
-        tokens = processTokens(tokens, in_parse)
+    if on_tokens is not None:
+        tokens = processTokens(tokens, on_tokens)

    # post_parse
@@ -388,13 +402,15 @@ pre_parse=[
]

-custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","voraus","hallo","gerne","freundlich","fragen","fehler","bitten","ehre"
+custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","voraus",
+              "hallo","gerne","freundlich","fragen","fehler","bitten","ehre", "lieb",
+              "versuchen","unbestimmt","woche","tadelos", "klappen" ,"mittlerweile", "bekommen","erreichbar"
]

-in_parse=[
+on_tokens=[
    #removeENT(["PERSON"]),
-    #todo remove addresses
+    #idea: remove addresses #so far handled with cut_after("gruss")
    #idea: spelling correction
    #idea: thesaurus
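The "#idea: thesaurus" note above could be realized as another string-level step in the same shape as lemmatize() and autocorrect(). A sketch under the assumption that each THESAURUS row starts with a canonical term followed by its synonyms; the real layout of the CSV behind thesauruspath may differ, and apply_thesaurus/SYNONYM_MAP are illustrative names only:

    # assumed layout: row = [canonical, synonym1, synonym2, ...]
    SYNONYM_MAP = {syn: row[0] for row in THESAURUS if row for syn in row[1:]}

    def apply_thesaurus(syn_map=SYNONYM_MAP):
        # map every word to its canonical synonym, leave unknown words untouched
        return lambda string: " ".join(syn_map.get(s.lower(), s.lower()) for s in string.split())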
@@ -461,18 +477,21 @@ pipe=[
path2csv = "M42-Export/Tickets_med.csv"
+path2csv = "M42-Export/de_tickets.csv"

-ticketcorpus = textacy.Corpus(DE_PARSER)
+de_corpus = textacy.Corpus(DE_PARSER)
+#en_corpus = textacy.Corpus(EN_PARSER)

## add files to textacy-corpus,
printlog("add texts to textacy-corpus")

-ticketcorpus.add_texts(
-    processTextstream(csv_to_contentStream(path2csv,"Description"), pre_parse=pre_parse, in_parse=in_parse, post_parse=post_parse)
+de_corpus.add_texts(
+    processTextstream(csv_to_contentStream(path2csv,"Description"), pre_parse=pre_parse, on_tokens=on_tokens, post_parse=post_parse)
)

for i in range(10):
-    printRandomDoc(ticketcorpus)
+    printRandomDoc(de_corpus)
@@ -526,3 +545,115 @@ sinnlose bsp: nr54065467 455a33c5 tvt?= ------problem--------
+############################################ Topic Modeling #############################################
+print("\n\n")
+start = time.time()
+
+# build dictionary of ticket categories
+labelist = []
+for texdoc in de_corpus.get(lambda texdoc : texdoc.metadata["categoryName"] not in labelist):
+    labelist.append(texdoc.metadata["categoryName"])
+
+LABELDICT = {k: v for v, k in enumerate(labelist)}
+print(LABELDICT)
+
+def label2ID(label,labeldict=LABELDICT):
+    return labeldict.get(label,len(labeldict))
+
+def generate_labled_lines(textacyCorpus):
+    for doc in textacyCorpus:
+        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpus
+        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text
+
+####################'####################' todo: move all of this into the config
+ngrams = 1
+min_df = 0.1
+max_df = 0.9
+no_below = 20
+no_above = 0.5
+
+topicModel = 'lda'
+# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
+weighting = ('tf' if topicModel == 'lda' else 'tfidf')
+
+top_topic_words = 10
+top_document_labels_per_topic = 5
+
+n_topics = 20 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 for a default topic
+
+end = time.time()
+printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
+
+####################'####################
+
+printlog("vectorize corpus...")
+vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
+
+terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in de_corpus)
+doc_term_matrix = vectorizer.fit_transform(terms_list)
+id2term = vectorizer.__getattribute__("id_to_term")
+
+##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
+
+# Initialize and train a topic model
+printlog("Initialize and train a topic model..")
+model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
+model.fit(doc_term_matrix)
+
+# Transform the corpus and interpret our model:
+printlog("Transform the corpus and interpret our model..")
+doc_topic_matrix = model.transform(doc_term_matrix)
+print()
+
+for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
+    print('topic', topic_idx, ':', ' '.join(top_terms))
+
+print()
+for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
+    print(topic_idx)
+    for j in top_docs:
+        print(de_corpus[j].metadata['categoryName'])
+
+#####################################################################################################################
+print()
+print()
+
+end = time.time()
+printlog("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
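Not part of the commit, but a short sketch of how doc_topic_matrix could be cross-checked against the categoryName labels already collected in LABELDICT, e.g. to judge whether the 20 topics resemble the ticket categories; labels_per_topic and dominant_topic are illustrative names:

    from collections import Counter

    dominant_topic = doc_topic_matrix.argmax(axis=1)   # most probable topic per document
    labels_per_topic = {t: Counter() for t in range(n_topics)}
    for doc_idx, topic_idx in enumerate(dominant_topic):
        labels_per_topic[topic_idx][de_corpus[doc_idx].metadata["categoryName"]] += 1

    for topic_idx, counts in sorted(labels_per_topic.items()):
        printlog("topic {0}: {1}".format(topic_idx, counts.most_common(3)))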

(diff of a further changed file; filename not shown in this view)

@@ -14,13 +14,54 @@ from textblob_de import PatternParser
import hunspell
from postal.parser import parse_address
+import langdetect
+import enchant

-print(parse_address(str(textacy.fileio.read_file("teststring.txt"))))
-#todo split ticket.csv into de and en
+#print(parse_address(str(textacy.fileio.read_file("teststring.txt"))))
+
+from langdetect import detect
+
+stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
+content_collumn_name = "Description"
+content_collumn = 9 # default value
+
+de_tickets=[]
+en_tickets=[]
+misc_tickets=[]
+
+error_count = 0
+for i, lst in enumerate(stream):
+    if i == 0:
+        de_tickets.append(lst)
+        en_tickets.append(lst)
+        misc_tickets.append(lst)
+    else:
+        try:
+            content_collumn_ = lst[content_collumn]
+            if detect(content_collumn_) == "de":
+                de_tickets.append(lst)
+            elif detect(content_collumn_) == "en":
+                en_tickets.append(lst)
+            else:
+                misc_tickets.append(lst)
+        except:
+            misc_tickets.append(lst)
+            error_count += 1
+
+print(error_count)
+
+textacy.fileio.write_csv(de_tickets,"M42-Export/de_tickets.csv", delimiter=";")
+textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")
+textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")
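One possible refinement of the loop above, sketched here rather than taken from the commit: seed langdetect so the de/en split is reproducible, and call detect() only once per ticket instead of twice. bucket_for is an illustrative helper name:

    from langdetect import DetectorFactory, detect

    DetectorFactory.seed = 0  # langdetect is probabilistic; a fixed seed makes runs repeatable

    def bucket_for(text):
        # pick the target list once per ticket; anything undetectable goes to misc
        try:
            return {"de": de_tickets, "en": en_tickets}.get(detect(text), misc_tickets)
        except Exception:
            return misc_tickets

    # usage inside the loop: bucket_for(lst[content_collumn]).append(lst)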
@@ -72,16 +113,14 @@ for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
"""
"""

-de_stop_words= set(
-    list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
-    list(textacy.fileio.read_file_lines("stopwords-de.txt"))
-)
-LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))
-VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
+def replaceRockDots():
+    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
+
+de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))

#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
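The nested re.sub chain in the new replaceRockDots can also be expressed with a translation table; a sketch of an equivalent single-pass variant (same output for the four characters handled above; replaceRockDots_translate and UMLAUT_TABLE are illustrative names):

    # ä/ö/ü/ß folding in one pass over the lowercased string
    UMLAUT_TABLE = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})

    def replaceRockDots_translate():
        return lambda string: string.lower().translate(UMLAUT_TABLE)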
@@ -89,21 +128,21 @@ VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
#print(blob.entities)

de_stop_words = list(map(replaceRockDots(),de_stop_words))
-LEMMAS = list(map(replaceRockDots(),LEMMAS))
-VORNAMEN = list(map(replaceRockDots(),VORNAMEN))
+#LEMMAS = list(map(replaceRockDots(),LEMMAS))
+#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
-LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
-VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))
+#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
+#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))

-textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
-textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
-textacy.fileio.write_file_lines(de_stop_words,"de_stop_words.txt")
+#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
+#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
+textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
"""

end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))