Ready for another test run
This commit is contained in:
parent 89ea2a863d
commit 2ee9937d23
File diffs suppressed for 4 further changed files because they are too large.
testo.py (179 changed lines)
@@ -2,6 +2,9 @@
 
 import time
 
+import enchant
+
 start = time.time()
 
 import logging
@@ -49,11 +52,14 @@ logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INF
 thesauruspath = config.get("filepath","thesauruspath")
 THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
 
+from langdetect import detect
+
 DE_PARSER = spacy.load("de") #todo spacherkennung idee: verschiedene Corpi für verschiedene Sprachen
+#EN_PARSER = spacy.load("en")
 
-SPELLCHECKER = hunspell.HunSpell('/usr/share/hunspell/de_DE.dic',
-                                 '/usr/share/hunspell/de_DE.aff')
 
 def replaceRockDots():
@@ -72,10 +78,15 @@ VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
 """
 
 de_stop_words = list(map(textacy.preprocess.normalize_whitespace,textacy.fileio.read_file_lines("de_stop_words.txt")))
 
+#en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS))
+
 LEMMAS = list(textacy.fileio.read_file_lines("lemmas.txt"))
 VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,textacy.fileio.read_file_lines("firstnames.txt")))
 
 print(de_stop_words[10:30])
 
 print(LEMMAS[10:30])
 print(VORNAMEN[10:30])
 
@@ -164,7 +175,7 @@ def csv_to_contentStream(path2csv: str, content_collumn_name: str):
 
 
-############# return bool
+############# filter tokens
 
 def keepPOS(pos_list):
     return lambda tok : tok.pos_ in pos_list
@@ -195,10 +206,12 @@ def removeENT(ent_list):
 
 def remove_words_containing_Numbers():
     return lambda tok: not bool(re.search('\d', tok.lower_))
 
 """
 def remove_words_containing_topLVL():
     return lambda tok: not bool(re.search(regex_topLvl, tok.lower_))
 """
 
 def remove_words_containing_specialCharacters():
     return lambda tok: not bool(re.search(regex_specialChars, tok.lower_))
 
@@ -214,9 +227,6 @@ def remove_first_names():
 
 
-#falls wort nicht in vocab, erst schauen ob teilwort bekannt ist, falls ja, schauen ob es davor oder danach bullshit stehen hat. ggf trennen
-
-
 
 ############# strings
@@ -310,14 +320,6 @@ def lemmatizeWord(word,l_dict=lemma_dict,w_dict=word_dict):
     except:
         print(word)
         return word
-
-def autocorrectWord(word,spellchecker=SPELLCHECKER):
-    try:
-        return spellchecker.suggest(word)[0]
-    except:
-        return word
-
-
 """
 def lemmatize():
     return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])
@@ -325,10 +327,22 @@ def lemmatize():
 def lemmatize():
     return lambda string: " ".join([lemmatizeWord(s.lower()) for s in string.split()])
 
+DE_SPELLCHECKER = enchant.Dict("de_DE")
+EN_SPELLCHECKER = enchant.Dict("en_US")
+
+def autocorrectWord(word,spellchecker=DE_SPELLCHECKER):
+    try:
+        return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
+    except:
+        return word
+
+
 def autocorrect():
     return lambda string: " ".join([autocorrectWord(s.lower()) for s in string.split()])
 
-def processTextstream(textstream, pre_parse=None, in_parse=None, post_parse=None, parser=DE_PARSER):
+def processTextstream(textstream, pre_parse=None, on_tokens=None, post_parse=None, parser=DE_PARSER):
     """
     :param textstream: string-gen
     :param funclist: [func]
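The hunk above swaps the hunspell-based autocorrectWord for pyenchant's check-then-suggest pattern. As a point of reference, here is a minimal standalone sketch of that pattern, assuming pyenchant and a de_DE dictionary are installed; the sample words are invented for illustration.

    import enchant

    de_dict = enchant.Dict("de_DE")  # raises if no de_DE dictionary is available to enchant

    def autocorrect_word(word, dictionary=de_dict):
        # keep correctly spelled words as-is; otherwise take the first suggestion, if any
        if dictionary.check(word):
            return word
        suggestions = dictionary.suggest(word)
        return suggestions[0] if suggestions else word

    print(autocorrect_word("Drucker"))   # known word, returned unchanged
    print(autocorrect_word("Druckerr"))  # misspelling, replaced by the first suggestion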
@@ -348,8 +362,8 @@ def processTextstream(textstream, pre_parse=None, in_parse=None, post_parse=None
         tokens = [tok for tok in doc]
 
         # in_parse
-        if in_parse is not None:
-            tokens = processTokens(tokens, in_parse)
+        if on_tokens is not None:
+            tokens = processTokens(tokens, on_tokens)
 
 
         # post_parse
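processTokens is called here but not shown in this diff, so the helper below is only an assumed sketch of how a list of token predicates (as built by keepPOS, remove_words_containing_Numbers, and the other factories above) might be applied; the real implementation in the repository may differ.

    # hypothetical helper, named apply_token_filters only for this sketch
    def apply_token_filters(tokens, predicates):
        for keep in predicates:
            tokens = [tok for tok in tokens if keep(tok)]
        return tokens

    # possible usage with the factories defined earlier in testo.py:
    # tokens = apply_token_filters(tokens, [keepPOS(["NOUN", "VERB"]), remove_words_containing_Numbers()])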
@@ -388,13 +402,15 @@ pre_parse=[
 
 ]
 
-custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","voraus","hallo","gerne","freundlich","fragen","fehler","bitten","ehre"
+custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","voraus",
+              "hallo","gerne","freundlich","fragen","fehler","bitten","ehre", "lieb",
+              "versuchen","unbestimmt","woche","tadelos", "klappen" ,"mittlerweile", "bekommen","erreichbar"
 ]
 
 
-in_parse=[
+on_tokens=[
     #removeENT(["PERSON"]),
-    #todo addressen enfernen
+    #idee addressen enfernen #bisher mit cut_after("gruss")
     #idee rechtschreibkorrektur
     #idee thesaurus
 
@@ -461,18 +477,21 @@ pipe=[
 
 
 path2csv = "M42-Export/Tickets_med.csv"
+path2csv = "M42-Export/de_tickets.csv"
 
+de_corpus = textacy.Corpus(DE_PARSER)
+#en_corpus = textacy.Corpus(EN_PARSER)
 
-ticketcorpus = textacy.Corpus(DE_PARSER)
 
 
 ## add files to textacy-corpus,
 printlog("add texts to textacy-corpus")
-ticketcorpus.add_texts(
-    processTextstream(csv_to_contentStream(path2csv,"Description"), pre_parse=pre_parse, in_parse=in_parse, post_parse=post_parse)
+de_corpus.add_texts(
+    processTextstream(csv_to_contentStream(path2csv,"Description"), pre_parse=pre_parse, on_tokens=on_tokens, post_parse=post_parse)
 )
 
 for i in range(10):
-    printRandomDoc(ticketcorpus)
+    printRandomDoc(de_corpus)
 
 
@@ -526,3 +545,115 @@ sinnlose bsp: nr54065467 455a33c5 tvt?= ------problem--------
 
 
 
+############################################ Topic Modeling #############################################
+print("\n\n")
+start = time.time()
+
+
+# build citionary of ticketcategories
+labelist = []
+
+for texdoc in de_corpus.get(lambda texdoc : texdoc.metadata["categoryName"] not in labelist):
+    labelist.append(texdoc.metadata["categoryName"])
+
+LABELDICT = {k: v for v, k in enumerate(labelist)}
+print(LABELDICT)
+
+
+def label2ID(label,labeldict=LABELDICT):
+    return labeldict.get(label,len(labeldict))
+
+def generate_labled_lines(textacyCorpus):
+    for doc in textacyCorpus:
+        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpus
+        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text
+
+
+####################'####################' todo alles in config
+
+ngrams = 1
+
+min_df = 0.1
+max_df = 0.9
+no_below = 20
+no_above = 0.5
+
+topicModel = 'lda'
+# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
+weighting = ('tf' if topicModel == 'lda' else 'tfidf')
+
+top_topic_words = 10
+top_document_labels_per_topic = 5
+
+n_topics = 20 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
+
+
+end = time.time()
+printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
+
+
+####################'####################
+
+printlog("vectorize corpus...")
+vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
+
+terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in de_corpus)
+doc_term_matrix = vectorizer.fit_transform(terms_list)
+id2term = vectorizer.__getattribute__("id_to_term")
+
+
+##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
+
+# Initialize and train a topic model
+printlog("Initialize and train a topic model..")
+model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
+model.fit(doc_term_matrix)
+
+#Transform the corpus and interpret our model:
+printlog("Transform the corpus and interpret our model..")
+doc_topic_matrix = model.transform(doc_term_matrix)
+print()
+
+for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
+    print('topic', topic_idx, ':', ' '.join(top_terms))
+
+print()
+for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
+    print(topic_idx)
+    for j in top_docs:
+        print(de_corpus[j].metadata['categoryName'])
+
+#####################################################################################################################
+print()
+print()
+
+
+end = time.time()
+printlog("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
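The block added above wires de_corpus into textacy's Vectorizer and TopicModel. For readers without that textacy version, the same term-count, LDA-fit, top-terms round trip can be sketched with scikit-learn instead; the three sample tickets and all parameter values below are invented for illustration and are not part of this commit.

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

    docs = [  # hypothetical, already preprocessed ticket texts
        "drucker druckt nicht mehr",
        "passwort kann nicht zurueckgesetzt werden",
        "wlan verbindung bricht staendig ab",
    ]

    vectorizer = CountVectorizer(min_df=1, max_df=0.9)      # plain term counts, i.e. 'tf' weighting
    doc_term_matrix = vectorizer.fit_transform(docs)

    model = LatentDirichletAllocation(n_components=3, random_state=0)
    doc_topic_matrix = model.fit_transform(doc_term_matrix) # rows: docs, cols: topic weights

    terms = vectorizer.get_feature_names_out()              # sklearn >= 1.0
    for topic_idx, weights in enumerate(model.components_):
        top_terms = [terms[i] for i in weights.argsort()[::-1][:5]]
        print('topic', topic_idx, ':', ' '.join(top_terms))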
testra.py (69 changed lines)
@@ -14,13 +14,54 @@ from textblob_de import PatternParser
 import hunspell
 from postal.parser import parse_address
 
+import langdetect
+import enchant
+
 
-print(parse_address(str(textacy.fileio.read_file("teststring.txt"))))
+#todo ticket.csv aufteilen in de und en
+
+
+#print(parse_address(str(textacy.fileio.read_file("teststring.txt"))))
+from langdetect import detect
+
+
+stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
+content_collumn_name = "Description"
+content_collumn = 9 # standardvalue
+
+de_tickets=[]
+en_tickets=[]
+misc_tickets=[]
+
+error_count = 0
+for i, lst in enumerate(stream):
+    if i == 0:
+        de_tickets.append(lst)
+        en_tickets.append(lst)
+        misc_tickets.append(lst)
+    else:
+        try:
+            content_collumn_ = lst[content_collumn]
+            if detect(content_collumn_) == "de":
+                de_tickets.append(lst)
+            elif detect(content_collumn_) == "en":
+                en_tickets.append(lst)
+            else:
+                misc_tickets.append(lst)
+
+        except:
+            misc_tickets.append(lst)
+            error_count += 1
+
+print(error_count)
+
+textacy.fileio.write_csv(de_tickets,"M42-Export/de_tickets.csv", delimiter=";")
+textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")
+textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")
+
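The new loop above routes each CSV row into a German, English, or miscellaneous bucket via langdetect's detect(). A tiny standalone sketch of that routing follows, with invented strings; DetectorFactory.seed pins langdetect's otherwise non-deterministic output.

    from langdetect import detect, DetectorFactory

    DetectorFactory.seed = 0  # langdetect is probabilistic; seed it for repeatable results

    buckets = {"de": [], "en": [], "misc": []}
    for text in ["Der Drucker druckt nicht mehr.", "My password does not work.", "12345"]:
        try:
            lang = detect(text)          # raises on texts with no usable features, e.g. digits only
        except Exception:
            lang = None
        buckets[lang if lang in ("de", "en") else "misc"].append(text)

    print({language: len(texts) for language, texts in buckets.items()})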
@@ -72,16 +113,14 @@ for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
 """
 
 
 """
-de_stop_words= set(
-    list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
-    list(textacy.fileio.read_file_lines("stopwords-de.txt"))
-)
+def replaceRockDots():
+    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
 
-LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))
 
-VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
+de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))
 
 
 #blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
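replaceRockDots() above folds German umlauts and ß into ASCII digraphs before the stop-word list is normalized. The same idea as a plain, self-contained function for a quick check (str.replace instead of nested re.sub, purely for readability):

    def replace_rock_dots(text):
        # same transliteration as replaceRockDots(): lowercase, then ae/oe/ue/ss folding
        text = text.lower()
        for umlaut, ascii_pair in (("ä", "ae"), ("ö", "oe"), ("ü", "ue"), ("ß", "ss")):
            text = text.replace(umlaut, ascii_pair)
        return text

    print(replace_rock_dots("Größenänderung"))  # -> groessenaenderung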
@@ -89,21 +128,21 @@ VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
 #print(blob.entities)
 
 de_stop_words = list(map(replaceRockDots(),de_stop_words))
-LEMMAS = list(map(replaceRockDots(),LEMMAS))
-VORNAMEN = list(map(replaceRockDots(),VORNAMEN))
+#LEMMAS = list(map(replaceRockDots(),LEMMAS))
+#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))
 
 de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
-LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
-VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))
+#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
+#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))
 
 
-textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
-textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
-textacy.fileio.write_file_lines(de_stop_words,"de_stop_words.txt")
+#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
+#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
+textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
 
 """
 
 end = time.time()
 print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))