ready for another test run
This commit is contained in:
parent 89ea2a863d
commit 2ee9937d23
4 file diffs suppressed because they are too large.
testo.py (179 changed lines)
@@ -2,6 +2,9 @@
import time
import enchant

start = time.time()

import logging
@@ -49,11 +52,14 @@ logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
thesauruspath = config.get("filepath","thesauruspath")
THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))

from langdetect import detect

DE_PARSER = spacy.load("de")  # todo language detection; idea: use different corpora for different languages
#EN_PARSER = spacy.load("en")

SPELLCHECKER = hunspell.HunSpell('/usr/share/hunspell/de_DE.dic',
                                 '/usr/share/hunspell/de_DE.aff')

def replaceRockDots():
@@ -72,10 +78,15 @@ VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
"""

de_stop_words = list(map(textacy.preprocess.normalize_whitespace,textacy.fileio.read_file_lines("de_stop_words.txt")))

#en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS))

LEMMAS = list(textacy.fileio.read_file_lines("lemmas.txt"))
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,textacy.fileio.read_file_lines("firstnames.txt")))

print(de_stop_words[10:30])
print(LEMMAS[10:30])
print(VORNAMEN[10:30])
@@ -164,7 +175,7 @@ def csv_to_contentStream(path2csv: str, content_collumn_name: str):

############# return bool
############# filter tokens

def keepPOS(pos_list):
    return lambda tok : tok.pos_ in pos_list
@@ -195,10 +206,12 @@ def removeENT(ent_list):

def remove_words_containing_Numbers():
    return lambda tok: not bool(re.search('\d', tok.lower_))

"""
def remove_words_containing_topLVL():
    return lambda tok: not bool(re.search(regex_topLvl, tok.lower_))
"""

def remove_words_containing_specialCharacters():
    return lambda tok: not bool(re.search(regex_specialChars, tok.lower_))
@@ -214,9 +227,6 @@ def remove_first_names():

# if a word is not in the vocab, first check whether a known subword is contained in it; if so, check whether junk precedes or follows it and split the word if necessary (see the sketch below)
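A minimal sketch of that to-do, assuming the enchant-based DE_SPELLCHECKER this commit introduces further down; the helper name extract_known_subword and the min_len parameter are illustrative, not part of the repository:

import enchant

DE_SPELLCHECKER = enchant.Dict("de_DE")  # same dictionary the commit sets up below

def extract_known_subword(word, spellchecker=DE_SPELLCHECKER, min_len=4):
    """If `word` is unknown, return its longest dictionary-known subword; otherwise return `word` unchanged."""
    if spellchecker.check(word):
        return word
    # try substrings from longest to shortest and keep the first one the dictionary accepts,
    # implicitly stripping whatever junk precedes or follows it
    for length in range(len(word) - 1, min_len - 1, -1):
        for start in range(len(word) - length + 1):
            candidate = word[start:start + length]
            if spellchecker.check(candidate):
                return candidate
    return word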

############# strings
@@ -310,14 +320,6 @@ def lemmatizeWord(word,l_dict=lemma_dict,w_dict=word_dict):
    except:
        print(word)
        return word

def autocorrectWord(word,spellchecker=SPELLCHECKER):
    try:
        return spellchecker.suggest(word)[0]
    except:
        return word

"""
def lemmatize():
    return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])
@@ -325,10 +327,22 @@ def lemmatize():
def lemmatize():
    return lambda string: " ".join([lemmatizeWord(s.lower()) for s in string.split()])

DE_SPELLCHECKER = enchant.Dict("de_DE")
EN_SPELLCHECKER = enchant.Dict("en_US")

def autocorrectWord(word,spellchecker=DE_SPELLCHECKER):
    try:
        return spellchecker.suggest(word)[0] if not spellchecker.check(word) else word
    except:
        return word

def autocorrect():
    return lambda string: " ".join([autocorrectWord(s.lower()) for s in string.split()])
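A short usage sketch of the string-level cleaners defined above, assuming lemmatize() and autocorrect() are in scope; the clean helper and the input string are made up for illustration:

# lemmatize() and autocorrect() each return a str -> str function,
# so they can be chained or dropped into the pre_parse list further down
clean = lambda s: autocorrect()(lemmatize()(s))
print(clean("haloo wie gehts"))  # prints the lemmatized, spell-corrected string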

def processTextstream(textstream, pre_parse=None, in_parse=None, post_parse=None, parser=DE_PARSER):
def processTextstream(textstream, pre_parse=None, on_tokens=None, post_parse=None, parser=DE_PARSER):
    """
    :param textstream: string-gen
    :param funclist: [func]
@@ -348,8 +362,8 @@ processTextstream(textstream, pre_parse=None, in_parse=None, post_parse=None
    tokens = [tok for tok in doc]

    # in_parse
    if in_parse is not None:
        tokens = processTokens(tokens, in_parse)
    if on_tokens is not None:
        tokens = processTokens(tokens, on_tokens)

    # post_parse
@@ -388,13 +402,15 @@ pre_parse=[
]

custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","voraus","hallo","gerne","freundlich","fragen","fehler","bitten","ehre"
custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","voraus",
              "hallo","gerne","freundlich","fragen","fehler","bitten","ehre", "lieb",
              "versuchen","unbestimmt","woche","tadelos", "klappen" ,"mittlerweile", "bekommen","erreichbar"
]

in_parse=[
on_tokens=[
    #removeENT(["PERSON"]),
    #todo remove addresses
    #idea: remove addresses  #so far handled with cut_after("gruss") (see the sketch after this hunk)
    #idea: spelling correction
    #idea: thesaurus
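The cut_after("gruss") helper mentioned above is not shown in this diff; a minimal sketch of what such a truncation step could look like, assuming the closing greeting ("gruss"/"gruß") marks where signatures and postal addresses start. The implementation below is illustrative, not the repository's:

import re

def cut_after(text, marker=r"gru(ss|ß)"):
    # keep everything up to and including the first occurrence of the greeting,
    # dropping the signature/address block that usually follows it
    match = re.search(marker, text, flags=re.IGNORECASE)
    return text[:match.end()] if match else text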
@@ -461,18 +477,21 @@ pipe=[

path2csv = "M42-Export/Tickets_med.csv"
path2csv = "M42-Export/de_tickets.csv"

de_corpus = textacy.Corpus(DE_PARSER)
#en_corpus = textacy.Corpus(EN_PARSER)

ticketcorpus = textacy.Corpus(DE_PARSER)

## add files to textacy-corpus,
printlog("add texts to textacy-corpus")
ticketcorpus.add_texts(
    processTextstream(csv_to_contentStream(path2csv,"Description"), pre_parse=pre_parse, in_parse=in_parse, post_parse=post_parse)
de_corpus.add_texts(
    processTextstream(csv_to_contentStream(path2csv,"Description"), pre_parse=pre_parse, on_tokens=on_tokens, post_parse=post_parse)
)

for i in range(10):
    printRandomDoc(ticketcorpus)
    printRandomDoc(de_corpus)
@@ -526,3 +545,115 @@ sinnlose bsp: nr54065467 455a33c5 tvt?= ------problem--------

############################################ Topic Modeling #############################################
print("\n\n")
start = time.time()

# build dictionary of ticket categories
labelist = []

for texdoc in de_corpus.get(lambda texdoc : texdoc.metadata["categoryName"] not in labelist):
    labelist.append(texdoc.metadata["categoryName"])

LABELDICT = {k: v for v, k in enumerate(labelist)}
print(LABELDICT)

def label2ID(label,labeldict=LABELDICT):
    return labeldict.get(label,len(labeldict))

def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
        # generate "[topic1, topic2, ...] tok1 tok2 tok3" lines out of the corpus
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text

####################'#################### todo: move all of this into the config

ngrams = 1

min_df = 0.1
max_df = 0.9
no_below = 20
no_above = 0.5

topicModel = 'lda'
# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
weighting = ('tf' if topicModel == 'lda' else 'tfidf')

top_topic_words = 10
top_document_labels_per_topic = 5

n_topics = 20 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 because of a default topic

end = time.time()
printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))

####################'####################

printlog("vectorize corpus...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in de_corpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.__getattribute__("id_to_term")
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################

# Initialize and train a topic model
printlog("Initialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)

# Transform the corpus and interpret our model:
printlog("Transform the corpus and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()

for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
    print('topic', topic_idx, ':', ' '.join(top_terms))

print()
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
    print(topic_idx)
    for j in top_docs:
        print(de_corpus[j].metadata['categoryName'])

#####################################################################################################################
print()
print()

end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
testra.py (69 changed lines)
@@ -14,13 +14,54 @@ from textblob_de import PatternParser
import hunspell
from postal.parser import parse_address

import langdetect
import enchant

print(parse_address(str(textacy.fileio.read_file("teststring.txt"))))

#todo split ticket.csv into de and en

#print(parse_address(str(textacy.fileio.read_file("teststring.txt"))))
from langdetect import detect

stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
content_collumn_name = "Description"
content_collumn = 9 # default value

de_tickets=[]
en_tickets=[]
misc_tickets=[]

error_count = 0
for i, lst in enumerate(stream):
    if i == 0:
        de_tickets.append(lst)
        en_tickets.append(lst)
        misc_tickets.append(lst)
    else:
        try:
            content_collumn_ = lst[content_collumn]
            if detect(content_collumn_) == "de":
                de_tickets.append(lst)
            elif detect(content_collumn_) == "en":
                en_tickets.append(lst)
            else:
                misc_tickets.append(lst)

        except:
            misc_tickets.append(lst)
            error_count += 1

print(error_count)

textacy.fileio.write_csv(de_tickets,"M42-Export/de_tickets.csv", delimiter=";")
textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")
textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")
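A robustness note on the language split above: langdetect is nondeterministic for short or mixed-language texts unless its seed is fixed, and detect() is called twice per ticket in the loop. A hedged sketch of the adjustment; DetectorFactory is standard langdetect API, but this snippet is not part of the commit:

from langdetect import DetectorFactory, detect

DetectorFactory.seed = 0                     # fixed seed makes detect() reproducible across runs
lang = detect("Das ist ein Beispieltext")    # detect once per ticket and branch on the result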
@@ -72,16 +113,14 @@ for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
"""

"""
de_stop_words= set(
    list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
    list(textacy.fileio.read_file_lines("stopwords-de.txt"))
)
def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))

LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))

VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))

#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
@@ -89,21 +128,21 @@ VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
#print(blob.entities)

de_stop_words = list(map(replaceRockDots(),de_stop_words))
LEMMAS = list(map(replaceRockDots(),LEMMAS))
VORNAMEN = list(map(replaceRockDots(),VORNAMEN))
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))

textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words,"de_stop_words.txt")
#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")

"""

end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))