started thesaurus based on deWordNet

jannis.grundmann 2017-09-26 11:03:09 +02:00
parent 6b8785d987
commit 33cfbe2f99
5 changed files with 143911 additions and 93 deletions

File diff suppressed because it is too large.

1855  german_stopwords.txt  Normal file

File diff suppressed because it is too large.

41965  nomen2.txt  Normal file

File diff suppressed because it is too large.

View File

@@ -519,7 +519,7 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
     for doc in pipe:
         tokens = [tok for tok in doc]
-        print(" ".join([tok.lower_ for tok in tokens]))
+        #print(" ".join([tok.lower_ for tok in tokens]))
         # in_parse
         if token_filterlist is not None:
@@ -577,11 +577,19 @@ def filterTokens(tokens, funclist):
 custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","voraus",
-              "hallo","gerne","freundlich","fragen","fehler","bitten","ehre", "lieb",
-              "versuchen","unbestimmt","woche","tadelos", "klappen" ,"mittlerweile", "bekommen","erreichbar"
+              "hallo","gerne","freundlich","fragen","fehler","bitten","ehre", "lieb","helfen",
+              "versuchen","unbestimmt","woche","tadelos", "klappen" ,"mittlerweile", "bekommen","erreichbar","gruss",
+              "auffahren","vorgang","hinweis","institut","universitaet","name","gruss","id","erfolg","mail","folge",
+              "nummer","team","fakultaet","email","absender","tu","versenden","vorname","message",
+              "service","strasse","prozess","portal","raum","personal","moeglichkeit","fremd","wende","rueckfrage", "stehen", "verfuegung",
+              "funktionieren","kollege", "pruefen"
               ]
 filter_tokens=[
     #removeENT(["PERSON"]),
     #idee addressen enfernen #bisher mit cut_after("gruss") --> postal.parser
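The last comment above refers to a cut_after("gruss") helper for cutting off signatures and postal addresses; that helper is not part of this diff. A minimal sketch of what such a token filter could look like, assuming the same factory style as the other entries in filter_tokens (a function that returns a filter applied to the spaCy token list):

# Hypothetical sketch, not the project's actual cut_after: keep everything up to and
# including the first closing word (e.g. "gruss") and drop the rest, so the signature
# and address block below the greeting is ignored.
def cut_after(word="gruss"):
    def filter(tokens):
        lowered = [tok.lower_ for tok in tokens]
        if word in lowered:
            return tokens[:lowered.index(word) + 1]
        return tokens
    return filter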
@ -669,7 +677,7 @@ pipe=[
#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv" #path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv" path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv" path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
de_corpus = textacy.Corpus(DE_PARSER) de_corpus = textacy.Corpus(DE_PARSER)
#en_corpus = textacy.Corpus(EN_PARSER) #en_corpus = textacy.Corpus(EN_PARSER)
@@ -689,28 +697,6 @@ for i in range(10):
-"""
-spracherkennung
-alles nach grüße ist irrelevant außer PS:
-vllt kategorien in unterkategorien aufteilen
-allg:
-utf-korregieren,
-emails, urls, nummern raus
-vllt sogar alles, was ebend jenes enthält (oder auf .toplvldomain bzw. sonderzeichen enthält oder alles was ein @ enthält
-sinnvoller wörter von müll trennen: 8203;verfügung -> bei sonderzeichen wörter trennen
-abkürzungen raus: m.a, o.ä.
-wörter korrigieren
-sinnlose bsp: nr54065467 455a33c5 tvt?= ------problem--------
-"""
 end = time.time()
 printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
@@ -727,18 +713,18 @@ start = time.time()
 ngrams = 1
 min_df = 0.1
-max_df = 0.9
-no_below = 20
-no_above = 0.5
+max_df = 0.6
+#no_below = 20
+#no_above = 0.5
 topicModel = 'lda'
 # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
 weighting = ('tf' if topicModel == 'lda' else 'tfidf')
-top_topic_words = 10
-top_document_labels_per_topic = 5
+top_topic_words = 15
+top_document_labels_per_topic = 7
-n_topics = 15 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
+n_topics = 20 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
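For context, the parameters above would typically feed a document-term matrix and the textacy TopicModel referenced in the comment. A rough sketch, assuming the textacy 0.4-era API (textacy.vsm.doc_term_matrix returning the matrix plus an id-to-term mapping) and a terms_list built elsewhere from de_corpus; neither is shown in this hunk:

import textacy

# Assumption: terms_list was built from the corpus, e.g.
# terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in de_corpus)
doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
    terms_list, weighting=weighting, min_df=min_df, max_df=max_df)

model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)
doc_topic_matrix = model.get_doc_topic_matrix(doc_term_matrix)

for topic_idx, top_terms in model.top_topic_terms(id2term, top_n=top_topic_words):
    printlog("topic {}: {}".format(topic_idx, " ".join(top_terms)))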
@@ -789,6 +775,7 @@ for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
     printlog(topic_idx)
     for j in top_docs:
         printlog(de_corpus[j].metadata['categoryName'])
+    print()
 #####################################################################################################################
 print()

133  testra.py

@@ -26,68 +26,8 @@ nomen=[]
-### extract from derewo
-#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
-raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
-for line in raw:
-    line_list=line.split()
-    if line_list[2] == "NN":
-        string = line_list[1].lower()
-        # replaceRockDots
-        string = re.sub(r'[ß]', "ss", string)
-        string = re.sub(r'[ö]', "oe", string)
-        string = re.sub(r'[ü]', "ue", string)
-        string = re.sub(r'[ä]', "ae", string)
-        nomen.append(string.lower().strip())
-textacy.fileio.write_file_lines(nomen,"nomen2.txt")
-"""
-### extract from deWordNet.xml
+### extract from deWordNet.xml
 #https://github.com/hdaSprachtechnologie/odenet
@@ -112,7 +52,78 @@ for r in root:
 textacy.fileio.write_file_lines(nomen,"nomen.txt")
 """
+### extract from derewo
+#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
+raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
+for line in raw:
+    line_list=line.split()
+    if line_list[2] == "NN":
+        string = line_list[1].lower()
+        # replaceRockDots
+        string = re.sub(r'[ß]', "ss", string)
+        string = re.sub(r'[ö]', "oe", string)
+        string = re.sub(r'[ü]', "ue", string)
+        string = re.sub(r'[ä]', "ae", string)
+        nomen.append(string.lower().strip())
+textacy.fileio.write_file_lines(nomen,"nomen2.txt")
+"""