started thesaurus based on deWordNet

jannis.grundmann committed 2017-09-26 11:03:09 +02:00
parent 6b8785d987
commit 33cfbe2f99
5 changed files with 143911 additions and 93 deletions

(first file: name not shown): diff suppressed because it is too large

german_stopwords.txt (Normal file, 1855 lines): diff suppressed because it is too large

nomen2.txt (Normal file, 41965 lines): diff suppressed because it is too large

(file name not shown)

@@ -519,7 +519,7 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
     for doc in pipe:
         tokens = [tok for tok in doc]
-        print(" ".join([tok.lower_ for tok in tokens]))
+        #print(" ".join([tok.lower_ for tok in tokens]))
         # in_parse
         if token_filterlist is not None:
@@ -577,11 +577,19 @@ def filterTokens(tokens, funclist):
 custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","voraus",
-              "hallo","gerne","freundlich","fragen","fehler","bitten","ehre", "lieb",
-              "versuchen","unbestimmt","woche","tadelos", "klappen", "mittlerweile", "bekommen","erreichbar"
+              "hallo","gerne","freundlich","fragen","fehler","bitten","ehre", "lieb","helfen",
+              "versuchen","unbestimmt","woche","tadelos", "klappen", "mittlerweile", "bekommen","erreichbar","gruss",
+              "auffahren","vorgang","hinweis","institut","universitaet","name","gruss","id","erfolg","mail","folge",
+              "nummer","team","fakultaet","email","absender","tu","versenden","vorname","message",
+              "service","strasse","prozess","portal","raum","personal","moeglichkeit","fremd","wende","rueckfrage", "stehen", "verfuegung",
+              "funktionieren","kollege", "pruefen"
               ]

 filter_tokens=[
     #removeENT(["PERSON"]),
     #idea: strip postal addresses; so far done via cut_after("gruss") --> postal.parser
@@ -669,7 +677,7 @@ pipe=[
 #path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
 path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
-#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
+path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

 de_corpus = textacy.Corpus(DE_PARSER)
 #en_corpus = textacy.Corpus(EN_PARSER)
@@ -689,28 +697,6 @@ for i in range(10):
-"""
-language detection
-everything after the closing greeting ("grüße") is irrelevant, except a PS:
-maybe split categories into subcategories
-
-in general:
-fix utf encoding,
-strip emails, urls, numbers
-maybe even everything that contains one of those (or contains a .toplvldomain or special characters, or anything containing an @)
-
-separate meaningful words from garbage: 8203;verfügung -> split words at special characters
-strip abbreviations: m.a, o.ä.
-
-correct words
-meaningless examples: nr54065467 455a33c5 tvt?= ------problem--------
-"""

 end = time.time()
 printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
@@ -727,18 +713,18 @@ start = time.time()
 ngrams = 1

 min_df = 0.1
-max_df = 0.9
-no_below = 20
-no_above = 0.5
+max_df = 0.6
+#no_below = 20
+#no_above = 0.5

 topicModel = 'lda'
 # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
 weighting = ('tf' if topicModel == 'lda' else 'tfidf')

-top_topic_words = 10
-top_document_labels_per_topic = 5
+top_topic_words = 15
+top_document_labels_per_topic = 7

-n_topics = 15 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 for a default topic
+n_topics = 20 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 for a default topic
@@ -789,6 +775,7 @@ for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
     printlog(topic_idx)
     for j in top_docs:
         printlog(de_corpus[j].metadata['categoryName'])
+    print()

 #####################################################################################################################
 print()

testra.py (133 changed lines)

@@ -26,68 +26,8 @@ nomen=[]
-### extract from derewo
-#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
-raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
-
-for line in raw:
-    line_list=line.split()
-    if line_list[2] == "NN":
-        string = line_list[1].lower()
-
-        # replaceRockDots
-        string = re.sub(r'[ß]', "ss", string)
-        string = re.sub(r'[ö]', "oe", string)
-        string = re.sub(r'[ü]', "ue", string)
-        string = re.sub(r'[ä]', "ae", string)
-
-        nomen.append(string.lower().strip())
-
-textacy.fileio.write_file_lines(nomen,"nomen2.txt")

 """
 ### extract from deWordNet.xml
 #https://github.com/hdaSprachtechnologie/odenet
@@ -112,7 +52,78 @@ for r in root:
 textacy.fileio.write_file_lines(nomen,"nomen.txt")
 """

+### extract from derewo
+#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
+raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
+
+for line in raw:
+    line_list=line.split()
+    if line_list[2] == "NN":
+        string = line_list[1].lower()
+
+        # replaceRockDots
+        string = re.sub(r'[ß]', "ss", string)
+        string = re.sub(r'[ö]', "oe", string)
+        string = re.sub(r'[ü]', "ue", string)
+        string = re.sub(r'[ä]', "ae", string)
+
+        nomen.append(string.lower().strip())
+
+textacy.fileio.write_file_lines(nomen,"nomen2.txt")
 """