Started building the thesaurus based on deWordNet
commit 33cfbe2f99 (parent 6b8785d987)
3 file diffs suppressed because they are too large.
testo.py (51 lines changed)
@@ -519,7 +519,7 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
     for doc in pipe:

         tokens = [tok for tok in doc]
-        print(" ".join([tok.lower_ for tok in tokens]))
+        #print(" ".join([tok.lower_ for tok in tokens]))

         # in_parse
         if token_filterlist is not None:
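For context: `pipe` is presumably the streaming output of the spaCy parser defined above this hunk. A minimal sketch of how it is typically built; that DE_PARSER is a loaded spaCy German model and that textstream yields one ticket text per iteration are assumptions, not taken from the diff:

# Sketch, not part of the commit: how "pipe" is typically produced.
# Assumes DE_PARSER is a loaded spaCy model and textstream is an
# iterable of raw ticket strings.
pipe = DE_PARSER.pipe(textstream, batch_size=100)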
@@ -577,11 +577,19 @@ def filterTokens(tokens, funclist):


 custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","voraus",
-              "hallo","gerne","freundlich","fragen","fehler","bitten","ehre", "lieb",
-              "versuchen","unbestimmt","woche","tadelos", "klappen" ,"mittlerweile", "bekommen","erreichbar"
+              "hallo","gerne","freundlich","fragen","fehler","bitten","ehre", "lieb","helfen",
+              "versuchen","unbestimmt","woche","tadelos", "klappen" ,"mittlerweile", "bekommen","erreichbar","gruss",
+              "auffahren","vorgang","hinweis","institut","universitaet","name","gruss","id","erfolg","mail","folge",
+              "nummer","team","fakultaet","email","absender","tu","versenden","vorname","message",
+              "service","strasse","prozess","portal","raum","personal","moeglichkeit","fremd","wende","rueckfrage", "stehen", "verfuegung",
+              "funktionieren","kollege", "pruefen"
               ]


 filter_tokens=[
     #removeENT(["PERSON"]),
     # idea: strip postal addresses; so far done with cut_after("gruss") --> postal.parser
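The entries in filter_tokens are apparently filter functions applied in sequence by filterTokens(tokens, funclist). A minimal sketch of a stopword filter over custom_words, assuming that convention; the helper name removeWords is hypothetical, the real helpers are defined elsewhere in testo.py:

# Sketch only: a filter in the style of the filter_tokens list.
# "removeWords" is a hypothetical name, not taken from the commit.
def removeWords(words):
    stopset = set(words)
    def filterfunc(tokens):
        # drop every spaCy token whose lowercased form is a custom stopword
        return [tok for tok in tokens if tok.lower_ not in stopset]
    return filterfunc

# usage: tokens = filterTokens(tokens, [removeWords(custom_words)])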
@@ -669,7 +677,7 @@ pipe=[


 #path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
 path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
-#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
+path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

 de_corpus = textacy.Corpus(DE_PARSER)
 #en_corpus = textacy.Corpus(EN_PARSER)
@@ -689,28 +697,6 @@ for i in range(10):



-"""
-
-language detection
-
-everything after the closing greeting ("grüße") is irrelevant, except a PS:
-
-maybe split the categories into subcategories
-
-general:
-fix utf encoding,
-
-strip emails, urls, numbers
-maybe even drop everything that contains one (or contains a .topleveldomain, special characters, or an @)
-
-separate meaningful words from garbage: 8203;verfügung -> split words on special characters
-
-strip abbreviations: m.a, o.ä.
-
-correct misspelled words
-
-meaningless examples: nr54065467 455a33c5 tvt?= ------problem--------
-
-"""
 end = time.time()
 printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
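The deleted notes above describe cleaning steps (strip emails, urls and numbers; split words glued to special characters). As an illustration only, not code from this commit, a plain-regex version of those steps could look like this:

import re

# Sketch of the cleaning steps listed in the removed TODO notes.
def clean_text(text):
    # drop anything containing an @ (emails and the like)
    text = re.sub(r'\S*@\S*', " ", text)
    # drop urls
    text = re.sub(r'(https?://|www\.)\S+', " ", text)
    # drop bare numbers such as "54065467"
    text = re.sub(r'\b\d+\b', " ", text)
    # split words off leftover special characters, e.g. "8203;verfügung"
    text = re.sub(r'[;&#]+', " ", text)
    return re.sub(r'\s+', " ", text).strip()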
@@ -727,18 +713,18 @@ start = time.time()

 ngrams = 1

 min_df = 0.1
-max_df = 0.9
-no_below = 20
-no_above = 0.5
+max_df = 0.6
+#no_below = 20
+#no_above = 0.5

 topicModel = 'lda'
 # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
 weighting = ('tf' if topicModel == 'lda' else 'tfidf')

-top_topic_words = 10
-top_document_labels_per_topic = 5
+top_topic_words = 15
+top_document_labels_per_topic = 7

-n_topics = 15 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 for a default topic
+n_topics = 20 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 for a default topic
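A sketch of how these parameters typically feed the textacy topic-model API named in the comment above (get_doc_topic_matrix) and in the model.top_topic_docs loop below; signatures follow the textacy docs of that era and may differ in other versions, so this is not the commit's code:

# Sketch under the above assumptions: vectorize the corpus and fit the model.
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True)
              for doc in de_corpus)
doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
    terms_list, weighting=weighting, min_df=min_df, max_df=max_df)

model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)
doc_topic_matrix = model.get_doc_topic_matrix(doc_term_matrix)

for topic_idx, top_terms in model.top_topic_terms(id2term, top_n=top_topic_words):
    printlog("topic {0}: {1}".format(topic_idx, " ".join(top_terms)))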
@@ -789,6 +775,7 @@ for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
     printlog(topic_idx)
     for j in top_docs:
         printlog(de_corpus[j].metadata['categoryName'])
+    print()

 #####################################################################################################################
 print()
testra.py (133 lines changed)
@@ -26,68 +26,8 @@ nomen=[]




-### extract from derewo
-
-#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
-
-raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
-
-for line in raw:
-    line_list=line.split()
-    if line_list[2] == "NN":
-        string = line_list[1].lower()
-
-        # replaceRockDots
-        string = re.sub(r'[ß]', "ss", string)
-        string = re.sub(r'[ö]', "oe", string)
-        string = re.sub(r'[ü]', "ue", string)
-        string = re.sub(r'[ä]', "ae", string)
-
-        nomen.append(string.lower().strip())
-
-textacy.fileio.write_file_lines(nomen,"nomen2.txt")
-
-"""
-### extract from deWordNet.xml
+### extract from deWordNet.xml

 #https://github.com/hdaSprachtechnologie/odenet
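The now-active `for r in root:` section (the context of the next hunk) walks an XML tree and ends in write_file_lines(nomen,"nomen.txt"). A sketch of the noun extraction this commit starts, assuming the WordNet-LMF layout that odenet's deWordNet.xml uses (LexicalEntry elements with a Lemma child carrying writtenForm and partOfSpeech attributes; these names are assumptions, not taken from the diff):

import xml.etree.ElementTree as ET

# Sketch only: extract noun lemmas from deWordNet.xml.
# The LMF element and attribute names are assumptions based on odenet.
tree = ET.parse("deWordNet.xml")
root = tree.getroot()

nomen = []
for lexentry in root.iter("LexicalEntry"):
    lemma = lexentry.find("Lemma")
    if lemma is not None and lemma.get("partOfSpeech") == "n":
        nomen.append(lemma.get("writtenForm").lower().strip())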
@@ -112,7 +52,78 @@ for r in root:


 textacy.fileio.write_file_lines(nomen,"nomen.txt")

 """
+### extract from derewo
+
+#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
+
+raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
+
+for line in raw:
+    line_list=line.split()
+    if line_list[2] == "NN":
+        string = line_list[1].lower()
+
+        # replaceRockDots
+        string = re.sub(r'[ß]', "ss", string)
+        string = re.sub(r'[ö]', "oe", string)
+        string = re.sub(r'[ü]', "ue", string)
+        string = re.sub(r'[ä]', "ae", string)
+
+        nomen.append(string.lower().strip())
+
+textacy.fileio.write_file_lines(nomen,"nomen2.txt")
+"""
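The replaceRockDots substitutions preserved in the docstring can also be written as a single translation table; an equivalent one-pass sketch (illustration only, not part of the commit):

# Same effect as the four re.sub calls above, in one pass (Python 3).
ROCKDOTS = str.maketrans({"ß": "ss", "ö": "oe", "ü": "ue", "ä": "ae"})

def replace_rockdots(s):
    return s.translate(ROCKDOTS)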