Continued work on the thesaurus based on deWordNet
parent 33cfbe2f99
commit 1a99d117ac
File diff suppressed because it is too large
Binary file not shown.
File diff suppressed because it is too large
File diff suppressed because it is too large
testo.py (221 changed lines)
@@ -4,6 +4,14 @@ from datetime import datetime
print(datetime.now())

#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

# print just the filename of the chosen CSV
path_csv_split = path2csv.split("/")
print(path_csv_split[len(path_csv_split)-1])

import time

import enchant
@@ -528,7 +536,8 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
yield " ".join([tok.lower_ for tok in tokens])
#yield " ".join([tok.lower_ for tok in tokens])
# emit each distinct token only once (set() removes duplicates but also drops the token order)
yield " ".join(list(set([tok.lower_ for tok in tokens])))
@@ -675,10 +684,6 @@ pipe=[
#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

de_corpus = textacy.Corpus(DE_PARSER)
#en_corpus = textacy.Corpus(EN_PARSER)
@@ -701,47 +706,170 @@ end = time.time()
printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))


"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_corpus"
corpus_compression = 'gzip'
de_corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
de_corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
"""


# build dictionary of ticket categories
labelist = []

for texdoc in de_corpus.get(lambda texdoc : texdoc.metadata["categoryName"] not in labelist):
    labelist.append(texdoc.metadata["categoryName"])


LABELDICT = {k: v for v, k in enumerate(labelist)}

printlog(str("LABELDICT: {0}".format(LABELDICT)))


def topicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(LABELDICT), named_entities=False, corpus=de_corpus):

    printlog("############################################ Topic Modeling {0} #############################################".format(topicModel))
    print("\n\n")
    printlog(str("ngrams: {0}".format(ngrams)))
    printlog(str("min_df: {0}".format(min_df)))
    printlog(str("max_df: {0}".format(max_df)))
    printlog(str("n_topics: {0}".format(n_topics)))
    printlog(str("named_entities: {0}".format(named_entities)))

    start = time.time()

    top_topic_words = 10
    top_document_labels_per_topic = 5

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')

    ####################'####################

    #printlog("vectorize corpus...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.__getattribute__("id_to_term")

    #printlog("terms_list: {0}".format(list(terms_list)))
    #printlog("doc_term_matrix: {0}".format(doc_term_matrix))


    ##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################

    # Initialize and train a topic model
    #printlog("Initialize and train a topic model..")
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)

    # Transform the corpus and interpret our model:
    #printlog("Transform the corpus and interpret our model..")
    doc_topic_matrix = model.transform(doc_term_matrix)
    print()

    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
        printlog('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))

    print()
    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
        printlog(topic_idx)
        for j in top_docs:
            printlog(corpus[j].metadata['categoryName'])
        print()

    #####################################################################################################################
    print()
    print()


    end = time.time()
    printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start)/60, topicModel))

############################################ Topic Modeling #############################################
print("\n\n")
start = time.time()


ngrams = 1

min_df = 0.1
max_df = 0.6
#no_below = 20
#no_above = 0.5

topicModel = 'lda'
# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
weighting = ('tf' if topicModel == 'lda' else 'tfidf')

top_topic_words = 15
top_document_labels_per_topic = 7

n_topics = 20  #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1  # +1 because of a default topic
#n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1  # +1 because of a default topic


topicModeling(ngrams = 1,
              min_df = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)

topicModeling(ngrams = 1,
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 1,
              max_df = 1.0,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.1,
              max_df = 0.6,
              topicModel = 'lda',
              n_topics = len(LABELDICT),
              corpus=de_corpus)

topicModeling(ngrams = (1,2),
              min_df = 0.2,
              max_df = 0.8,
              topicModel = 'lda',
              n_topics = 20,
              corpus=de_corpus)
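# A minimal note on the pruning parameters passed above (an assumption, based on the
# sklearn-style convention that textacy's Vectorizer appears to follow: an integer
# min_df/max_df is an absolute document count, a float is a fraction of documents).
# With 1000 documents, min_df=0.1 / max_df=0.6 would keep only terms occurring in at
# least 100 and at most 600 documents, while min_df=1 / max_df=1.0 keeps every term.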


####################'####################


printlog("vectorize corpus...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in de_corpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.__getattribute__("id_to_term")

@@ -753,44 +881,19 @@ id2term = vectorizer.__getattribute__("id_to_term")


##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################

# Initialize and train a topic model
printlog("Initialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)

# Transform the corpus and interpret our model:
printlog("Transform the corpus and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()

for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
    printlog('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))

print()
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
    printlog(topic_idx)
    for j in top_docs:
        printlog(de_corpus[j].metadata['categoryName'])
    print()

#####################################################################################################################
print()
print()


end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start)/60, topicModel))



"""


##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################

print("\n\n")
@@ -862,8 +965,6 @@ end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start)/60))


"""

@@ -873,3 +974,5 @@ printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end
testra.py (140 changed lines)
@@ -2,6 +2,7 @@
import re
import time

import spacy
import textacy

start = time.time()
@@ -14,27 +15,72 @@ import xml.etree.ElementTree as ET
print(datetime.now())



nomen=[]
#PARSER=spacy.load("de")

#todo: thesaurus....yay...

"""
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
    #return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
    return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))

def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus:  # syn_block is a list of synonyms

        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
                if word == syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
            else:  # if it is a phrase
                if word in syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
    return str(word)  # as a last resort, return the original word

def getHauptform(syn_block, word, default_return_first_Syn=False):
    for syn in syn_block:
        syn = syn.lower()

        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return it if it is in parentheses  #todo does that ever happen? strip parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no Hauptform is included, return the first synonym that is not a phrase and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w
    return word  # as a last resort, return the original word
"""

### extract from deWordNet.xml

#https://github.com/hdaSprachtechnologie/odenet
#idea: build synsets


"""
path2xml="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"

tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()

regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'

nomen=[]



### extract from deWordNet.xml

#https://github.com/hdaSprachtechnologie/odenet

for r in root:
    for element in r:

        if element.tag == "LexicalEntry":
        if element.tag == "Synset":
            attrib = element.attrib
            for i,subentry in enumerate(element):
                if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
                    string = (subentry.attrib["writtenForm"])
@@ -49,9 +95,75 @@ for r in root:
                    string_list=string.split()
                    if len(string_list) == 1:
                        nomen.append(string.lower().strip())
"""


lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"


lextree = ET.parse(lexicalentries, ET.XMLParser(encoding="utf-8"))
syntree = ET.parse(synsets, ET.XMLParser(encoding="utf-8"))

lexroot = lextree.getroot()
synroot = syntree.getroot()


for r in synroot:
    for element in r:

        if element.tag == "Synset":
            sysnet = []
            attrib = element.attrib
            id = attrib["id"]


            for ro in lexroot:
                for elem in ro:
                    if elem.tag == "LexicalEntry":
                        subs_dicts = [subentry.attrib for subentry in elem]
                        #<class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]

                        dic = {k:v for x in subs_dicts for k,v in x.items()}  # merge into one dict
                        if "synset" in dic.keys():
                            if dic["synset"] == id:

                                if id == "de-1004-n":
                                    x = 0  # anchor for setting a breakpoint while debugging

                                string = (dic["writtenForm"])

                                # replaceRockDots
                                string = re.sub(r'[ß]', "ss", string)
                                string = re.sub(r'[ö]', "oe", string)
                                string = re.sub(r'[ü]', "ue", string)
                                string = re.sub(r'[ä]', "ae", string)


                                # remove all dots
                                string = re.sub(r'[.]', "", string)


                                # drop every token that is fully parenthesized
                                # (the earlier pattern r'/\(([^)]+)\)/' never matched because of the
                                # JavaScript-style slashes, which was the open todo here)
                                stringlist = string.split()
                                strings=[]
                                for w in stringlist:
                                    if not bool(re.match(r'\([^)]+\)', w)):
                                        strings.append(w)
                                string = " ".join(strings)

                                #re.sub(r'/\(([^)]+)\)/', " ", string)

                                sysnet.append(string.lower().strip())


            print(id,sysnet)



textacy.fileio.write_file_lines(nomen,"nomen.txt")
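# A minimal sketch of the next step hinted at by the commit message: turning the collected
# synsets into a simple thesaurus lookup. Everything below is an assumption for illustration
# (hypothetical helper name build_thesaurus; it assumes each synset is a list of lowercased
# strings like the sysnet lists printed above).
def build_thesaurus(syn_blocks):
    # map every word of a synset onto the first entry of its block,
    # mirroring the idea of the disabled getFirstSynonym/getHauptform helpers
    lookup = {}
    for block in syn_blocks:
        if not block:
            continue
        main_form = block[0]
        for word in block:
            lookup.setdefault(word, main_form)
    return lookup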