Continued thesaurus based on deWordNet

This commit is contained in:
jannis.grundmann 2017-09-28 12:42:05 +02:00
parent 33cfbe2f99
commit 1a99d117ac
6 changed files with 752271 additions and 73 deletions

1273
abbkuerzungen.txt Normal file

File diff suppressed because it is too large

654747
lexicalentries.xml Normal file

File diff suppressed because it is too large

95963
synsets.xml Normal file

File diff suppressed because it is too large

221
testo.py

@@ -4,6 +4,14 @@ from datetime import datetime
print(datetime.now())
#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
path_csv_split = path2csv.split("/")
print(path_csv_split[len(path_csv_split)-1])
import time
import enchant
@@ -528,7 +536,8 @@ def processContentstream(textstream, token_filterlist=None, parser=DE_PARSER):
yield " ".join([tok.lower_ for tok in tokens])
#yield " ".join([tok.lower_ for tok in tokens])
yield " ".join(list(set([tok.lower_ for tok in tokens])))
@@ -675,10 +684,6 @@ pipe=[
#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
de_corpus = textacy.Corpus(DE_PARSER)
#en_corpus = textacy.Corpus(EN_PARSER)
@@ -701,47 +706,170 @@ end = time.time()
printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_corpus"
corpus_compression = 'gzip'
de_corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
de_corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
"""
# build dictionary of ticket categories
labelist = []
for texdoc in de_corpus.get(lambda texdoc : texdoc.metadata["categoryName"] not in labelist):
labelist.append(texdoc.metadata["categoryName"])
LABELDICT = {k: v for v, k in enumerate(labelist)}
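# LABELDICT maps each ticket category name to an integer index; len(LABELDICT) is also used below as the default number of topics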
printlog(str("LABELDICT: {0}".format(LABELDICT)))
def topicModeling(ngrams,min_df,max_df,topicModel = 'lda',n_topics = len(LABELDICT),named_entities=False,corpus=de_corpus):
printlog("############################################ Topic Modeling {0} #############################################".format(topicModel))
print("\n\n")
printlog(str("ngrams: {0}".format(ngrams)))
printlog(str("min_df: {0}".format(min_df)))
printlog(str("max_df: {0}".format(max_df)))
printlog(str("n_topics: {0}".format(n_topics)))
printlog(str("named_entities: {0}".format(named_entities)))
start = time.time()
top_topic_words = 10
top_document_labels_per_topic = 5
# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
weighting = ('tf' if topicModel == 'lda' else 'tfidf')
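# LDA is fit on raw term frequencies ('tf'); the LSA/NMF models use tf-idf weighted counts instead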
########################################
#printlog("vectorize corpus...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
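# doc_term_matrix: sparse document-term matrix of shape (n_docs, n_terms) using the weighting chosen above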
id2term = vectorizer.__getattribute__("id_to_term")
#printlog("terms_list: {0}".format(list(terms_list)))
#printlog("doc_term_matrix: {0}".format(doc_term_matrix))
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
# Initialize and train a topic model
#printlog("Initialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)
#Transform the corpus and interpret our model:
#printlog("Transform the corpus and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
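# doc_topic_matrix: (n_docs, n_topics) matrix of per-document topic weights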
print()
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
printlog('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
print()
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
printlog(topic_idx)
for j in top_docs:
printlog(corpus[j].metadata['categoryName'])
print()
#####################################################################################################################
print()
print()
end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start)/60,topicModel))
############################################ Topic Modeling #############################################
print("\n\n")
start = time.time()
ngrams = 1
min_df = 0.1
max_df = 0.6
#no_below = 20
#no_above = 0.5
topicModel = 'lda'
# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
weighting = ('tf' if topicModel == 'lda' else 'tfidf')
top_topic_words = 15
top_document_labels_per_topic = 7
n_topics = 20 #len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 # +1 because of a default topic
#n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 # +1 because of a default topic
topicModeling(ngrams = 1,
min_df = 1,
max_df = 1.0,
topicModel = 'lda',
n_topics = len(LABELDICT),
corpus=de_corpus)
topicModeling(ngrams = 1,
min_df = 0.1,
max_df = 0.6,
topicModel = 'lda',
n_topics = len(LABELDICT),
corpus=de_corpus)
topicModeling(ngrams = (1,2),
min_df = 1,
max_df = 1.0,
topicModel = 'lda',
n_topics = len(LABELDICT),
corpus=de_corpus)
topicModeling(ngrams = (1,2),
min_df = 0.1,
max_df = 0.6,
topicModel = 'lda',
n_topics = len(LABELDICT),
corpus=de_corpus)
topicModeling(ngrams = (1,2),
min_df = 0.2,
max_df = 0.8,
topicModel = 'lda',
n_topics = 20,
corpus=de_corpus)
########################################
printlog("vectorize corpus...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in de_corpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.__getattribute__("id_to_term")
@@ -753,44 +881,19 @@ id2term = vectorizer.__getattribute__("id_to_term")
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
# Initialize and train a topic model
printlog("Initialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)
#Transform the corpus and interpret our model:
printlog("Transform the corpus and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
printlog('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
print()
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
printlog(topic_idx)
for j in top_docs:
printlog(de_corpus[j].metadata['categoryName'])
print()
#####################################################################################################################
print()
print()
end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start)/60,topicModel))
"""
##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################
print("\n\n")
@@ -862,8 +965,6 @@ end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start)/60))
"""
@@ -873,3 +974,5 @@ printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end

140
testra.py

@@ -2,6 +2,7 @@
import re
import time
import spacy
import textacy
start = time.time()
@@ -14,27 +15,72 @@ import xml.etree.ElementTree as ET
print(datetime.now())
nomen=[]
#PARSER=spacy.load("de")
#todo: thesaurus....yay...
"""
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
#return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))
def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
if not isinstance(word, str):
return str(word)
word = word.lower()
# iterate over the thesaurus
for syn_block in thesaurus: # syn_block is a list of synonyms
for syn in syn_block:
syn = syn.lower()
if re.match(r'\A[\w-]+\Z', syn): # if syn is a single word
if word == syn:
return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
else: # otherwise syn is a phrase
if word in syn:
return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
return str(word) # as a fallback, return the original word
def getHauptform(syn_block, word, default_return_first_Syn=False):
for syn in syn_block:
syn = syn.lower()
if "hauptform" in syn and len(syn.split(" ")) <= 2:
# do not return it if it is in parentheses # todo: does that ever happen? strip the parentheses
for w in syn.split(" "):
if not re.match(r'\([^)]+\)', w):
return w
if default_return_first_Syn:
# if no Hauptform is contained, return the first synonym that is not a phrase and is not in parentheses
for w in syn_block:
if not re.match(r'\([^)]+\)', w):
return w
return word # as a fallback, return the original word
"""
### extract from deWordNet.xml
#https://github.com/hdaSprachtechnologie/odenet
# idea: build synsets
"""
path2xml="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
nomen=[]
### extract from deWordNet.xml
#https://github.com/hdaSprachtechnologie/odenet
for r in root:
for element in r:
if element.tag == "LexicalEntry":
if element.tag == "Synset":
attrib = element.attrib
for i,subentry in enumerate(element):
if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
string = (subentry.attrib["writtenForm"])
@@ -49,9 +95,75 @@ for r in root:
string_list=string.split()
if len(string_list) == 1:
nomen.append(string.lower().strip())
"""
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
lextree = ET.parse(lexicalentries, ET.XMLParser(encoding="utf-8"))
syntree = ET.parse(synsets, ET.XMLParser(encoding="utf-8"))
lexroot = lextree.getroot()
synroot = syntree.getroot()
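# join the two files: for every Synset id in synsets.xml, collect the normalized writtenForm
# of every LexicalEntry in lexicalentries.xml whose Sense references that synset id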
for r in synroot:
for element in r:
if element.tag == "Synset":
sysnet = []
attrib = element.attrib
id = attrib["id"]
for ro in lexroot:
for elem in ro:
if elem.tag == "LexicalEntry":
subs_dicts = [subentry.attrib for subentry in elem]
#<class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]
dic = {k:v for x in subs_dicts for k,v in x.items()} # to one dict
if "synset" in dic.keys():
if dic["synset"] == id:
if id == "de-1004-n":
x = 0
string = (dic["writtenForm"])
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# strip all periods
string = re.sub(r'[.]', "", string)
# strip everything in parentheses
stringlist = string.split()
strings=[]
for w in stringlist:
if not re.match(r'\(([^)]+)\)', w): # skip tokens wrapped in parentheses
strings.append(w)
string = " ".join(strings)
#re.sub(r'/\(([^)]+)\)/', " ", string)
sysnet.append(string.lower().strip())
print(id,sysnet)
textacy.fileio.write_file_lines(nomen,"nomen.txt")