Improved thesaurus creation runtime

This commit is contained in:
jannis.grundmann 2017-10-12 15:57:56 +02:00
parent 93e239756c
commit 4fe12679fb
4 changed files with 317 additions and 115 deletions


@ -33,7 +33,6 @@ path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-E
content_collumn_name = "Description"
metaliste = [
"TicketNumber",
"Subject",
"CreatedDate",
@ -46,15 +45,18 @@ metaliste = [
"Solution"
]
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_corpus"
corpus_name = "de_raw_ticketCorpus"
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
# todo configuration file ?
# todo configuration file
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
@ -98,7 +100,7 @@ def printRandomDoc(textacyCorpus):
def csv_to_textStream(path2csv: str, content_collumn_name: str):
def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
@ -117,7 +119,7 @@ def csv_to_textStream(path2csv: str, content_collumn_name: str):
        yield lst[content_collumn]

def csv_to_DictStream(path2csv: str, metalist: [str]):
def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
@ -155,7 +157,8 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):
"""
# save parser
parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
parserpath = corpus_path + str(parser.lang) + '_parser'
parser.save_to_directory(parserpath)
# save content
contentpath = corpus_path + corpus_name + "_content.bin"
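For orientation (not part of the commit): save_corpus writes three artifacts that the new load_corpus added later in this diff reads back. A minimal sketch of the on-disk layout, with the paths taken from the surrounding code and "de" assumed for parser.lang:

# Hypothetical overview of the files written by save_corpus / read by load_corpus
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

parserpath  = corpus_path + "de" + '_parser'                # spaCy parser directory (parser.save_to_directory)
contentpath = corpus_path + corpus_name + "_content.bin"    # serialized spaCy docs (write_spacy_docs)
metapath    = corpus_path + corpus_name + "_meta.json"      # one JSON line of metadata per doc (write_json_lines)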
@ -171,34 +174,6 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):
def cleanTextstream(textstream):
    """
    :param textstream: string-gen
    :param parser: spacy-parser
    :yield: string-gen
    """

    for txt in textstream:
        yield textacy.preprocess.normalize_whitespace(txt)

def cleanDictstream(dictstream):
    """
    :param dictstream: dict-gen
    :param parser: spacy-parser
    :yield: dict-gen
    """

    for dic in dictstream:
        result = {}

        for key, value in dic.items():
            result[key] = textacy.preprocess.normalize_whitespace(value)
        yield result
def main():
    printlog("Corporization: {0}".format(datetime.now()))

@ -222,8 +197,8 @@ def main():
    printlog("Add texts to textacy-corpus")
    de_corpus.add_texts(
        cleanTextstream(csv_to_textStream(path2de_csv, content_collumn_name)),
        cleanDictstream(csv_to_DictStream(path2de_csv, metaliste))
        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2de_csv, metaliste)
    )
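As context, not part of the commit: add_texts pairs the two generators positionally, so the text stream and the metadata stream must walk the CSV in the same row order. A minimal sketch with made-up ticket rows standing in for the CSV-backed generators:

# Hypothetical stand-ins for ticketcsv_to_textStream / ticket_csv_to_DictStream
def texts():
    yield "Drucker druckt nicht mehr"   # Description of row 1
    yield "Passwort vergessen"          # Description of row 2

def metadata():
    yield {"TicketNumber": "INC0001", "Subject": "Drucker"}
    yield {"TicketNumber": "INC0002", "Subject": "Passwort"}

# document i receives metadata dict i, exactly as de_corpus.add_texts(...) pairs them
for text, meta in zip(texts(), metadata()):
    print(meta["TicketNumber"], "->", text)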

init.py (119 changed lines)

@ -26,6 +26,7 @@ with open(config_ini) as f:
config.read_file(f)
"""
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"
# config logging
@ -80,7 +81,7 @@ def create_lemma_dict(lemmalist):
return lemma_dict
"""
def build_thesaurus(path2lexicalentries, path2synsets):
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
@ -134,6 +135,94 @@ def build_thesaurus(path2lexicalentries, path2synsets):
return thesaurus
#todo thesaurus in dictionary
"""
def build_thesaurus(path2lexicalentries):#, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    #synroot = syntree.getroot()

    word2synsets = {}
    template = {"w1": ["s1", "s2"]}

    for ro in lexroot:
        for elem in ro:
            if elem.tag == "LexicalEntry":
                lex_dictlist = [subentry.attrib for subentry in elem]

                synlist = []
                string = "WORD"

                for lex_dict in lex_dictlist:
                    if "synset" in lex_dict.keys():
                        synset = lex_dict["synset"]
                        synlist.append(synset)

                    if 'writtenForm' in lex_dict.keys():
                        string = (lex_dict["writtenForm"])

                        # replaceRockDots
                        string = re.sub(r'[ß]', "ss", string)
                        string = re.sub(r'[ö]', "oe", string)
                        string = re.sub(r'[ü]', "ue", string)
                        string = re.sub(r'[ä]', "ae", string)

                        # remove all periods
                        string = re.sub(r'[.]', "", string)

                        # remove anything in parentheses
                        string = re.sub(r"\((.*)\)", " ", string)

                        # normalize longer runs of whitespace
                        string = textacy.preprocess.normalize_whitespace(string)

                        string = string.lower().strip()

                word2synsets[string] = synlist

    synset2Words = {}
    template = {"s1": ["w1","w2"]}

    for word,synset in word2synsets.items():
        for syn in synset:
            if syn not in synset2Words.keys():
                synset2Words[syn] = [word]
            else:
                synset2Words[syn].append(word)

    # sort by the number of words in the strings
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))

    thesaurus = {}
    thesaurus_template = {"w1" : "mainsyn"}

    for word,synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
        except:
            pass  # skip words whose first synset has no entry

    return thesaurus

"""
for r in synroot:
    for element in r:
        if element.tag == "Synset":
            synset = []

            attrib = element.attrib
            id = attrib["id"]

            if id not in synset2Words.keys():
                synset2Words[id] = "WORD"
"""
def create_stopwordlist():
@ -151,7 +240,7 @@ def create_stopwordlist():
de_stop_words3 = list(map(replaceRockDots(),list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))
de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("stopwords-de.txt"))))
de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt"))))
de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))
@ -172,34 +261,29 @@ def words(text): return re.findall(r'\w+', text.lower())
##################################################################################################
# goal: dictionaries for thesaurus, correct-word list and lemmas as loadable .json
# goal: dictionaries for thesaurus, correct-word list and lemmas as loadable files
# plus a clean stop-word list and a noun list
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl"
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list.pkl"
path2thesauruslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list.pkl"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl"
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
@ -235,11 +319,10 @@ def main():
printlog("Build and save Thesaurus")
THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries, path2synsets=synsets)
print(THESAURUS[0:10])
THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries)
save_obj(THESAURUS, path2thesauruslist)
save_obj(THESAURUS, path2thesaurusdict)


@ -38,6 +38,45 @@ csv.field_size_limit(sys.maxsize)
import pickle
def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path ):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)

def load_corpus(corpus_path, corpus_name, lang="de"):
    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    # load parser
    parserpath = corpus_path + str(lang) + '_parser'
    parser = spacy.load(parserpath)

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

print(load_corpus(corpus_path,corpus_name))
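A brief usage note, not part of the diff: save_obj/load_obj round-trip arbitrary Python objects through pickle, which is how the thesaurus dict built in init.py is reloaded as THESAURUS a few lines further down. A minimal, self-contained sketch (the file path is made up):

import pickle

def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)

# hypothetical round trip with a small dict in place of the real thesaurus
save_obj({"rechner": "rechner", "computer": "rechner"}, "/tmp/thesaurus_demo")
print(load_obj("/tmp/thesaurus_demo")["computer"])  # -> "rechner"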
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
@ -63,36 +102,31 @@ logging.basicConfig(filename=logfile, level=logging.INFO)
# THESAURUS
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
THESAURUS = load_obj(path2thesaurusdict)
# thesauruspath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv"
# thesauruspath = config.get("filepath","thesauruspath")
# THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
# SPELLCHECKING
parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")
"""
de_stop_words= set(
list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
list(textacy.fileio.read_file_lines("stopwords-de.txt"))
)
LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))
VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
"""
de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german')))
@ -126,15 +160,7 @@ specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORE
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)
import pickle
def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path ):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)

def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
@ -238,21 +264,6 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):
yield metadata
def save_corpus(corpus, corpus_path, corpus_name, parser=DE_PARSER):
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
############# filter tokens
@ -751,6 +762,51 @@ def filterTokens(tokens, funclist):
return tokens
def cleanString(string):
    # replaceRockDots
    string = re.sub(r'[ß]', "ss", string)
    string = re.sub(r'[ö]', "oe", string)
    string = re.sub(r'[ü]', "ue", string)
    string = re.sub(r'[ä]', "ae", string)

    # normalize longer runs of whitespace
    string = textacy.preprocess.normalize_whitespace(string)

    return(string)
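A side note rather than part of the commit: since cleanString runs over every text and metadata value, the four re.sub calls could be collapsed into one translation table. A minimal sketch of that alternative (lowercase umlauts/ß only, matching the regexes above):

# hypothetical faster variant of the "rock dots" replacement, not from the commit
ROCKDOTS = str.maketrans({"ß": "ss", "ö": "oe", "ü": "ue", "ä": "ae"})

def replace_rockdots_fast(s):
    return s.translate(ROCKDOTS)

print(replace_rockdots_fast("größe prüfung"))  # -> "groesse pruefung"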
def normalizeTextStream(textstream,clean=False):
    """
    :param textstream: string-gen
    :param parser: spacy-parser
    :yield: string-gen
    """

    for txt in textstream:
        if clean:
            yield cleanString(txt)
        else:
            yield textacy.preprocess.normalize_whitespace(txt)

def nomalizeDictstream(dictstream, clean=False):
    """
    :param dictstream: dict-gen
    :param parser: spacy-parser
    :yield: dict-gen
    """

    for dic in dictstream:
        result = {}

        for key, value in dic.items():
            if clean:
                result[key] = cleanString(value)
            else:
                result[key] = textacy.preprocess.normalize_whitespace(value)
        yield result
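For context, an assumption about the intended call site rather than code from the diff: these generators wrap the raw CSV streams before they reach the corpus, so whitespace is collapsed and umlauts/ß are flattened on the way in. A small sketch using the functions defined directly above, with made-up ticket rows:

# hypothetical stand-ins for the CSV-backed generators
def raw_texts():
    yield "Die Größe   stimmt nicht."

def raw_dicts():
    yield {"Subject": "Drucker   defekt", "TicketNumber": "INC0001"}

texts = normalizeTextStream(raw_texts(), clean=True)
metas = nomalizeDictstream(raw_dicts(), clean=True)

for t, m in zip(texts, metas):
    print(m["Subject"], "|", t)  # -> "Drucker defekt | Die Groesse stimmt nicht."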
custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
"hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb", "helfen",

testra.py (126 changed lines)

@ -5,6 +5,7 @@ import json
import spacy
import textacy
from functools import reduce
start = time.time()
@ -52,6 +53,8 @@ corpus.add_texts(
print(corpus)
"""
import pickle
def save_obj(obj, path):
@ -63,31 +66,122 @@ def load_obj(path ):
return pickle.load(f)
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
lemmalist = list(map(textacy.preprocess.normalize_whitespace,
                     list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))))
lemma_dict = {}

for line in lemmalist:
    lem_word_pair = line.split()
    lemma = lem_word_pair[0].strip().lower()
    word = lem_word_pair[1].strip().lower()
    lemma_dict[word] = lemma

print(lemma_dict["abbekomme"])

save_obj(lemma_dict, "test_dictionies")

loaded = load_obj("test_dictionies")

print(loaded["abbekomme"])

def build_thesaurus(path2lexicalentries):#, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    #synroot = syntree.getroot()

    word2synsets = {}
    template = {"w1": ["s1", "s2"]}

    for ro in lexroot:
        for elem in ro:
            if elem.tag == "LexicalEntry":
                lex_dictlist = [subentry.attrib for subentry in elem]

                synlist = []
                string = "WORD"

                for lex_dict in lex_dictlist:
                    if "synset" in lex_dict.keys():
                        synset = lex_dict["synset"]
                        synlist.append(synset)

                    if 'writtenForm' in lex_dict.keys():
                        string = (lex_dict["writtenForm"])

                        # replaceRockDots
                        string = re.sub(r'[ß]', "ss", string)
                        string = re.sub(r'[ö]', "oe", string)
                        string = re.sub(r'[ü]', "ue", string)
                        string = re.sub(r'[ä]', "ae", string)

                        # remove all periods
                        string = re.sub(r'[.]', "", string)

                        # remove anything in parentheses
                        string = re.sub(r"\((.*)\)", " ", string)

                        # normalize longer runs of whitespace
                        string = textacy.preprocess.normalize_whitespace(string)

                        string = string.lower().strip()

                word2synsets[string] = synlist

    synset2Words = {}
    template = {"s1": ["w1","w2"]}

    for word,synset in word2synsets.items():
        for syn in synset:
            if syn not in synset2Words.keys():
                synset2Words[syn] = [word]
            else:
                synset2Words[syn].append(word)

    # sort by the number of words in the strings
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))

    thesaurus = {}
    thesaurus_template = {"w1" : "mainsyn"}

    for word,synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
        except:
            pass  # skip words whose first synset has no entry

    return thesaurus
"""
for r in synroot:
for element in r:
if element.tag == "Synset":
synset = []
attrib = element.attrib
id = attrib["id"]
if id not in synset2Words.keys():
synset2Words[id] = "WORD"
"""
def load_corpus(corpus_path, corpus_name, lang="de"):
    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    # load parser
    parserpath = corpus_path + str(lang) + '_parser'
    parser = spacy.load(parserpath)

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

# todo: load corpus from file; idea: load the stringstore and vocab
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

print(load_corpus(corpus_path, corpus_name))
"""
from postal.parser import parse_address
@ -101,12 +195,6 @@ address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder-
print(parse_address(address))
"""
"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"