thesaurus creation: improved runtime
parent 93e239756c
commit 4fe12679fb
@@ -33,7 +33,6 @@ path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-E
 content_collumn_name = "Description"

 metaliste = [
     "TicketNumber",
     "Subject",
     "CreatedDate",
@@ -46,15 +45,18 @@ metaliste = [
     "Solution"
 ]

 corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
-corpus_name = "de_raw_corpus"
+corpus_name = "de_raw_ticketCorpus"

 logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"

-# todo configuration file ?
+# todo configuration file
 """
 config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

@@ -98,7 +100,7 @@ def printRandomDoc(textacyCorpus):


-def csv_to_textStream(path2csv: str, content_collumn_name: str):
+def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
     """
     :param path2csv: string
     :param content_collumn_name: string
@@ -117,7 +119,7 @@ def csv_to_textStream(path2csv: str, content_collumn_name: str):
         yield lst[content_collumn]


-def csv_to_DictStream(path2csv: str, metalist: [str]):
+def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
     """
     :param path2csv: string
     :param metalist: list of strings
@@ -155,7 +157,8 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):
     """

     # save parser
-    parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
+    parserpath = corpus_path + str(parser.lang) + '_parser'
+    parser.save_to_directory(parserpath)

     # save content
     contentpath = corpus_path + corpus_name + "_content.bin"
@@ -171,34 +174,6 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):


-def cleanTextstream(textstream):
-    """
-    :param textstream: string-gen
-    :param parser: spacy-parser
-    :yield: string-gen
-    """
-
-    for txt in textstream:
-        yield textacy.preprocess.normalize_whitespace(txt)
-
-
-def cleanDictstream(dictstream):
-    """
-    :param dictstream: dict-gen
-    :param parser: spacy-parser
-    :yield: dict-gen
-    """
-
-    for dic in dictstream:
-
-        result = {}
-
-        for key, value in dic.items():
-            result[key] = textacy.preprocess.normalize_whitespace(value)
-        yield result
-
-
 def main():

     printlog("Corporization: {0}".format(datetime.now()))
@@ -222,8 +197,8 @@ def main():
     printlog("Add texts to textacy-corpus")

     de_corpus.add_texts(
-        cleanTextstream(csv_to_textStream(path2de_csv, content_collumn_name)),
-        cleanDictstream(csv_to_DictStream(path2de_csv, metaliste))
+        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
+        ticket_csv_to_DictStream(path2de_csv, metaliste)
     )

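For orientation: the renamed ticketcsv_to_textStream generator streams one CSV column row by row. A minimal standalone sketch of that pattern (delimiter, header layout and file name are assumptions, not taken from this commit):

import csv

def ticketcsv_to_textstream_demo(path2csv, content_column_name):
    # stream the chosen column of a ticket CSV, one row at a time
    with open(path2csv, newline='') as f:
        reader = csv.reader(f, delimiter=';')
        header = next(reader)
        col = header.index(content_column_name)
        for row in reader:
            yield row[col]

# hypothetical usage:
# for text in ticketcsv_to_textstream_demo("tickets.csv", "Description"):
#     print(text)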
init.py (119 changed lines)
@@ -26,6 +26,7 @@ with open(config_ini) as f:
     config.read_file(f)
 """

+# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"

 # config logging
@@ -80,7 +81,7 @@ def create_lemma_dict(lemmalist):

     return lemma_dict

+"""
 def build_thesaurus(path2lexicalentries, path2synsets):
     lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
     syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
@@ -134,6 +135,94 @@ def build_thesaurus(path2lexicalentries, path2synsets):
     return thesaurus

 #todo thesaurus in dictionary
+"""
+
+def build_thesaurus(path2lexicalentries):  # , path2synsets):
+    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
+    # syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
+
+    lexroot = lextree.getroot()
+    # synroot = syntree.getroot()
+
+    word2synsets = {}
+    template = {"w1": ["s1", "s2"]}
+
+    for ro in lexroot:
+        for elem in ro:
+            if elem.tag == "LexicalEntry":
+                lex_dictlist = [subentry.attrib for subentry in elem]
+
+                synlist = []
+                string = "WORD"
+
+                for lex_dict in lex_dictlist:
+                    if "synset" in lex_dict.keys():
+                        synset = lex_dict["synset"]
+                        synlist.append(synset)
+
+                    if 'writtenForm' in lex_dict.keys():
+                        string = (lex_dict["writtenForm"])
+
+                        # replaceRockDots
+                        string = re.sub(r'[ß]', "ss", string)
+                        string = re.sub(r'[ö]', "oe", string)
+                        string = re.sub(r'[ü]', "ue", string)
+                        string = re.sub(r'[ä]', "ae", string)
+
+                        # strip all periods
+                        string = re.sub(r'[.]', "", string)
+
+                        # strip everything in parentheses
+                        string = re.sub(r"\((.*)\)", " ", string)
+
+                        # normalize longer runs of whitespace
+                        string = textacy.preprocess.normalize_whitespace(string)
+
+                        string = string.lower().strip()
+
+                word2synsets[string] = synlist
+
+    synset2Words = {}
+    template = {"s1": ["w1", "w2"]}
+
+    for word, synset in word2synsets.items():
+        for syn in synset:
+            if syn not in synset2Words.keys():
+                synset2Words[syn] = [word]
+            else:
+                synset2Words[syn].append(word)
+
+    # sort by the number of words in the strings
+    for synset in word2synsets.values():
+        synset.sort(key=lambda x: len(x.split()))
+
+    thesaurus = {}
+    thesaurus_template = {"w1": "mainsyn"}
+
+    for word, synset in word2synsets.items():
+        try:
+            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
+        except:
+            pass
+    return thesaurus
+
+    """
+    for r in synroot:
+        for element in r:
+
+            if element.tag == "Synset":
+                synset = []
+                attrib = element.attrib
+                id = attrib["id"]
+
+                if id not in synset2Words.keys():
+                    synset2Words[id] = "WORD"
+    """


 def create_stopwordlist():
@@ -151,7 +240,7 @@ def create_stopwordlist():

     de_stop_words3 = list(map(replaceRockDots(),list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))

-    de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("stopwords-de.txt"))))
+    de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt"))))

     de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))

@@ -172,34 +261,29 @@ def words(text): return re.findall(r'\w+', text.lower())

 ##################################################################################################

-# goal: dictionaries for thesaurus, correct-word list and lemmas as loadable .json
+# goal: dictionaries for thesaurus, correct-word list and lemmas as loadable files
 # plus a clean stop-word list and noun list


 # THESAURUS
 lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
-synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
-lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
-synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
+#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
+path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"


 # SPELLCHECKING
 path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
-path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl"
-path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list.pkl"
-path2thesauruslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list.pkl"
-path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl"
-path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl"
-path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl"
+path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
+
+path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
+path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
+path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
+path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"

@@ -235,11 +319,10 @@ def main():

     printlog("Build and save Thesaurus")
-    THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries, path2synsets=synsets)
-    print(THESAURUS[0:10])
+    THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries)

-    save_obj(THESAURUS, path2thesauruslist)
+    save_obj(THESAURUS, path2thesaurusdict)

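The net effect of the new build_thesaurus is a plain dict mapping each word to a "main synonym". A minimal sketch of that inversion step on made-up data (the synset ids and words here are invented, not taken from the lexicalentries.xml input):

# word -> synset ids, as collected from the LexicalEntry elements
word2synsets = {
    "rechner": ["s1"],
    "computer": ["s1", "s2"],
    "pc": ["s1"],
}

# invert to synset id -> words
synset2words = {}
for word, synsets in word2synsets.items():
    for syn in synsets:
        synset2words.setdefault(syn, []).append(word)

# assumption carried over from the commit: the first word listed under a
# word's first synset is treated as its main synonym
thesaurus = {}
for word, synsets in word2synsets.items():
    if synsets and synsets[0] in synset2words:
        thesaurus[word] = synset2words[synsets[0]][0]

print(thesaurus)  # {'rechner': 'rechner', 'computer': 'rechner', 'pc': 'rechner'}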
preprocessing.py (140 changed lines)
@@ -38,6 +38,45 @@ csv.field_size_limit(sys.maxsize)


+import pickle
+
+def save_obj(obj, path):
+    with open(path + '.pkl', 'wb') as f:
+        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
+
+def load_obj(path):
+    with open(path + '.pkl', 'rb') as f:
+        return pickle.load(f)
+
+
+def load_corpus(corpus_path, corpus_name, lang="de"):
+
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    metapath = corpus_path + corpus_name + "_meta.json"
+
+    # load parser
+    parserpath = corpus_path + str(lang) + '_parser'
+    parser = spacy.load(parserpath)
+
+    corpus = textacy.Corpus(parser)
+
+    metadata_stream = textacy.fileio.read_json_lines(metapath)
+    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+        corpus.add_doc(
+            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+    return corpus
+
+
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_name = "de_raw_ticketCorpus"
+
+print(load_corpus(corpus_path, corpus_name))


 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"

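These pickle helpers mirror the save_obj/load_obj pair used in init.py, so whatever init.py writes can be read back here. A short round-trip sketch (the /tmp path and sample dict are hypothetical):

import pickle

def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)

# hypothetical round trip: what init.py writes, preprocessing.py reads back
thesaurus = {"pc": "rechner"}
save_obj(thesaurus, "/tmp/thesaurus_demo")   # writes /tmp/thesaurus_demo.pkl
assert load_obj("/tmp/thesaurus_demo") == thesaurus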
@@ -63,36 +102,31 @@ logging.basicConfig(filename=logfile, level=logging.INFO)


 # THESAURUS
+path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
+THESAURUS = load_obj(path2thesaurusdict)
+
-# thesauruspath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv"
-# thesauruspath = config.get("filepath","thesauruspath")
-# THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
-lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
-synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"

 # SPELLCHECKING
-path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
+path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"

+path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
+path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
+path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
+path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"

+# SPELLCHECKING
+parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')

 DE_PARSER = spacy.load("de")
 EN_PARSER = spacy.load("en")

-"""
-de_stop_words= set(
-    list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
-    list(textacy.fileio.read_file_lines("stopwords-de.txt"))
-)
-
-LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))
-
-VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
-"""

 de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
     "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german')))

@@ -126,15 +160,7 @@ specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORE
 hardSFinder = re.compile(r'[ß]', re.IGNORECASE)


-import pickle
-
-def save_obj(obj, path):
-    with open(path + '.pkl', 'wb') as f:
-        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
-
-def load_obj(path):
-    with open(path + '.pkl', 'rb') as f:
-        return pickle.load(f)
 def printlog(string, level="INFO"):
     """log and prints"""
     print(string)
@@ -238,21 +264,6 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):
         yield metadata


-def save_corpus(corpus, corpus_path, corpus_name, parser=DE_PARSER):
-
-    # save stringstore
-    stringstore_path = corpus_path + corpus_name + '_strings.json'
-    with open(stringstore_path, "w") as file:
-        parser.vocab.strings.dump(file)
-
-    # save content
-    contentpath = corpus_path + corpus_name + "_content.bin"
-    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
-
-    # save meta
-    metapath = corpus_path + corpus_name + "_meta.json"
-    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)


 ############# filter tokens
@@ -751,6 +762,51 @@ def filterTokens(tokens, funclist):

     return tokens

+def cleanString(string):
+    # replaceRockDots
+    string = re.sub(r'[ß]', "ss", string)
+    string = re.sub(r'[ö]', "oe", string)
+    string = re.sub(r'[ü]', "ue", string)
+    string = re.sub(r'[ä]', "ae", string)
+
+    # normalize longer runs of whitespace
+    string = textacy.preprocess.normalize_whitespace(string)
+
+    return(string)
+
+def normalizeTextStream(textstream, clean=False):
+    """
+    :param textstream: string-gen
+    :param parser: spacy-parser
+    :yield: string-gen
+    """
+
+    for txt in textstream:
+        if clean:
+            yield cleanString(txt)
+        else:
+            yield textacy.preprocess.normalize_whitespace(txt)
+
+def nomalizeDictstream(dictstream, clean=False):
+    """
+    :param dictstream: dict-gen
+    :param parser: spacy-parser
+    :yield: dict-gen
+    """
+
+    for dic in dictstream:
+
+        result = {}
+
+        for key, value in dic.items():
+            if clean:
+                result[key] = cleanString(value)
+            else:
+                result[key] = textacy.preprocess.normalize_whitespace(value)
+        yield result


 custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
                 "hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb", "helfen",

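cleanString combines the umlaut transliteration ("replaceRockDots") with whitespace normalization. A self-contained sketch of the same idea using only re (the real helper relies on textacy.preprocess.normalize_whitespace; the sample string is made up):

import re

def clean_string_demo(s):
    # ss/oe/ue/ae transliteration, then collapse runs of whitespace
    for pattern, repl in ((r'ß', 'ss'), (r'ö', 'oe'), (r'ü', 'ue'), (r'ä', 'ae')):
        s = re.sub(pattern, repl, s)
    return re.sub(r'\s+', ' ', s).strip()

print(clean_string_demo("Größe  des   Tickets\nändern"))  # -> Groesse des Tickets aendern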
testra.py (126 changed lines)
@@ -5,6 +5,7 @@ import json

 import spacy
 import textacy
+from functools import reduce

 start = time.time()

@@ -52,6 +53,8 @@ corpus.add_texts(

 print(corpus)
 """


 import pickle

 def save_obj(obj, path):
@@ -63,31 +66,122 @@ def load_obj(path ):
     return pickle.load(f)


+# THESAURUS
+lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
+lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
+synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"

-lemmalist = list(map(textacy.preprocess.normalize_whitespace,
-                     list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))))
-
-lemma_dict = {}
-
-for line in lemmalist:
-
-    lem_word_pair = line.split()
-
-    lemma = lem_word_pair[0].strip().lower()
-
-    word = lem_word_pair[1].strip().lower()
-
-    lemma_dict[word] = lemma
-
-print(lemma_dict["abbekomme"])
-
-save_obj(lemma_dict, "test_dictionies")
-
-loaded = load_obj("test_dictionies")
-
-print(loaded["abbekomme"])
+def build_thesaurus(path2lexicalentries):  # , path2synsets):
+    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
+    # syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
+
+    lexroot = lextree.getroot()
+    # synroot = syntree.getroot()
+
+    word2synsets = {}
+    template = {"w1": ["s1", "s2"]}
+
+    for ro in lexroot:
+        for elem in ro:
+            if elem.tag == "LexicalEntry":
+                lex_dictlist = [subentry.attrib for subentry in elem]
+
+                synlist = []
+                string = "WORD"
+
+                for lex_dict in lex_dictlist:
+                    if "synset" in lex_dict.keys():
+                        synset = lex_dict["synset"]
+                        synlist.append(synset)
+
+                    if 'writtenForm' in lex_dict.keys():
+                        string = (lex_dict["writtenForm"])
+
+                        # replaceRockDots
+                        string = re.sub(r'[ß]', "ss", string)
+                        string = re.sub(r'[ö]', "oe", string)
+                        string = re.sub(r'[ü]', "ue", string)
+                        string = re.sub(r'[ä]', "ae", string)
+
+                        # strip all periods
+                        string = re.sub(r'[.]', "", string)
+
+                        # strip everything in parentheses
+                        string = re.sub(r"\((.*)\)", " ", string)
+
+                        # normalize longer runs of whitespace
+                        string = textacy.preprocess.normalize_whitespace(string)
+
+                        string = string.lower().strip()
+
+                word2synsets[string] = synlist
+
+    synset2Words = {}
+    template = {"s1": ["w1", "w2"]}
+
+    for word, synset in word2synsets.items():
+        for syn in synset:
+            if syn not in synset2Words.keys():
+                synset2Words[syn] = [word]
+            else:
+                synset2Words[syn].append(word)
+
+    # sort by the number of words in the strings
+    for synset in word2synsets.values():
+        synset.sort(key=lambda x: len(x.split()))
+
+    thesaurus = {}
+    thesaurus_template = {"w1": "mainsyn"}
+
+    for word, synset in word2synsets.items():
+        try:
+            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
+        except:
+            pass
+    return thesaurus
+
+    """
+    for r in synroot:
+        for element in r:
+
+            if element.tag == "Synset":
+                synset = []
+                attrib = element.attrib
+                id = attrib["id"]
+
+                if id not in synset2Words.keys():
+                    synset2Words[id] = "WORD"
+    """
+
+
+def load_corpus(corpus_path, corpus_name, lang="de"):
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    metapath = corpus_path + corpus_name + "_meta.json"
+
+    # load parser
+    parserpath = corpus_path + str(lang) + '_parser'
+    parser = spacy.load(parserpath)
+
+    corpus = textacy.Corpus(parser)
+
+    metadata_stream = textacy.fileio.read_json_lines(metapath)
+    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+        corpus.add_doc(
+            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+    return corpus
+
+# todo: load corpus from file; idea: load stringstore and vocab
+
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_name = "de_raw_ticketCorpus"
+
+print(load_corpus(corpus_path, corpus_name))

 """
 from postal.parser import parse_address
@@ -101,12 +195,6 @@ address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder-
 print(parse_address(address))
 """



 """
 corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"