Thesaurus creation: runtime improved
parent 93e239756c · commit 4fe12679fb
@@ -33,7 +33,6 @@ path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-E
content_collumn_name = "Description"

metaliste = [
    "TicketNumber",
    "Subject",
    "CreatedDate",
@@ -46,15 +45,18 @@ metaliste = [
    "Solution"
]

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_corpus"
corpus_name = "de_raw_ticketCorpus"

logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"

# todo configuration file ?
# todo configuration file
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
@@ -98,7 +100,7 @@ def printRandomDoc(textacyCorpus):


def csv_to_textStream(path2csv: str, content_collumn_name: str):
def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
@@ -117,7 +119,7 @@ def csv_to_textStream(path2csv: str, content_collumn_name: str):
            yield lst[content_collumn]


def csv_to_DictStream(path2csv: str, metalist: [str]):
def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
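The bodies of these renamed generators are mostly elided by the diff view; below is a minimal sketch of the streaming pattern they implement, assuming a ticket CSV whose first row is a header naming the Description column and the metaliste fields. The delimiter and the helper name are illustrative, not taken from the repository.

import csv

def ticket_csv_streams_sketch(path2csv, content_column, metalist):
    # Yield (text, metadata) pairs from a ticket CSV export.
    # Assumption: the first row is a header naming every referenced column.
    with open(path2csv, newline="", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter=";")
        header = next(reader)
        content_idx = header.index(content_column)
        meta_idx = {name: header.index(name) for name in metalist}
        for row in reader:
            yield row[content_idx], {name: row[i] for name, i in meta_idx.items()}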
@@ -155,7 +157,8 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):
    """

    # save parser
    parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
@@ -171,34 +174,6 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):


def cleanTextstream(textstream):
    """
    :param textstream: string-gen
    :param parser: spacy-parser
    :yield: string-gen
    """

    for txt in textstream:
        yield textacy.preprocess.normalize_whitespace(txt)


def cleanDictstream(dictstream):
    """
    :param dictstream: dict-gen
    :param parser: spacy-parser
    :yield: dict-gen
    """

    for dic in dictstream:

        result = {}

        for key, value in dic.items():
            result[key] = textacy.preprocess.normalize_whitespace(value)
        yield result


def main():

    printlog("Corporization: {0}".format(datetime.now()))
@@ -222,8 +197,8 @@ def main():
    printlog("Add texts to textacy-corpus")

    de_corpus.add_texts(
        cleanTextstream(csv_to_textStream(path2de_csv, content_collumn_name)),
        cleanDictstream(csv_to_DictStream(path2de_csv, metaliste))
        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2de_csv, metaliste)
    )
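add_texts is fed two parallel generators here: a stream of raw ticket texts and a stream of metadata dicts, which textacy pairs up conceptually like a zip, one document at a time. A dependency-free sketch of that contract, with invented rows and zip used only to make the pairing visible:

toy_rows = [
    {"TicketNumber": "1", "Subject": "Drucker", "Description": "Der  Drucker druckt nicht."},
    {"TicketNumber": "2", "Subject": "WLAN", "Description": "Kein   WLAN im Büro."},
]

texts = (row["Description"] for row in toy_rows)
metas = ({k: row[k] for k in ("TicketNumber", "Subject")} for row in toy_rows)

for text, meta in zip(texts, metas):
    print(meta["TicketNumber"], "->", text)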
init.py (119 lines changed)
@@ -26,6 +26,7 @@ with open(config_ini) as f:
    config.read_file(f)
"""

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"


# config logging
@@ -80,7 +81,7 @@ def create_lemma_dict(lemmalist):

    return lemma_dict


"""
def build_thesaurus(path2lexicalentries, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
@@ -134,6 +135,94 @@ def build_thesaurus(path2lexicalentries, path2synsets):
    return thesaurus

#todo thesaurus in dictionary
"""

def build_thesaurus(path2lexicalentries):#, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    #synroot = syntree.getroot()

    word2synsets = {}
    template = {"w1": ["s1", "s2"]}

    for ro in lexroot:
        for elem in ro:
            if elem.tag == "LexicalEntry":
                lex_dictlist = [subentry.attrib for subentry in elem]

                synlist = []
                string = "WORD"

                for lex_dict in lex_dictlist:
                    if "synset" in lex_dict.keys():
                        synset = lex_dict["synset"]
                        synlist.append(synset)

                    if 'writtenForm' in lex_dict.keys():
                        string = (lex_dict["writtenForm"])

                        # replaceRockDots
                        string = re.sub(r'[ß]', "ss", string)
                        string = re.sub(r'[ö]', "oe", string)
                        string = re.sub(r'[ü]', "ue", string)
                        string = re.sub(r'[ä]', "ae", string)

                        # strip all periods
                        string = re.sub(r'[.]', "", string)

                        # strip everything in parentheses
                        string = re.sub(r"\((.*)\)", " ", string)

                        # normalize longer runs of whitespace
                        string = textacy.preprocess.normalize_whitespace(string)

                        string = string.lower().strip()

                word2synsets[string] = synlist

    synset2Words = {}
    template = {"s1": ["w1","w2"]}

    for word,synset in word2synsets.items():
        for syn in synset:
            if syn not in synset2Words.keys():
                synset2Words[syn] = [word]
            else:
                synset2Words[syn].append(word)

    # sort by the number of words in the strings
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))

    thesaurus = {}
    thesaurus_template = {"w1" : "mainsyn"}

    for word,synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
        except:
            pass
    return thesaurus

    """
    for r in synroot:
        for element in r:

            if element.tag == "Synset":
                synset = []
                attrib = element.attrib
                id = attrib["id"]

                if id not in synset2Words.keys():
                    synset2Words[id] = "WORD"
    """


def create_stopwordlist():
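To make the construction above concrete, here is a small self-contained sketch of the same word-to-synsets-to-main-synonym flow, run against a tiny in-memory XML fragment. The element and attribute names mirror what build_thesaurus reads (LexicalEntry children carrying writtenForm and synset attributes), but the words and synset ids are invented, and the sort-by-word-count step is omitted.

import xml.etree.ElementTree as ET

sample = """
<LexicalResource>
  <Lexicon>
    <LexicalEntry>
      <Lemma writtenForm="Rechner"/>
      <Sense synset="s_computer"/>
    </LexicalEntry>
    <LexicalEntry>
      <Lemma writtenForm="Computer"/>
      <Sense synset="s_computer"/>
    </LexicalEntry>
    <LexicalEntry>
      <Lemma writtenForm="tragbarer Rechner"/>
      <Sense synset="s_computer"/>
      <Sense synset="s_laptop"/>
    </LexicalEntry>
  </Lexicon>
</LexicalResource>
"""

lexroot = ET.fromstring(sample)

# word -> list of synset ids (the shape of word2synsets above)
word2synsets = {}
for lexicon in lexroot:
    for entry in lexicon:
        if entry.tag == "LexicalEntry":
            attribs = [child.attrib for child in entry]
            word = next(a["writtenForm"] for a in attribs if "writtenForm" in a).lower()
            word2synsets[word] = [a["synset"] for a in attribs if "synset" in a]

# synset id -> all words that carry it
synset2words = {}
for word, synsets in word2synsets.items():
    for syn in synsets:
        synset2words.setdefault(syn, []).append(word)

# word -> "main synonym": the first word registered under the word's first synset
thesaurus = {w: synset2words[s[0]][0] for w, s in word2synsets.items() if s}

print(thesaurus)
# {'rechner': 'rechner', 'computer': 'rechner', 'tragbarer rechner': 'rechner'}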
@@ -151,7 +240,7 @@ def create_stopwordlist():

    de_stop_words3 = list(map(replaceRockDots(),list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))

    de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("stopwords-de.txt"))))
    de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt"))))

    de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))
@@ -172,34 +261,29 @@ def words(text): return re.findall(r'\w+', text.lower())

##################################################################################################

# goal: dictionaries for the thesaurus, the correct-word list and the lemmas as loadable .json
# goal: dictionaries for the thesaurus, the correct-word list and the lemmas as loadable files
# plus a clean stop-word list and a noun list


# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"


# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"


path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl"
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list.pkl"
path2thesauruslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list.pkl"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl"
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
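The "# todo configuration file" note and the config_ini path in this repo suggest moving these hard-coded paths into config.ini. A possible shape, as a sketch only: the [filepath] section name is taken from the commented-out config.get("filepath", "thesauruspath") call in preprocessing.py, while the individual keys here are assumptions.

import configparser

config = configparser.ConfigParser()
with open("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini") as f:
    config.read_file(f)

# hypothetical keys; only "thesauruspath" appears anywhere in the existing code
lexicalentries = config.get("filepath", "lexicalentries")
path2thesaurusdict = config.get("filepath", "thesaurusdict")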
@@ -235,11 +319,10 @@ def main():


    printlog("Build and save Thesaurus")
    THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries, path2synsets=synsets)
    print(THESAURUS[0:10])
    THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries)


    save_obj(THESAURUS, path2thesauruslist)
    save_obj(THESAURUS, path2thesaurusdict)
preprocessing.py (140 lines changed)
@@ -38,6 +38,45 @@ csv.field_size_limit(sys.maxsize)


import pickle

def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path ):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)


def load_corpus(corpus_path, corpus_name, lang="de"):

    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    #load parser
    parserpath = corpus_path + str(lang) + '_parser'
    parser = spacy.load(parserpath)

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

print(load_corpus(corpus_path,corpus_name))


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
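A quick round-trip check for the pickle helpers above; the path and the toy contents are arbitrary, and the helpers append the .pkl extension themselves.

lemma_dict = {"abbekomme": "abbekommen", "rechners": "rechner"}

save_obj(lemma_dict, "/tmp/test_dictionaries")   # writes /tmp/test_dictionaries.pkl
loaded = load_obj("/tmp/test_dictionaries")

assert loaded == lemma_dict
print(loaded["abbekomme"])   # -> abbekommen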
@@ -63,36 +102,31 @@ logging.basicConfig(filename=logfile, level=logging.INFO)


# THESAURUS
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
THESAURUS = load_obj(path2thesaurusdict)

# thesauruspath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv"
# thesauruspath = config.get("filepath","thesauruspath")
# THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"

# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"

path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"


# SPELLCHECKING

parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')

DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")


"""
de_stop_words= set(
    list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
    list(textacy.fileio.read_file_lines("stopwords-de.txt"))
)

LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))

VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
"""

de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
    "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german')))
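With THESAURUS now a plain dict mapping a word to its main synonym, a lookup during preprocessing can fall back to the original token when a word is not covered; a small illustrative snippet, with the token invented:

token = "rechner"
token = THESAURUS.get(token, token)   # main synonym if known, otherwise the token unchanged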
@@ -126,15 +160,7 @@ specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORE
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)


import pickle

def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path ):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)
def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
@@ -238,21 +264,6 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):
        yield metadata


def save_corpus(corpus, corpus_path, corpus_name, parser=DE_PARSER):

    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)


############# filter tokens
@@ -751,6 +762,51 @@ def filterTokens(tokens, funclist):

    return tokens

def cleanString(string):
    # replaceRockDots
    string = re.sub(r'[ß]', "ss", string)
    string = re.sub(r'[ö]', "oe", string)
    string = re.sub(r'[ü]', "ue", string)
    string = re.sub(r'[ä]', "ae", string)

    # normalize longer runs of whitespace
    string = textacy.preprocess.normalize_whitespace(string)

    return(string)

def normalizeTextStream(textstream,clean=False):
    """
    :param textstream: string-gen
    :param parser: spacy-parser
    :yield: string-gen
    """

    for txt in textstream:
        if clean:
            yield cleanString(txt)
        else:
            yield textacy.preprocess.normalize_whitespace(txt)

def nomalizeDictstream(dictstream, clean=False):
    """
    :param dictstream: dict-gen
    :param parser: spacy-parser
    :yield: dict-gen
    """

    for dic in dictstream:

        result = {}

        for key, value in dic.items():
            if clean:
                result[key] = cleanString(value)
            else:
                result[key] = textacy.preprocess.normalize_whitespace(value)
        yield result
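A short usage sketch for the helpers above, assuming textacy is importable as it is in this module; the sample strings are invented and the outputs shown are the expected ones.

raw = ["Der   Drucker läuft nicht über WLAN.  ", "Herr Müller bittet um Rückruf."]

print(cleanString(raw[0]))
# -> Der Drucker laeuft nicht ueber WLAN.

for txt in normalizeTextStream(raw, clean=True):
    print(txt)
# -> Der Drucker laeuft nicht ueber WLAN.
# -> Herr Mueller bittet um Rueckruf.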
custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
                "hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb", "helfen",
testra.py (126 lines changed)
@@ -5,6 +5,7 @@ import json

import spacy
import textacy
from functools import reduce

start = time.time()
@@ -52,6 +53,8 @@ corpus.add_texts(

print(corpus)
"""


import pickle

def save_obj(obj, path):
@@ -63,31 +66,122 @@ def load_obj(path ):
    return pickle.load(f)


# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"

lemmalist = list(map(textacy.preprocess.normalize_whitespace,
    list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))))

lemma_dict = {}
def build_thesaurus(path2lexicalentries):#, path2synsets):
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
#syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

for line in lemmalist:
lexroot = lextree.getroot()
#synroot = syntree.getroot()

lem_word_pair = line.split()
lemma = lem_word_pair[0].strip().lower()
word2synsets = {}
template = {"w1": ["s1", "s2"]}

word = lem_word_pair[1].strip().lower()
lemma_dict[word] = lemma
for ro in lexroot:
for elem in ro:
if elem.tag == "LexicalEntry":
lex_dictlist = [subentry.attrib for subentry in elem]

print(lemma_dict["abbekomme"])
synlist = []
string = "WORD"

save_obj(lemma_dict, "test_dictionies")
for lex_dict in lex_dictlist:
if "synset" in lex_dict.keys():

loaded = load_obj("test_dictionies")
synset = lex_dict["synset"]
synlist.append(synset)

print(loaded["abbekomme"])
if 'writtenForm' in lex_dict.keys():
string = (lex_dict["writtenForm"])

# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)

# strip all periods
string = re.sub(r'[.]', "", string)

# strip everything in parentheses
string = re.sub(r"\((.*)\)", " ", string)

# normalize longer runs of whitespace
string = textacy.preprocess.normalize_whitespace(string)

string = string.lower().strip()

word2synsets[string] = synlist

synset2Words = {}
template = {"s1": ["w1","w2"]}

for word,synset in word2synsets.items():
for syn in synset:
if syn not in synset2Words.keys():
synset2Words[syn] = [word]
else:
synset2Words[syn].append(word)

# sort by the number of words in the strings
for synset in word2synsets.values():
synset.sort(key=lambda x: len(x.split()))

thesaurus = {}
thesaurus_template = {"w1" : "mainsyn"}

for word,synset in word2synsets.items():
try:
thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
except:
pass
return thesaurus

"""
for r in synroot:
for element in r:

if element.tag == "Synset":
synset = []
attrib = element.attrib
id = attrib["id"]

if id not in synset2Words.keys():
synset2Words[id] = "WORD"
"""


def load_corpus(corpus_path, corpus_name, lang="de"):
    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    # load parser
    parserpath = corpus_path + str(lang) + '_parser'
    parser = spacy.load(parserpath)

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

#todo load corpus from file; idea: load the stringstore and vocab

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

print(load_corpus(corpus_path, corpus_name))

"""
from postal.parser import parse_address
@@ -101,12 +195,6 @@ address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder-
print(parse_address(address))
"""



"""

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"