refactoring.
jetzt kommt der Umbau: cleaned corpus --> doctermmatrix --> LDA & labeled_lines.txt --> LLDA
This commit is contained in:
parent
db7ea1a72a
commit
412f25d8d8
24
cleaning.py
24
cleaning.py
|
@ -30,6 +30,16 @@ with open(config_ini) as f:
|
||||||
|
|
||||||
|
|
||||||
def clean(stringstream):#, NOUNS):
|
def clean(stringstream):#, NOUNS):
|
||||||
|
"""
|
||||||
|
fix bad unicode
|
||||||
|
seperate_words_on_regex `\=~%^&*()_+\[\]{};\'"|</>
|
||||||
|
normalize whitespace
|
||||||
|
remove linebreaks
|
||||||
|
replaceRockDöts
|
||||||
|
|
||||||
|
:param stringstream: str-gen
|
||||||
|
:return: string-gen
|
||||||
|
"""
|
||||||
|
|
||||||
#NOUNS = [n.lower() for n in NOUNS]
|
#NOUNS = [n.lower() for n in NOUNS]
|
||||||
|
|
||||||
|
@ -90,19 +100,22 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")
|
||||||
def cleanCorpus(corpus):
|
def cleanCorpus(corpus):
|
||||||
logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))
|
logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
ressources_path = FILEPATH + "ressources/"
|
ressources_path = FILEPATH + "ressources/"
|
||||||
|
|
||||||
|
|
||||||
path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
|
path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
|
||||||
|
|
||||||
|
|
||||||
#NOUNS = load_obj(path2nouns_list)
|
#NOUNS = load_obj(path2nouns_list)
|
||||||
#noun_disjunction = '|'.join(NOUNS)
|
#noun_disjunction = '|'.join(NOUNS)
|
||||||
#nouns_tuples = []
|
#nouns_tuples = []
|
||||||
#for n in NOUNS:
|
#for n in NOUNS:
|
||||||
# nouns_tuples.append((n.lower(),n))
|
# nouns_tuples.append((n.lower(),n))
|
||||||
|
"""
|
||||||
cleanCorpus_name = corpus.lang + "_clean"
|
|
||||||
|
|
||||||
|
|
||||||
|
# load Corpus
|
||||||
raw_corpus = corpus
|
raw_corpus = corpus
|
||||||
parser = corpus.spacy_lang
|
parser = corpus.spacy_lang
|
||||||
|
|
||||||
|
@ -115,13 +128,14 @@ def cleanCorpus(corpus):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# leere docs aus corpi kicken
|
# leere docs aus corpus kicken
|
||||||
cleaned_corpus.remove(lambda doc: len(doc) == 0)
|
cleaned_corpus.remove(lambda doc: len(doc) == 0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#save corpus
|
#save corpus
|
||||||
|
cleanCorpus_name = corpus.lang + "_clean"
|
||||||
save_corpus(corpus=cleaned_corpus, corpus_path=corpus_de_path, corpus_name=cleanCorpus_name)
|
save_corpus(corpus=cleaned_corpus, corpus_path=corpus_de_path, corpus_name=cleanCorpus_name)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -90,7 +90,16 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")
|
||||||
|
|
||||||
|
|
||||||
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0):
|
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0):
|
||||||
|
"""
|
||||||
|
Use textacy to create a Corpus out of the ITMC-Ticket.csv
|
||||||
|
|
||||||
|
:param path2_csv: str
|
||||||
|
:param corpus_path: str
|
||||||
|
:param content_collumn_name: str the Collumn which is used as the Docs text
|
||||||
|
:param lang: str standard 2-letter language
|
||||||
|
:param printrandom: print n random Documents
|
||||||
|
:return: textacy.Corpus
|
||||||
|
"""
|
||||||
|
|
||||||
# print paths
|
# print paths
|
||||||
path_csv_split = path2_csv.split("/")
|
path_csv_split = path2_csv.split("/")
|
||||||
|
|
215
init.py
215
init.py
|
@ -28,20 +28,20 @@ with open(config_ini) as f:
|
||||||
|
|
||||||
def create_lemma_dict(path2lemmalist):
|
def create_lemma_dict(path2lemmalist):
|
||||||
"""
|
"""
|
||||||
Creates a dict out of a file a la:
|
Creates a dict out of a txt file a la:
|
||||||
|
|
||||||
l1 w1
|
l1 w1
|
||||||
l1 w2
|
l1 w2
|
||||||
l2 w1
|
l2 w1
|
||||||
l2 w2
|
l2 w2
|
||||||
|
|
||||||
Result will be used as lemma_dict["word"] --> lemma
|
Result will be used as lemma_dict[word] --> lemma
|
||||||
|
|
||||||
:param path2lemmalist: str
|
:param path2lemmalist: str
|
||||||
:return: dictionary
|
:return: dictionary
|
||||||
"""
|
"""
|
||||||
lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(
|
file_gen = textacy.fileio.read_file_lines(path2lemmalist)
|
||||||
textacy.fileio.read_file_lines(path2lemmalist))))
|
lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(file_gen)))
|
||||||
|
|
||||||
lemma_dict = {}
|
lemma_dict = {}
|
||||||
|
|
||||||
|
@ -63,7 +63,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
|
||||||
Creates a dict out of the deWordNet
|
Creates a dict out of the deWordNet
|
||||||
https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml
|
https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml
|
||||||
|
|
||||||
Result will be used as lemma_dict["word"] --> lemma
|
Result will be used as thesaurus[word] --> main_synonym
|
||||||
|
|
||||||
:param path2wordnet: str
|
:param path2wordnet: str
|
||||||
:param returnall: bool if True, also return , word2synsets, synset2Words
|
:param returnall: bool if True, also return , word2synsets, synset2Words
|
||||||
|
@ -73,6 +73,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
|
||||||
|
|
||||||
lexroot = lextree.getroot()
|
lexroot = lextree.getroot()
|
||||||
|
|
||||||
|
# Build word2synsets
|
||||||
word2synsets = {}
|
word2synsets = {}
|
||||||
template = {"w1": ["s1", "s2"]}
|
template = {"w1": ["s1", "s2"]}
|
||||||
|
|
||||||
|
@ -82,7 +83,6 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
|
||||||
lex_dictlist = [subentry.attrib for subentry in elem]
|
lex_dictlist = [subentry.attrib for subentry in elem]
|
||||||
|
|
||||||
# idee technischer thesaurus
|
# idee technischer thesaurus
|
||||||
# idee hauptsynonmy muss einzelnes wort sein
|
|
||||||
|
|
||||||
synlist = []
|
synlist = []
|
||||||
string = "WORD"
|
string = "WORD"
|
||||||
|
@ -96,55 +96,92 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
|
||||||
if 'writtenForm' in lex_dict.keys():
|
if 'writtenForm' in lex_dict.keys():
|
||||||
string = (lex_dict["writtenForm"])
|
string = (lex_dict["writtenForm"])
|
||||||
|
|
||||||
if string == "Kennwort":
|
|
||||||
pass
|
|
||||||
|
|
||||||
# replaceRockDots
|
# replaceRockDots
|
||||||
string = re.sub(r'[ß]', "ss", string)
|
string = re.sub(r'[ß]', "ss", string)
|
||||||
string = re.sub(r'[ö]', "oe", string)
|
string = re.sub(r'[ö]', "oe", string)
|
||||||
|
string = re.sub(r'[Ö]', "Oe", string)
|
||||||
|
|
||||||
string = re.sub(r'[ü]', "ue", string)
|
string = re.sub(r'[ü]', "ue", string)
|
||||||
|
string = re.sub(r'[Ü]', "Ue", string)
|
||||||
|
|
||||||
string = re.sub(r'[ä]', "ae", string)
|
string = re.sub(r'[ä]', "ae", string)
|
||||||
|
string = re.sub(r'[Ä]', "ae", string)
|
||||||
|
|
||||||
# alle punkte raus
|
# alle punkte raus
|
||||||
string = re.sub(r'[.]', "", string)
|
string = re.sub(r'[.]', "", string)
|
||||||
|
|
||||||
# alles in klammern raus
|
# alles in klammern raus
|
||||||
string = re.sub(r"\((.*)\)", " ", string)
|
if "auptform" in string:
|
||||||
|
string = re.sub(r"\((.*)\)", " ", string)
|
||||||
|
string = string + " (hauptform)" # evtl. als hauptform merken
|
||||||
|
else:
|
||||||
|
string = re.sub(r"\((.*)\)", " ", string)
|
||||||
|
|
||||||
# längeres leerzeichen normalisieren
|
# längeres leerzeichen normalisieren
|
||||||
string = textacy.preprocess.normalize_whitespace(string)
|
string = textacy.preprocess.normalize_whitespace(string)
|
||||||
|
|
||||||
string = string.lower().strip()
|
string = string.strip()#.lower()
|
||||||
|
|
||||||
word2synsets[string] = synlist
|
if string != '':
|
||||||
|
word2synsets[string] = synlist
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Build synset2Words
|
||||||
synset2Words = {}
|
synset2Words = {}
|
||||||
template = {"s1": ["w1","w2"]}
|
template = {"s1": ["w1","w2"]}
|
||||||
|
|
||||||
for word,synset in word2synsets.items():
|
for word,synset in word2synsets.items():
|
||||||
if word != '':
|
if word != '':
|
||||||
|
|
||||||
|
|
||||||
for syn in synset:
|
for syn in synset:
|
||||||
if syn not in synset2Words.keys():
|
if syn not in synset2Words.keys():
|
||||||
synset2Words[syn] = [word]
|
synset2Words[syn] = [word]
|
||||||
else:
|
else:
|
||||||
synset2Words[syn].append(word)
|
synset2Words[syn].append(word)
|
||||||
|
|
||||||
# nach anzhal der wörter in den strings sortieren
|
|
||||||
for synset in word2synsets.values():
|
# Sortieren
|
||||||
synset.sort(key=lambda x: len(x.split()))
|
for words in synset2Words.values():
|
||||||
|
words.sort(key=lambda w: len(w.split())) # nach anzhal der wörter in den strings (weniger nach vorne)
|
||||||
|
for w in words:
|
||||||
|
if "(hauptform)" in w:
|
||||||
|
to_insert = re.sub(r"\((.*)\)", " ", w).strip()
|
||||||
|
|
||||||
|
words.remove(w)
|
||||||
|
words.insert(0, to_insert) # Hauptform evtl. nach vorne
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
thesaurus = {}
|
thesaurus = {}
|
||||||
thesaurus_template = {"w1" : "mainsyn"}
|
thesaurus_template = {"w1" : "mainsyn"}
|
||||||
|
# word --> [synset1, synset2, .. ] --> synset1 --> [syn1, syn2, ... ] --> syn1 / mainsyn
|
||||||
|
|
||||||
|
|
||||||
for word,synset in word2synsets.items():
|
for word,synsets in word2synsets.items(): #word , [synset1, synset2, .. ]
|
||||||
try:
|
try:
|
||||||
thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym #todo nach (hauptform) suchen?
|
if "Passwort" in word:
|
||||||
|
x=2
|
||||||
|
|
||||||
|
first_synset = synsets[0] #erstes synset wählen . praktischer Grund
|
||||||
|
|
||||||
|
syns = synset2Words[first_synset] # [syn1, syn2, ... ]
|
||||||
|
|
||||||
|
first_syn = syns[0] # erstes synonym (evtl. Hauptform) wählen
|
||||||
|
|
||||||
|
word = re.sub(r"\((.*)\)", " ", word).strip() #(hautpform weg)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
thesaurus[word] = first_syn #Ann.: erstes synonym ist das Hauptsynonym
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if returnall:
|
if returnall:
|
||||||
return thesaurus, word2synsets, synset2Words
|
return thesaurus, word2synsets, synset2Words
|
||||||
else:
|
else:
|
||||||
|
@ -237,39 +274,8 @@ def build_words_for_spellchecking(path2words):
|
||||||
|
|
||||||
##################################################################################################
|
##################################################################################################
|
||||||
|
|
||||||
# THESAURUS
|
|
||||||
ressources_path = FILEPATH + "ressources/"
|
|
||||||
path2wordnet = ressources_path + config.get("thesaurus","input")
|
|
||||||
path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")
|
|
||||||
|
|
||||||
|
|
||||||
# SPELLCHECKING
|
|
||||||
path2words_file = ressources_path + config.get("spellchecking","input")
|
|
||||||
path2wordlist = ressources_path + config.get("spellchecking","pickle_file")
|
|
||||||
|
|
||||||
|
|
||||||
# LEMMA
|
|
||||||
path2lemma_file = ressources_path + config.get("lemmatization","input")
|
|
||||||
path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")
|
|
||||||
|
|
||||||
# NOMEN
|
|
||||||
nouns0 = ressources_path + config.get("nouns","input")
|
|
||||||
nouns1 = ressources_path + config.get("nouns","input1")
|
|
||||||
nouns2 = ressources_path + config.get("nouns","input2")
|
|
||||||
path2nouns_list = ressources_path + config.get("nouns","pickle_file")
|
|
||||||
|
|
||||||
|
|
||||||
# VORNAMEN
|
|
||||||
firstnames_txt = ressources_path + config.get("firstnames","input")
|
|
||||||
path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
|
|
||||||
|
|
||||||
# STOPWORDS
|
|
||||||
stop1 = ressources_path + config.get("de_stopwords","input1")
|
|
||||||
stop2 = ressources_path + config.get("de_stopwords","input2")
|
|
||||||
stop3 = ressources_path + config.get("de_stopwords","input3")
|
|
||||||
path2stopwordlist_de = ressources_path + config.get("de_stopwords","pickle_file")
|
|
||||||
|
|
||||||
path2stopwordlist_en = ressources_path + config.get("en_stopwords","pickle_file")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -277,44 +283,135 @@ def main():
|
||||||
start = time.time()
|
start = time.time()
|
||||||
logprint("Init: {0}".format(datetime.now()))
|
logprint("Init: {0}".format(datetime.now()))
|
||||||
|
|
||||||
|
ressources_path = FILEPATH + "ressources/"
|
||||||
""""""
|
|
||||||
logprint("create and save lemma_dict")
|
|
||||||
lemma_dict = create_lemma_dict(path2lemma_file)
|
|
||||||
save_obj(lemma_dict, path2lemmadict)
|
|
||||||
|
|
||||||
|
|
||||||
logprint("Build and save Wordlist for Spellchecking")
|
|
||||||
words = build_words_for_spellchecking(path2words_file)
|
|
||||||
save_obj(words, path2wordlist)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# THESAURUS
|
||||||
logprint("Build and save Thesaurus")
|
logprint("Build and save Thesaurus")
|
||||||
|
|
||||||
|
path2wordnet = ressources_path + config.get("thesaurus", "input")
|
||||||
thesaurus = build_thesaurus_dict(path2wordnet)
|
thesaurus = build_thesaurus_dict(path2wordnet)
|
||||||
|
|
||||||
|
path2thesaurus_dict = ressources_path + config.get("thesaurus", "pickle_file")
|
||||||
save_obj(thesaurus, path2thesaurus_dict)
|
save_obj(thesaurus, path2thesaurus_dict)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# LEMMA
|
||||||
|
logprint("create and save lemma_dict")
|
||||||
|
|
||||||
|
path2lemma_file = ressources_path + config.get("lemmatization", "input")
|
||||||
|
lemma_dict = create_lemma_dict(path2lemma_file)
|
||||||
|
|
||||||
|
path2lemmadict = ressources_path + config.get("lemmatization", "pickle_file")
|
||||||
|
save_obj(lemma_dict, path2lemmadict)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# SPELLCHECKING
|
||||||
|
logprint("Build and save Wordlist for Spellchecking")
|
||||||
|
|
||||||
|
path2words_file = ressources_path + config.get("spellchecking", "input")
|
||||||
|
words = build_words_for_spellchecking(path2words_file)
|
||||||
|
|
||||||
|
path2words_counter = ressources_path + config.get("spellchecking", "pickle_file")
|
||||||
|
save_obj(words, path2words_counter)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# STOPWORDS
|
||||||
logprint("Build and save stoppwortliste")
|
logprint("Build and save stoppwortliste")
|
||||||
|
|
||||||
|
stop1 = ressources_path + config.get("de_stopwords", "input1")
|
||||||
|
stop2 = ressources_path + config.get("de_stopwords", "input2")
|
||||||
|
stop3 = ressources_path + config.get("de_stopwords", "input3")
|
||||||
de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3)
|
de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3)
|
||||||
|
|
||||||
|
path2stopwordlist_de = ressources_path + config.get("de_stopwords", "pickle_file")
|
||||||
save_obj(de_stop_words, path2stopwordlist_de)
|
save_obj(de_stop_words, path2stopwordlist_de)
|
||||||
|
|
||||||
|
path2stopwordlist_en = ressources_path + config.get("en_stopwords", "pickle_file")
|
||||||
save_obj(en_stop_words, path2stopwordlist_en)
|
save_obj(en_stop_words, path2stopwordlist_en)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# NOMEN
|
||||||
logprint("Build and save nomenliste")
|
logprint("Build and save nomenliste")
|
||||||
#nouns = list_from_files(nouns1,nouns2)
|
|
||||||
nouns = list_from_files(nouns0)
|
nouns0 = ressources_path + config.get("nouns", "input")
|
||||||
|
nouns1 = ressources_path + config.get("nouns", "input1")
|
||||||
|
nouns2 = ressources_path + config.get("nouns", "input2")
|
||||||
|
nouns = list_from_files(nouns0,nouns1,nouns2)
|
||||||
|
|
||||||
|
path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
|
||||||
save_obj(nouns, path2nouns_list)
|
save_obj(nouns, path2nouns_list)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# VORNAMEN
|
||||||
logprint("Build and save firstnameslist")
|
logprint("Build and save firstnameslist")
|
||||||
|
|
||||||
|
firstnames_txt = ressources_path + config.get("firstnames", "input")
|
||||||
vornamen = list_from_files(firstnames_txt)
|
vornamen = list_from_files(firstnames_txt)
|
||||||
|
|
||||||
|
path2firstnameslist = ressources_path + config.get("firstnames", "pickle_file")
|
||||||
save_obj(vornamen, path2firstnameslist)
|
save_obj(vornamen, path2firstnameslist)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
end = time.time()
|
end = time.time()
|
||||||
logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60))
|
logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60))
|
||||||
|
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
6
main.py
6
main.py
|
@ -30,15 +30,13 @@ start = time.time()
|
||||||
|
|
||||||
# todo modelle testen
|
# todo modelle testen
|
||||||
|
|
||||||
|
# todo ticket2kbkeys, subj, cats in init.py
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
logprint("main.py started at {}".format(datetime.now()))
|
logprint("main.py started at {}".format(datetime.now()))
|
||||||
|
|
||||||
|
|
||||||
|
init.main()
|
||||||
#init.main()
|
|
||||||
logprint("")
|
logprint("")
|
||||||
|
|
||||||
raw_corpus = corporization.main()
|
raw_corpus = corporization.main()
|
||||||
|
|
|
@ -217,7 +217,6 @@ def save_corpus(corpus, corpus_path, corpus_name):
|
||||||
:param corpus_path: str
|
:param corpus_path: str
|
||||||
:param corpus_name: str (should content the language like "_de_")
|
:param corpus_name: str (should content the language like "_de_")
|
||||||
"""
|
"""
|
||||||
#todo pos und ner tagging speichern
|
|
||||||
|
|
||||||
# save parser
|
# save parser
|
||||||
parser = corpus.spacy_lang
|
parser = corpus.spacy_lang
|
||||||
|
|
|
@ -126,7 +126,7 @@ def remove_first_names():
|
||||||
def remove_addresses(string):
|
def remove_addresses(string):
|
||||||
pass # todo remove_addresses idee postal.parser und zu metadaten hinzufügen
|
pass # todo remove_addresses idee postal.parser und zu metadaten hinzufügen
|
||||||
|
|
||||||
def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
|
def lemmatizeWord(word,lemma_dict=LEMMAS,n=5):
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
try:
|
try:
|
||||||
word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
|
word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
|
||||||
|
@ -134,26 +134,29 @@ def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
|
||||||
print(word)
|
print(word)
|
||||||
return word
|
return word
|
||||||
|
|
||||||
def getFirstSynonym(word, thesaurus=THESAURUS,n=3):
|
|
||||||
|
def getFirstSynonym(word, thesaurus=THESAURUS, n=3):
|
||||||
|
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
word = thesaurus[word.lower()] if word.lower() in thesaurus.keys() else word.lower()
|
if word in thesaurus.keys():
|
||||||
|
return thesaurus[word]
|
||||||
|
|
||||||
|
elif word.title() in thesaurus.keys():
|
||||||
|
return thesaurus[word.title()]
|
||||||
|
|
||||||
|
elif word.lower() in thesaurus.keys():
|
||||||
|
return thesaurus[word.lower()]
|
||||||
|
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
except:
|
except:
|
||||||
print(word)
|
print("THESAURUSFEHLER BEI: {}".format(word))
|
||||||
return word
|
return word
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
if not isinstance(word, str):
|
|
||||||
return str(word)
|
|
||||||
|
|
||||||
word = word.lower()
|
|
||||||
if word in thesaurus.keys():
|
|
||||||
return thesaurus[word]
|
|
||||||
else:
|
|
||||||
return str(word)
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
########################## Spellchecking ##########################################
|
########################## Spellchecking ##########################################
|
||||||
|
@ -328,6 +331,15 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path")
|
||||||
|
|
||||||
|
|
||||||
def extract_from_corpus(corpus):
|
def extract_from_corpus(corpus):
|
||||||
|
"""
|
||||||
|
Extract from each doc from a corpus a string containing disired token_texts
|
||||||
|
|
||||||
|
|
||||||
|
:param corpus: textacy.Corpus
|
||||||
|
:return: string-gen
|
||||||
|
"""
|
||||||
|
|
||||||
|
# WHITELIST erstellen. Enthält zumindest die evtuellen Topics
|
||||||
|
|
||||||
WHITELIST = ["boss", "sap", "firefox"] #todo autogenerierung relv. techn. begriffe
|
WHITELIST = ["boss", "sap", "firefox"] #todo autogenerierung relv. techn. begriffe
|
||||||
|
|
||||||
|
@ -337,6 +349,7 @@ def extract_from_corpus(corpus):
|
||||||
|
|
||||||
WHITELIST = WHITELIST + kb_cats + kb_keys + kb_subjs
|
WHITELIST = WHITELIST + kb_cats + kb_keys + kb_subjs
|
||||||
|
|
||||||
|
|
||||||
THESAURUS = load_obj(path2thesaurus_dict)
|
THESAURUS = load_obj(path2thesaurus_dict)
|
||||||
#WORDS = load_obj(path2wordsdict)
|
#WORDS = load_obj(path2wordsdict)
|
||||||
LEMMAS = load_obj(path2lemmadict)
|
LEMMAS = load_obj(path2lemmadict)
|
||||||
|
@ -344,6 +357,9 @@ def extract_from_corpus(corpus):
|
||||||
#EN_STOP_WORDS = load_obj(path2ENstopwordlist)
|
#EN_STOP_WORDS = load_obj(path2ENstopwordlist)
|
||||||
VORNAMEN = load_obj(path2firstnameslist)
|
VORNAMEN = load_obj(path2firstnameslist)
|
||||||
|
|
||||||
|
ents_boss = []
|
||||||
|
ents_sap = []
|
||||||
|
|
||||||
for doc in corpus:
|
for doc in corpus:
|
||||||
result = []
|
result = []
|
||||||
|
|
||||||
|
@ -353,10 +369,16 @@ def extract_from_corpus(corpus):
|
||||||
|
|
||||||
for tok in doc:
|
for tok in doc:
|
||||||
|
|
||||||
if tok.lower_ =="boss" or tok.lower_ =="sap":
|
|
||||||
print(tok.lower_+": "+tok.ent_type_)
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
if tok.lower_ =="boss":
|
||||||
|
ents_boss.append(tok.ent_type_)
|
||||||
|
|
||||||
|
if tok.lower_ =="sap":
|
||||||
|
ents_sap.append(tok.ent_type_)
|
||||||
|
"""
|
||||||
|
|
||||||
|
# wenn in whitelist, direkt übernehmen
|
||||||
if tok.lower_ in WHITELIST:
|
if tok.lower_ in WHITELIST:
|
||||||
result.append(tok.lower_)
|
result.append(tok.lower_)
|
||||||
|
|
||||||
|
@ -372,25 +394,27 @@ def extract_from_corpus(corpus):
|
||||||
or tok.lower_ in VORNAMEN:
|
or tok.lower_ in VORNAMEN:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# cut after footer
|
|
||||||
if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]: # fehler schneidet bei INC40506 das meiste weg
|
|
||||||
break
|
|
||||||
|
|
||||||
# boss/SAP ent_type = 'ORG' oder '' (ein-weimal LOC oder PERSON)
|
|
||||||
|
# cut after footer
|
||||||
|
if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]: # fehler schneidet bei zB INC40506 das meiste weg
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if tok.pos_ in ["NOUN"] \
|
if tok.pos_ in ["NOUN"] \
|
||||||
or tok.ent_type_ in ["NORP","FACILITY","ORG","PRODUCT","WORK_OF_ART"]:
|
or tok.ent_type_ in ["NORP","FACILITY","ORG","PRODUCT","WORK_OF_ART","LOC"]:
|
||||||
#or tok.dep_ == "ROOT":
|
#or tok.dep_ == "ROOT":
|
||||||
# or tok.lower_ in NOUNS \ #,"PERSON"] \
|
# or tok.lower_ in NOUNS \ #,"PERSON"] \
|
||||||
toktext = tok.lower_
|
toktext = tok.lower_
|
||||||
|
|
||||||
|
|
||||||
toktext = lemmatized_word
|
toktext = lemmatized_word
|
||||||
|
|
||||||
|
# hauptsynonym bilden idee zwar das Huaptsyn bilden und zählen aber die originalen wörter in den llda algo geben
|
||||||
"""
|
"""
|
||||||
first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS)
|
first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS)
|
||||||
if first_synonym is not None:
|
if first_synonym is not None or first_synonym != '':
|
||||||
toktext = first_synonym if len(first_synonym.split()) == 1 else toktext
|
toktext = first_synonym if len(first_synonym.split()) == 1 else toktext
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -402,6 +426,14 @@ def extract_from_corpus(corpus):
|
||||||
yield " ".join(result)
|
yield " ".join(result)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
print(list(set(ents_sap)))
|
||||||
|
['', 'ORG', 'PERSON', 'LOC']
|
||||||
|
|
||||||
|
print(list(set(ents_boss)))
|
||||||
|
['', 'ORG', 'PERSON', 'LOC']
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -433,6 +465,9 @@ def preprocessCorpus(corpus, clean_in_meta):
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# idee labeled_lines.txt enthählt bigramme mit unterstrich
|
||||||
|
# todo preCorpus weg. llda bekommt labaled_lines.txt und lda doctermamtrix
|
||||||
|
|
||||||
|
|
||||||
# leere docs aus corpi kicken
|
# leere docs aus corpi kicken
|
||||||
pre_corpus.remove(lambda doc: len(doc) == 0)
|
pre_corpus.remove(lambda doc: len(doc) == 0)
|
||||||
|
|
|
@ -71439,7 +71439,7 @@
|
||||||
</Sense>
|
</Sense>
|
||||||
</LexicalEntry>
|
</LexicalEntry>
|
||||||
<LexicalEntry id="w10531">
|
<LexicalEntry id="w10531">
|
||||||
<Lemma writtenForm="Passwort" partOfSpeech="n"/>
|
<Lemma writtenForm="Passwort (Hauptform)" partOfSpeech="n"/>
|
||||||
<Sense id="w10531_2177-n" synset="de-2177-n">
|
<Sense id="w10531_2177-n" synset="de-2177-n">
|
||||||
</Sense>
|
</Sense>
|
||||||
</LexicalEntry>
|
</LexicalEntry>
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
kennwort kennworts
|
||||||
|
kennwort kennwortes
|
||||||
a as
|
a as
|
||||||
aachen aachens
|
aachen aachens
|
||||||
aal aale
|
aal aale
|
||||||
|
|
|
@ -571,14 +571,14 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
|
||||||
count_dict[kb] = 1
|
count_dict[kb] = 1
|
||||||
|
|
||||||
sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
|
sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
|
||||||
|
"""
|
||||||
for k,v in sorted_dict:
|
for k,v in sorted_dict:
|
||||||
subs = kb2subjects_dict[k]
|
subs = kb2subjects_dict[k]
|
||||||
keys = kb2keywords_dict[k]
|
keys = kb2keywords_dict[k]
|
||||||
print(subs, keys , v) # frage wieviele tickets pro topic?
|
print(subs, keys , v) # frage wieviele tickets pro topic?
|
||||||
|
|
||||||
print("kb_entrys used: {}".format(len(sorted_dict))) # frage wie viele kb_entry's insg genutzt?: 155
|
print("kb_entrys used: {}".format(len(sorted_dict))) # frage wie viele kb_entry's insg genutzt?: 155
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
labelist = ticket2keywords_dict.values()
|
labelist = ticket2keywords_dict.values()
|
||||||
|
@ -644,7 +644,7 @@ def load_from_labled_lines(path):
|
||||||
|
|
||||||
#idee plan
|
#idee plan
|
||||||
# clean laden, pre laden
|
# clean laden, pre laden
|
||||||
# unigramme und num/wort-bigramme doc-term # frage wie geht llda mit bigrammen um? idee notfalls bigramme als geklammerte "wörter"
|
# unigramme und num/wort-bigramme doc-term # frage wie geht llda mit bigrammen um? idee bigramme mit _ verbinden
|
||||||
# nimm nur ngrams wo midn. ein token in pre vorkommt
|
# nimm nur ngrams wo midn. ein token in pre vorkommt
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue