refactoring.

Now comes the restructuring: cleaned corpus --> doc-term matrix --> LDA, and labaled_lines.txt --> LLDA
jannis.grundmann 2017-12-11 12:10:40 +01:00
parent db7ea1a72a
commit 412f25d8d8
16 changed files with 340 additions and 126 deletions

View File

@@ -30,6 +30,16 @@ with open(config_ini) as f:
 def clean(stringstream):#, NOUNS):
+    """
+    fix bad unicode
+    seperate_words_on_regex `\=~%^&*()_+\[\]{};\'"|</>
+    normalize whitespace
+    remove linebreaks
+    replaceRockDöts
+    :param stringstream: str-gen
+    :return: string-gen
+    """
     #NOUNS = [n.lower() for n in NOUNS]
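For orientation: the body of clean() is outside this hunk. A minimal standalone sketch of the steps the new docstring lists (plain re, no textacy; function names and the exact regex are illustrative, and the "fix bad unicode" step is left out):

import re

def replace_rock_dots(s):
    # what the project calls "replaceRockDöts": rewrite umlauts and ß
    for old, new in (("ß", "ss"), ("ö", "oe"), ("ü", "ue"), ("ä", "ae"),
                     ("Ö", "Oe"), ("Ü", "Ue"), ("Ä", "Ae")):
        s = s.replace(old, new)
    return s

def clean_sketch(stringstream):
    # str-generator in, str-generator out, mirroring the docstring above
    for text in stringstream:
        text = re.sub(r'[`\\=~%^&*()_+\[\]{};\'"|<>/]', " ", text)  # separate words on the listed special chars
        text = text.replace("\n", " ").replace("\r", " ")           # remove linebreaks
        text = replace_rock_dots(text)
        text = re.sub(r"\s+", " ", text).strip()                    # normalize whitespace
        yield text

print(list(clean_sketch(["Grüße,\nbitte   Passwort zurücksetzen!"])))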
@@ -90,19 +100,22 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")
 def cleanCorpus(corpus):
     logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))
+    """
     ressources_path = FILEPATH + "ressources/"
     path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
     #NOUNS = load_obj(path2nouns_list)
     #noun_disjunction = '|'.join(NOUNS)
     #nouns_tuples = []
     #for n in NOUNS:
     #    nouns_tuples.append((n.lower(),n))
+    """
+    cleanCorpus_name = corpus.lang + "_clean"
+    # load Corpus
     raw_corpus = corpus
     parser = corpus.spacy_lang
@@ -115,13 +128,14 @@ def cleanCorpus(corpus):
     )
-    # leere docs aus corpi kicken
+    # leere docs aus corpus kicken
     cleaned_corpus.remove(lambda doc: len(doc) == 0)
     #save corpus
-    cleanCorpus_name = corpus.lang + "_clean"
     save_corpus(corpus=cleaned_corpus, corpus_path=corpus_de_path, corpus_name=cleanCorpus_name)
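The pattern in this hunk (drop empty docs, then save under a "<lang>_clean" name) in isolation; save_corpus is passed in as a stand-in for the project's helper, so this is a sketch rather than the module code:

def clean_and_save(corpus, corpus_path, save_corpus):
    # kick empty docs out of the corpus, as in the hunk above
    corpus.remove(lambda doc: len(doc) == 0)
    # name the result after the language, e.g. "de_clean"
    cleanCorpus_name = corpus.lang + "_clean"
    save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=cleanCorpus_name)
    return corpus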

View File

@@ -90,7 +90,16 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")
 def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0):
+    """
+    Use textacy to create a Corpus out of the ITMC-Ticket.csv
+    :param path2_csv: str
+    :param corpus_path: str
+    :param content_collumn_name: str the Collumn which is used as the Docs text
+    :param lang: str standard 2-letter language
+    :param printrandom: print n random Documents
+    :return: textacy.Corpus
+    """
     # print paths
     path_csv_split = path2_csv.split("/")
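ticketcsv2Corpus itself needs textacy and the real ITMC CSV. A hedged sketch of just the CSV-to-texts step it builds on (the column name, delimiter, ticket id and row content here are made up for illustration):

import csv, io

csv_data = io.StringIO("TicketNumber;Description\nINC00001;Bitte Passwort zuruecksetzen\n")
reader = csv.DictReader(csv_data, delimiter=";")
texts = [row["Description"] for row in reader]   # the content column becomes the Doc text
print(texts)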

init.py
View File

@@ -28,20 +28,20 @@ with open(config_ini) as f:
 def create_lemma_dict(path2lemmalist):
     """
-    Creates a dict out of a file a la:
+    Creates a dict out of a txt file a la:
         l1 w1
         l1 w2
         l2 w1
         l2 w2
-    Result will be used as lemma_dict["word"] --> lemma
+    Result will be used as lemma_dict[word] --> lemma
     :param path2lemmalist: str
     :return: dictionary
     """
-    lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(
-        textacy.fileio.read_file_lines(path2lemmalist))))
+    file_gen = textacy.fileio.read_file_lines(path2lemmalist)
+    lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(file_gen)))
     lemma_dict = {}
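A toy run of the mapping create_lemma_dict produces: each line holds "<lemma> <word-form>", and the result maps word-form to lemma. The sample lines are taken from the lemma list touched later in this commit; the parsing shown is a sketch, not the function's actual code:

lines = ["kennwort kennworts", "kennwort kennwortes", "aachen aachens"]
lemma_dict = {}
for line in lines:
    lemma, word = line.split()[:2]   # each line: "<lemma> <word-form>"
    lemma_dict[word] = lemma         # lemma_dict[word] --> lemma
print(lemma_dict["kennworts"])       # -> "kennwort"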
@@ -63,7 +63,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
     Creates a dict out of the deWordNet
     https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml
-    Result will be used as lemma_dict["word"] --> lemma
+    Result will be used as thesaurus[word] --> main_synonym
     :param path2wordnet: str
     :param returnall: bool if True, also return , word2synsets, synset2Words
@@ -73,6 +73,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
     lexroot = lextree.getroot()
+    # Build word2synsets
     word2synsets = {}
     template = {"w1": ["s1", "s2"]}
@@ -82,7 +83,6 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
         lex_dictlist = [subentry.attrib for subentry in elem]
         # idee technischer thesaurus
-        # idee hauptsynonmy muss einzelnes wort sein
         synlist = []
         string = "WORD"
@@ -96,55 +96,92 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
             if 'writtenForm' in lex_dict.keys():
                 string = (lex_dict["writtenForm"])

+                if string == "Kennwort":
+                    pass
+
                 # replaceRockDots
                 string = re.sub(r'[ß]', "ss", string)
                 string = re.sub(r'[ö]', "oe", string)
+                string = re.sub(r'[Ö]', "Oe", string)
                 string = re.sub(r'[ü]', "ue", string)
+                string = re.sub(r'[Ü]', "Ue", string)
                 string = re.sub(r'[ä]', "ae", string)
+                string = re.sub(r'[Ä]', "ae", string)

                 # alle punkte raus
                 string = re.sub(r'[.]', "", string)

                 # alles in klammern raus
-                string = re.sub(r"\((.*)\)", " ", string)
+                if "auptform" in string:
+                    string = re.sub(r"\((.*)\)", " ", string)
+                    string = string + " (hauptform)" # evtl. als hauptform merken
+                else:
+                    string = re.sub(r"\((.*)\)", " ", string)

                 # längeres leerzeichen normalisieren
                 string = textacy.preprocess.normalize_whitespace(string)
-                string = string.lower().strip()
+                string = string.strip()#.lower()

-                word2synsets[string] = synlist
+                if string != '':
+                    word2synsets[string] = synlist

+    # Build synset2Words
     synset2Words = {}
     template = {"s1": ["w1","w2"]}

     for word,synset in word2synsets.items():
         if word != '':
             for syn in synset:
                 if syn not in synset2Words.keys():
                     synset2Words[syn] = [word]
                 else:
                     synset2Words[syn].append(word)

-    # nach anzhal der wörter in den strings sortieren
-    for synset in word2synsets.values():
-        synset.sort(key=lambda x: len(x.split()))
+    # Sortieren
+    for words in synset2Words.values():
+        words.sort(key=lambda w: len(w.split())) # nach anzhal der wörter in den strings (weniger nach vorne)
+        for w in words:
+            if "(hauptform)" in w:
+                to_insert = re.sub(r"\((.*)\)", " ", w).strip()
+                words.remove(w)
+                words.insert(0, to_insert) # Hauptform evtl. nach vorne

     thesaurus = {}
     thesaurus_template = {"w1" : "mainsyn"}

-    for word,synset in word2synsets.items():
+    # word --> [synset1, synset2, .. ] --> synset1 --> [syn1, syn2, ... ] --> syn1 / mainsyn
+    for word,synsets in word2synsets.items(): #word , [synset1, synset2, .. ]
         try:
-            thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym #todo nach (hauptform) suchen?
+            if "Passwort" in word:
+                x=2
+            first_synset = synsets[0] #erstes synset wählen . praktischer Grund
+            syns = synset2Words[first_synset] # [syn1, syn2, ... ]
+            first_syn = syns[0] # erstes synonym (evtl. Hauptform) wählen
+            word = re.sub(r"\((.*)\)", " ", word).strip() #(hautpform weg)
+            thesaurus[word] = first_syn #Ann.: erstes synonym ist das Hauptsynonym
         except:
             pass

     if returnall:
         return thesaurus, word2synsets, synset2Words
     else:
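A toy walk-through of the three structures this hunk manipulates. The synset id comes from the deWordNet hunk further down in this commit; the synonym "Zugangscode" and the hand-built dict are invented for illustration:

import re

word2synsets = {
    "Passwort (hauptform)": ["de-2177-n"],
    "Kennwort": ["de-2177-n"],
    "Zugangscode": ["de-2177-n"],
}

# synset --> list of words
synset2Words = {}
for word, synsets in word2synsets.items():
    for syn in synsets:
        synset2Words.setdefault(syn, []).append(word)

# sort each synonym list (fewer words first) and move the "(hauptform)" entry to the front
for words in synset2Words.values():
    words.sort(key=lambda w: len(w.split()))
    for w in list(words):                      # iterate over a copy while mutating the list
        if "(hauptform)" in w:
            words.remove(w)
            words.insert(0, re.sub(r"\((.*)\)", " ", w).strip())

# word --> first synset --> first (main) synonym
thesaurus = {}
for word, synsets in word2synsets.items():
    key = re.sub(r"\((.*)\)", " ", word).strip()
    thesaurus[key] = synset2Words[synsets[0]][0]

print(thesaurus["Kennwort"])   # -> "Passwort"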
@@ -237,39 +274,8 @@ def build_words_for_spellchecking(path2words):
 ##################################################################################################
-# THESAURUS
-ressources_path = FILEPATH + "ressources/"
-path2wordnet = ressources_path + config.get("thesaurus","input")
-path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")
-# SPELLCHECKING
-path2words_file = ressources_path + config.get("spellchecking","input")
-path2wordlist = ressources_path + config.get("spellchecking","pickle_file")
-# LEMMA
-path2lemma_file = ressources_path + config.get("lemmatization","input")
-path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")
-# NOMEN
-nouns0 = ressources_path + config.get("nouns","input")
-nouns1 = ressources_path + config.get("nouns","input1")
-nouns2 = ressources_path + config.get("nouns","input2")
-path2nouns_list = ressources_path + config.get("nouns","pickle_file")
-# VORNAMEN
-firstnames_txt = ressources_path + config.get("firstnames","input")
-path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
-# STOPWORDS
-stop1 = ressources_path + config.get("de_stopwords","input1")
-stop2 = ressources_path + config.get("de_stopwords","input2")
-stop3 = ressources_path + config.get("de_stopwords","input3")
-path2stopwordlist_de = ressources_path + config.get("de_stopwords","pickle_file")
-path2stopwordlist_en = ressources_path + config.get("en_stopwords","pickle_file")
@@ -277,44 +283,135 @@ def main():
     start = time.time()
     logprint("Init: {0}".format(datetime.now()))

-    """"""
-    logprint("create and save lemma_dict")
-    lemma_dict = create_lemma_dict(path2lemma_file)
-    save_obj(lemma_dict, path2lemmadict)
-    logprint("Build and save Wordlist for Spellchecking")
-    words = build_words_for_spellchecking(path2words_file)
-    save_obj(words, path2wordlist)
+    ressources_path = FILEPATH + "ressources/"

+    # THESAURUS
     logprint("Build and save Thesaurus")
+    path2wordnet = ressources_path + config.get("thesaurus", "input")
     thesaurus = build_thesaurus_dict(path2wordnet)
+    path2thesaurus_dict = ressources_path + config.get("thesaurus", "pickle_file")
     save_obj(thesaurus, path2thesaurus_dict)

+    # LEMMA
+    logprint("create and save lemma_dict")
+    path2lemma_file = ressources_path + config.get("lemmatization", "input")
+    lemma_dict = create_lemma_dict(path2lemma_file)
+    path2lemmadict = ressources_path + config.get("lemmatization", "pickle_file")
+    save_obj(lemma_dict, path2lemmadict)

+    # SPELLCHECKING
+    logprint("Build and save Wordlist for Spellchecking")
+    path2words_file = ressources_path + config.get("spellchecking", "input")
+    words = build_words_for_spellchecking(path2words_file)
+    path2words_counter = ressources_path + config.get("spellchecking", "pickle_file")
+    save_obj(words, path2words_counter)

+    # STOPWORDS
     logprint("Build and save stoppwortliste")
+    stop1 = ressources_path + config.get("de_stopwords", "input1")
+    stop2 = ressources_path + config.get("de_stopwords", "input2")
+    stop3 = ressources_path + config.get("de_stopwords", "input3")
     de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3)
+    path2stopwordlist_de = ressources_path + config.get("de_stopwords", "pickle_file")
     save_obj(de_stop_words, path2stopwordlist_de)
+    path2stopwordlist_en = ressources_path + config.get("en_stopwords", "pickle_file")
     save_obj(en_stop_words, path2stopwordlist_en)

+    # NOMEN
     logprint("Build and save nomenliste")
-    #nouns = list_from_files(nouns1,nouns2)
-    nouns = list_from_files(nouns0)
+    nouns0 = ressources_path + config.get("nouns", "input")
+    nouns1 = ressources_path + config.get("nouns", "input1")
+    nouns2 = ressources_path + config.get("nouns", "input2")
+    nouns = list_from_files(nouns0,nouns1,nouns2)
+    path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
     save_obj(nouns, path2nouns_list)

+    # VORNAMEN
     logprint("Build and save firstnameslist")
+    firstnames_txt = ressources_path + config.get("firstnames", "input")
     vornamen = list_from_files(firstnames_txt)
+    path2firstnameslist = ressources_path + config.get("firstnames", "pickle_file")
     save_obj(vornamen, path2firstnameslist)

     end = time.time()
     logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60))

View File

@@ -30,15 +30,13 @@ start = time.time()
 # todo modelle testen
-# todo ticket2kbkeys, subj, cats in init.py

 logprint("main.py started at {}".format(datetime.now()))

-init.main()
+#init.main()
 logprint("")

 raw_corpus = corporization.main()

View File

@@ -217,7 +217,6 @@ def save_corpus(corpus, corpus_path, corpus_name):
     :param corpus_path: str
     :param corpus_name: str (should content the language like "_de_")
     """
-    #todo pos und ner tagging speichern
     # save parser
     parser = corpus.spacy_lang

View File

@@ -126,7 +126,7 @@ def remove_first_names():
 def remove_addresses(string):
     pass # todo remove_addresses idee postal.parser und zu metadaten hinzufügen

-def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
+def lemmatizeWord(word,lemma_dict=LEMMAS,n=5):
     for i in range(n):
         try:
             word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
@@ -134,26 +134,29 @@ def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
             print(word)
     return word

 def getFirstSynonym(word, thesaurus=THESAURUS, n=3):
     for i in range(n):
         try:
-            word = thesaurus[word.lower()] if word.lower() in thesaurus.keys() else word.lower()
+            if word in thesaurus.keys():
+                return thesaurus[word]
+            elif word.title() in thesaurus.keys():
+                return thesaurus[word.title()]
+            elif word.lower() in thesaurus.keys():
+                return thesaurus[word.lower()]
+            else:
+                return word
         except:
-            print(word)
-    return word
-    """
-    if not isinstance(word, str):
-        return str(word)
-    word = word.lower()
-    if word in thesaurus.keys():
-        return thesaurus[word]
-    else:
-        return str(word)
-    """
+            print("THESAURUSFEHLER BEI: {}".format(word))
+            return word

 ########################## Spellchecking ##########################################
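The new lookup order in getFirstSynonym (exact form, then Title case, then lower case) in a compact stand-alone form; a sketch, not the module's code:

def first_synonym(word, thesaurus):
    # try the exact form, then Title case, then lower case; otherwise keep the word
    for candidate in (word, word.title(), word.lower()):
        if candidate in thesaurus:
            return thesaurus[candidate]
    return word

print(first_synonym("kennwort", {"Kennwort": "Passwort"}))   # -> "Passwort", found via .title()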
@@ -328,6 +331,15 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path")
 def extract_from_corpus(corpus):
+    """
+    Extract from each doc from a corpus a string containing disired token_texts
+    :param corpus: textacy.Corpus
+    :return: string-gen
+    """
+
+    # WHITELIST erstellen. Enthält zumindest die evtuellen Topics
     WHITELIST = ["boss", "sap", "firefox"] #todo autogenerierung relv. techn. begriffe
@@ -337,6 +349,7 @@ def extract_from_corpus(corpus):
     WHITELIST = WHITELIST + kb_cats + kb_keys + kb_subjs

     THESAURUS = load_obj(path2thesaurus_dict)
     #WORDS = load_obj(path2wordsdict)
     LEMMAS = load_obj(path2lemmadict)
@@ -344,6 +357,9 @@ def extract_from_corpus(corpus):
     #EN_STOP_WORDS = load_obj(path2ENstopwordlist)
     VORNAMEN = load_obj(path2firstnameslist)

+    ents_boss = []
+    ents_sap = []
+
     for doc in corpus:
         result = []
@@ -353,10 +369,16 @@ def extract_from_corpus(corpus):
         for tok in doc:

-            if tok.lower_ =="boss" or tok.lower_ =="sap":
-                print(tok.lower_+": "+tok.ent_type_)
+            """
+            if tok.lower_ =="boss":
+                ents_boss.append(tok.ent_type_)
+            if tok.lower_ =="sap":
+                ents_sap.append(tok.ent_type_)
+            """
+
+            # wenn in whitelist, direkt übernehmen
             if tok.lower_ in WHITELIST:
                 result.append(tok.lower_)
@@ -372,25 +394,27 @@ def extract_from_corpus(corpus):
                     or tok.lower_ in VORNAMEN:
                 continue

-            # cut after footer
-            if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]: # fehler schneidet bei INC40506 das meiste weg
-                break
+            # boss/SAP ent_type = 'ORG' oder '' (ein-weimal LOC oder PERSON)
+
+            # cut after footer
+            if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]: # fehler schneidet bei zB INC40506 das meiste weg
+                break

             if tok.pos_ in ["NOUN"] \
-                    or tok.ent_type_ in ["NORP","FACILITY","ORG","PRODUCT","WORK_OF_ART"]:
+                    or tok.ent_type_ in ["NORP","FACILITY","ORG","PRODUCT","WORK_OF_ART","LOC"]:
                 #or tok.dep_ == "ROOT":
                 # or tok.lower_ in NOUNS \ #,"PERSON"] \

                 toktext = tok.lower_

                 toktext = lemmatized_word

+                # hauptsynonym bilden idee zwar das Huaptsyn bilden und zählen aber die originalen wörter in den llda algo geben
                 """
                 first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS)
-                if first_synonym is not None:
+                if first_synonym is not None or first_synonym != '':
                     toktext = first_synonym if len(first_synonym.split()) == 1 else toktext
                 """
@@ -402,6 +426,14 @@ def extract_from_corpus(corpus):
         yield " ".join(result)

+    """
+    print(list(set(ents_sap)))
+    ['', 'ORG', 'PERSON', 'LOC']
+
+    print(list(set(ents_boss)))
+    ['', 'ORG', 'PERSON', 'LOC']
+    """
@@ -433,6 +465,9 @@ def preprocessCorpus(corpus, clean_in_meta):
     )

+    # idee labeled_lines.txt enthählt bigramme mit unterstrich
+    # todo preCorpus weg. llda bekommt labaled_lines.txt und lda doctermamtrix
+
     # leere docs aus corpi kicken
     pre_corpus.remove(lambda doc: len(doc) == 0)

View File

@@ -71439,7 +71439,7 @@
         </Sense>
     </LexicalEntry>
     <LexicalEntry id="w10531">
-        <Lemma writtenForm="Passwort" partOfSpeech="n"/>
+        <Lemma writtenForm="Passwort (Hauptform)" partOfSpeech="n"/>
         <Sense id="w10531_2177-n" synset="de-2177-n">
         </Sense>
     </LexicalEntry>

View File

@@ -1,3 +1,5 @@
+kennwort kennworts
+kennwort kennwortes
 a as
 aachen aachens
 aal aale

test.py

File diff suppressed because one or more lines are too long

View File

@@ -571,14 +571,14 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
             count_dict[kb] = 1

     sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))

+    """
     for k,v in sorted_dict:
         subs = kb2subjects_dict[k]
         keys = kb2keywords_dict[k]
         print(subs, keys , v) # frage wieviele tickets pro topic?

     print("kb_entrys used: {}".format(len(sorted_dict))) # frage wie viele kb_entry's insg genutzt?: 155
+    """

     labelist = ticket2keywords_dict.values()
@@ -644,7 +644,7 @@ def load_from_labled_lines(path):
     #idee plan
     # clean laden, pre laden
-    # unigramme und num/wort-bigramme doc-term # frage wie geht llda mit bigrammen um? idee notfalls bigramme als geklammerte "wörter"
+    # unigramme und num/wort-bigramme doc-term # frage wie geht llda mit bigrammen um? idee bigramme mit _ verbinden
     # nimm nur ngrams wo midn. ein token in pre vorkommt
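The comments above sketch the plan: the labeled-lines file carries bigrams joined with an underscore so the LLDA sees each bigram as a single token. A hedged illustration; the bracketed label prefix is only an assumed input convention for a JGibbs-style labeled LDA, not taken from this repo:

tokens = ["sap", "passwort", "zuruecksetzen"]
bigrams = ["_".join(pair) for pair in zip(tokens, tokens[1:])]
print(bigrams)   # -> ['sap_passwort', 'passwort_zuruecksetzen']

labels = ["passwort", "sap"]
print("[ " + " ".join(labels) + " ] " + " ".join(tokens + bigrams))
# one labeled line: labels up front, unigrams and underscore-bigrams as the document text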