From 4fe12679fbe19274594199a6163c87e249fc272d Mon Sep 17 00:00:00 2001
From: "jannis.grundmann"
Date: Thu, 12 Oct 2017 15:57:56 +0200
Subject: [PATCH] improved runtime of the thesaurus construction

---
 corporization.py |  47 ++++------------
 init.py          | 119 ++++++++++++++++++++++++++++++++++------
 preprocessing.py | 140 +++++++++++++++++++++++++++++++++--------------
 testra.py        | 126 +++++++++++++++++++++++++++++++++++-------
 4 files changed, 317 insertions(+), 115 deletions(-)

diff --git a/corporization.py b/corporization.py
index 00e958b..06e0bdb 100644
--- a/corporization.py
+++ b/corporization.py
@@ -33,7 +33,6 @@ path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-E
 content_collumn_name = "Description"

 metaliste = [
-
     "TicketNumber",
     "Subject",
     "CreatedDate",
@@ -46,15 +45,18 @@ metaliste = [
     "Solution"
 ]

+
+
+
 corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"

-corpus_name = "de_raw_corpus"
+corpus_name = "de_raw_ticketCorpus"

 logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"


-# todo configuration file ?
+# todo configuration file
 """
 config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
@@ -98,7 +100,7 @@ def printRandomDoc(textacyCorpus):



-def csv_to_textStream(path2csv: str, content_collumn_name: str):
+def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
     """
     :param path2csv: string
     :param content_collumn_name: string
@@ -117,7 +119,7 @@ def csv_to_textStream(path2csv: str, content_collumn_name: str):
             yield lst[content_collumn]


-def csv_to_DictStream(path2csv: str, metalist: [str]):
+def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
     """
     :param path2csv: string
     :param metalist: list of strings
@@ -155,7 +157,8 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):
     """

     # save parser
-    parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
+    parserpath = corpus_path + str(parser.lang) + '_parser'
+    parser.save_to_directory(parserpath)

     # save content
     contentpath = corpus_path + corpus_name + "_content.bin"
@@ -171,34 +174,6 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):


-def cleanTextstream(textstream):
-    """
-    :param textstream: string-gen
-    :param parser: spacy-parser
-    :yield: string-gen
-    """
-
-    for txt in textstream:
-        yield textacy.preprocess.normalize_whitespace(txt)
-
-
-def cleanDictstream(dictstream):
-    """
-    :param dictstream: dict-gen
-    :param parser: spacy-parser
-    :yield: dict-gen
-    """
-
-    for dic in dictstream:
-
-        result = {}
-
-        for key, value in dic.items():
-            result[key] = textacy.preprocess.normalize_whitespace(value)
-        yield result
-
-
 def main():
     printlog("Corporization: {0}".format(datetime.now()))
@@ -222,8 +197,8 @@ def main():

     printlog("Add texts to textacy-corpus")
     de_corpus.add_texts(
-        cleanTextstream(csv_to_textStream(path2de_csv, content_collumn_name)),
-        cleanDictstream(csv_to_DictStream(path2de_csv, metaliste))
+        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
+        ticket_csv_to_DictStream(path2de_csv, metaliste)
     )

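Reviewer note (not part of the patch): with cleanTextstream/cleanDictstream removed above, main() now hands the two renamed generators straight to de_corpus.add_texts(), which consumes text and metadata pairwise. Below is a minimal, self-contained sketch of that two-generator CSV streaming pattern; the file name tickets.csv, the column names and the consuming loop are illustrative assumptions, not code from the patch, and the real script's CSV dialect may differ.

import csv

def text_stream(path, text_column):
    # yield one ticket text per CSV row
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            yield row[text_column]

def meta_stream(path, meta_columns):
    # yield the matching metadata dict for the same row
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            yield {col: row[col] for col in meta_columns}

if __name__ == "__main__":
    texts = text_stream("tickets.csv", "Description")               # hypothetical file and column
    metas = meta_stream("tickets.csv", ["TicketNumber", "Subject"])
    for text, meta in zip(texts, metas):                             # add_texts() pairs the streams the same way
        print(meta["TicketNumber"], text[:40])
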
+""" def build_thesaurus(path2lexicalentries, path2synsets): lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8")) syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8")) @@ -134,6 +135,94 @@ def build_thesaurus(path2lexicalentries, path2synsets): return thesaurus #todo thesaurus in dictionary +""" + +def build_thesaurus(path2lexicalentries):#, path2synsets): + lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8")) + #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8")) + + lexroot = lextree.getroot() + #synroot = syntree.getroot() + + + word2synsets = {} + template = {"w1": ["s1", "s2"]} + + for ro in lexroot: + for elem in ro: + if elem.tag == "LexicalEntry": + lex_dictlist = [subentry.attrib for subentry in elem] + + + + synlist = [] + string = "WORD" + + for lex_dict in lex_dictlist: + if "synset" in lex_dict.keys(): + + synset = lex_dict["synset"] + synlist.append(synset) + + if 'writtenForm' in lex_dict.keys(): + string = (lex_dict["writtenForm"]) + + # replaceRockDots + string = re.sub(r'[ß]', "ss", string) + string = re.sub(r'[ö]', "oe", string) + string = re.sub(r'[ü]', "ue", string) + string = re.sub(r'[ä]', "ae", string) + + # alle punkte raus + string = re.sub(r'[.]', "", string) + + # alles in klammern raus + string = re.sub(r"\((.*)\)", " ", string) + + # längeres leerzeichen normalisieren + string = textacy.preprocess.normalize_whitespace(string) + + string = string.lower().strip() + + word2synsets[string] = synlist + + synset2Words = {} + template = {"s1": ["w1","w2"]} + + for word,synset in word2synsets.items(): + for syn in synset: + if syn not in synset2Words.keys(): + synset2Words[syn] = [word] + else: + synset2Words[syn].append(word) + + # nach anzhal der wörter in den strings sortieren + for synset in word2synsets.values(): + synset.sort(key=lambda x: len(x.split())) + + thesaurus = {} + thesaurus_template = {"w1" : "mainsyn"} + + for word,synset in word2synsets.items(): + try: + thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym + except: + pass + return thesaurus + + """ + for r in synroot: + for element in r: + + if element.tag == "Synset": + synset = [] + attrib = element.attrib + id = attrib["id"] + + if id not in synset2Words.keys(): + synset2Words[id] = "WORD" + """ + def create_stopwordlist(): @@ -151,7 +240,7 @@ def create_stopwordlist(): de_stop_words3 = list(map(replaceRockDots(),list(__import__("spacy." 
diff --git a/preprocessing.py b/preprocessing.py
index 0bd5e73..6ae8ccc 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -38,6 +38,45 @@ csv.field_size_limit(sys.maxsize)


+import pickle
+
+def save_obj(obj, path):
+    with open(path + '.pkl', 'wb') as f:
+        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
+
+def load_obj(path ):
+    with open(path + '.pkl', 'rb') as f:
+        return pickle.load(f)
+
+
+def load_corpus(corpus_path, corpus_name, lang="de"):
+
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    metapath = corpus_path + corpus_name + "_meta.json"
+
+    #load parser
+    parserpath = corpus_path + str(lang) + '_parser'
+    parser = spacy.load(parserpath)
+
+    corpus = textacy.Corpus(parser)
+
+
+    metadata_stream = textacy.fileio.read_json_lines(metapath)
+    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+        corpus.add_doc(
+            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+    return corpus
+
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_name = "de_raw_ticketCorpus"
+
+print(load_corpus(corpus_path,corpus_name))
+
+
+
+
 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"

@@ -63,36 +102,31 @@ logging.basicConfig(filename=logfile, level=logging.INFO)


 # THESAURUS
+path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
+THESAURUS = load_obj(path2thesaurusdict)

-# thesauruspath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv"
-# thesauruspath = config.get("filepath","thesauruspath")
-# THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
-lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
-synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"

 # SPELLCHECKING
-path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
+path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
+path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
+path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
+path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
+path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
+
+
+# SPELLCHECKING
+
+
+parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')

 DE_PARSER = spacy.load("de")
 EN_PARSER = spacy.load("en")

-
-"""
-de_stop_words= set(
-    list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
-    list(textacy.fileio.read_file_lines("stopwords-de.txt"))
-)
-
-
-LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))
-
-VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
-"""
-
 de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
     "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german')))

@@ -126,15 +160,7 @@ specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORE

 hardSFinder = re.compile(r'[ß]', re.IGNORECASE)

-import pickle

-def save_obj(obj, path):
-    with open(path + '.pkl', 'wb') as f:
-        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
-
-def load_obj(path ):
-    with open(path + '.pkl', 'rb') as f:
-        return pickle.load(f)

 def printlog(string, level="INFO"):
     """log and prints"""
     print(string)
@@ -238,21 +264,6 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):

             yield metadata

-def save_corpus(corpus, corpus_path, corpus_name, parser=DE_PARSER):
-
-    # save stringstore
-    stringstore_path = corpus_path + corpus_name + '_strings.json'
-    with open(stringstore_path, "w") as file:
-        parser.vocab.strings.dump(file)
-
-    # save content
-    contentpath = corpus_path + corpus_name + "_content.bin"
-    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
-
-    # save meta
-    metapath = corpus_path + corpus_name + "_meta.json"
-    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
-

 ############# filter tokens

@@ -751,6 +762,51 @@ def filterTokens(tokens, funclist):

     return tokens

+def cleanString(string):
+    # replaceRockDots
+    string = re.sub(r'[ß]', "ss", string)
+    string = re.sub(r'[ö]', "oe", string)
+    string = re.sub(r'[ü]', "ue", string)
+    string = re.sub(r'[ä]', "ae", string)
+
+
+    # normalize longer whitespace
+    string = textacy.preprocess.normalize_whitespace(string)
+
+    return(string)
+
+def normalizeTextStream(textstream,clean=False):
+    """
+    :param textstream: string-gen
+    :param parser: spacy-parser
+    :yield: string-gen
+    """
+
+    for txt in textstream:
+        if clean:
+            yield cleanString(txt)
+        else:
+            yield textacy.preprocess.normalize_whitespace(txt)
+
+def nomalizeDictstream(dictstream, clean=False):
+    """
+    :param dictstream: dict-gen
+    :param parser: spacy-parser
+    :yield: dict-gen
+    """
+
+    for dic in dictstream:
+
+        result = {}
+
+        for key, value in dic.items():
+            if clean:
+                result[key] = cleanString(value)
+            else:
+                result[key] = textacy.preprocess.normalize_whitespace(value)
+        yield result
+
+
 custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
                 "hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb", "helfen",

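Reviewer note (not part of the patch): cleanString() and the two stream normalizers added above fold the lowercase umlauts and ß to ASCII digraphs and collapse whitespace before parsing. A standalone illustration of that behaviour follows; the whitespace regex stands in for textacy.preprocess.normalize_whitespace so the snippet runs without textacy, and the sample strings are invented.

import re

def clean_string(s):
    # same lowercase umlaut/ß folding as cleanString() in the patch
    s = re.sub(r'[ß]', "ss", s)
    s = re.sub(r'[ö]', "oe", s)
    s = re.sub(r'[ü]', "ue", s)
    s = re.sub(r'[ä]', "ae", s)
    return re.sub(r'\s+', " ", s).strip()   # rough stand-in for normalize_whitespace

def normalize_text_stream(texts, clean=False):
    # mirrors normalizeTextStream(clean=...) from the patch
    for txt in texts:
        yield clean_string(txt) if clean else re.sub(r'\s+', " ", txt).strip()

print(list(normalize_text_stream(["Grüße  aus   Köln", "Passwort   zurücksetzen"], clean=True)))
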
diff --git a/testra.py b/testra.py
index f7398da..c13dcf2 100644
--- a/testra.py
+++ b/testra.py
@@ -5,6 +5,7 @@ import json

 import spacy
 import textacy
+from functools import reduce

 start = time.time()

@@ -52,6 +53,8 @@ corpus.add_texts(
 print(corpus)
 """

+
+
 import pickle

 def save_obj(obj, path):
@@ -63,31 +66,122 @@ def load_obj(path ):
         return pickle.load(f)


+# THESAURUS
+lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
+lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
+synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"

-lemmalist = list(map(textacy.preprocess.normalize_whitespace,
-                     list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))))
-
-lemma_dict = {}
+def build_thesaurus(path2lexicalentries):#, path2synsets):
+    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
+    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

-for line in lemmalist:
+    lexroot = lextree.getroot()
+    #synroot = syntree.getroot()

-    lem_word_pair = line.split()
+    word2synsets = {}
+    template = {"w1": ["s1", "s2"]}

-    lemma = lem_word_pair[0].strip().lower()
+    for ro in lexroot:
+        for elem in ro:
+            if elem.tag == "LexicalEntry":
+                lex_dictlist = [subentry.attrib for subentry in elem]

-    word = lem_word_pair[1].strip().lower()
-
-    lemma_dict[word] = lemma
+                synlist = []
+                string = "WORD"

-print(lemma_dict["abbekomme"])
+                for lex_dict in lex_dictlist:
+                    if "synset" in lex_dict.keys():

-save_obj(lemma_dict, "test_dictionies")
+                        synset = lex_dict["synset"]
+                        synlist.append(synset)

-loaded = load_obj("test_dictionies")
+                    if 'writtenForm' in lex_dict.keys():
+                        string = (lex_dict["writtenForm"])
-print(loaded["abbekomme"])
+
+                        # replaceRockDots
+                        string = re.sub(r'[ß]', "ss", string)
+                        string = re.sub(r'[ö]', "oe", string)
+                        string = re.sub(r'[ü]', "ue", string)
+                        string = re.sub(r'[ä]', "ae", string)
+
+                        # strip all periods
+                        string = re.sub(r'[.]', "", string)
+
+                        # strip everything in parentheses
+                        string = re.sub(r"\((.*)\)", " ", string)
+
+                        # normalize longer whitespace
+                        string = textacy.preprocess.normalize_whitespace(string)
+
+                        string = string.lower().strip()
+
+                word2synsets[string] = synlist
+
+    synset2Words = {}
+    template = {"s1": ["w1","w2"]}
+
+    for word,synset in word2synsets.items():
+        for syn in synset:
+            if syn not in synset2Words.keys():
+                synset2Words[syn] = [word]
+            else:
+                synset2Words[syn].append(word)
+
+    # sort by the number of words in the strings
+    for synset in word2synsets.values():
+        synset.sort(key=lambda x: len(x.split()))
+
+    thesaurus = {}
+    thesaurus_template = {"w1" : "mainsyn"}
+
+    for word,synset in word2synsets.items():
+        try:
+            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
+        except:
+            pass
+    return thesaurus
+
+    """
+    for r in synroot:
+        for element in r:
+
+            if element.tag == "Synset":
+                synset = []
+                attrib = element.attrib
+                id = attrib["id"]
+
+                if id not in synset2Words.keys():
+                    synset2Words[id] = "WORD"
+    """
+
+
+def load_corpus(corpus_path, corpus_name, lang="de"):
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    metapath = corpus_path + corpus_name + "_meta.json"
+
+    # load parser
+    parserpath = corpus_path + str(lang) + '_parser'
+    parser = spacy.load(parserpath)
+
+    corpus = textacy.Corpus(parser)
+
+    metadata_stream = textacy.fileio.read_json_lines(metapath)
+    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+        corpus.add_doc(
+            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+    return corpus
+
+#todo load corpus from file; idea: load the stringstore and vocab
+
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_name = "de_raw_ticketCorpus"
+
+print(load_corpus(corpus_path, corpus_name))

 """
 from postal.parser import parse_address

 address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder-

 print(parse_address(address))
 """

 """
 corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
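Reviewer note (not part of the patch): the rewritten build_thesaurus() builds two plain dicts, word2synsets and its inversion synset2Words, and then resolves every word to the first word of its first synset with constant-time lookups, which is presumably where the runtime improvement named in the subject line comes from. A toy illustration of that inversion, using invented words and synset ids instead of the parsed lexicalentries.xml input:

# invented words and synset ids, standing in for the parsed lexical entries
word2synsets = {
    "rechner": ["s1"],
    "computer": ["s1"],
    "pc": ["s1", "s2"],
}

# invert the mapping: synset id -> all words that belong to it
synset2words = {}
for word, synsets in word2synsets.items():
    for syn in synsets:
        synset2words.setdefault(syn, []).append(word)

# pick the first word of each word's first synset as its "main" synonym,
# mirroring the assumption noted in the patch
thesaurus = {}
for word, synsets in word2synsets.items():
    if synsets and synset2words.get(synsets[0]):
        thesaurus[word] = synset2words[synsets[0]][0]

print(thesaurus)   # e.g. {'rechner': 'rechner', 'computer': 'rechner', 'pc': 'rechner'}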