Improved thesaurus creation runtime

This commit is contained in:
jannis.grundmann 2017-10-12 15:57:56 +02:00
parent 93e239756c
commit 4fe12679fb
4 changed files with 317 additions and 115 deletions


@ -33,7 +33,6 @@ path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-E
content_collumn_name = "Description"
metaliste = [
"TicketNumber",
"Subject",
"CreatedDate",
@ -46,15 +45,18 @@ metaliste = [
"Solution"
]
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_corpus"
corpus_name = "de_raw_ticketCorpus"
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
# todo configuration file ?
# todo configuration file
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
@ -98,7 +100,7 @@ def printRandomDoc(textacyCorpus):
def csv_to_textStream(path2csv: str, content_collumn_name: str):
def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
@ -117,7 +119,7 @@ def csv_to_textStream(path2csv: str, content_collumn_name: str):
        yield lst[content_collumn]

def csv_to_DictStream(path2csv: str, metalist: [str]):
def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
@ -155,7 +157,8 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):
"""
# save parser
parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
parserpath = corpus_path + str(parser.lang) + '_parser'
parser.save_to_directory(parserpath)
# save content
contentpath = corpus_path + corpus_name + "_content.bin"
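For orientation (not part of the commit): save_corpus writes three artifacts that the new load_corpus added later in this diff reads back. A minimal sketch of the on-disk layout, with the paths taken from the surrounding code and "de" assumed for parser.lang:

# Hypothetical overview of the files written by save_corpus / read by load_corpus
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

parserpath  = corpus_path + "de" + '_parser'                # spaCy parser directory (parser.save_to_directory)
contentpath = corpus_path + corpus_name + "_content.bin"    # serialized spaCy docs (write_spacy_docs)
metapath    = corpus_path + corpus_name + "_meta.json"      # one JSON line of metadata per doc (write_json_lines)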
@ -171,34 +174,6 @@ def save_corpus(corpus, corpus_path, corpus_name, parser):
def cleanTextstream(textstream):
    """
    :param textstream: string-gen
    :param parser: spacy-parser
    :yield: string-gen
    """

    for txt in textstream:
        yield textacy.preprocess.normalize_whitespace(txt)

def cleanDictstream(dictstream):
    """
    :param dictstream: dict-gen
    :param parser: spacy-parser
    :yield: dict-gen
    """

    for dic in dictstream:
        result = {}

        for key, value in dic.items():
            result[key] = textacy.preprocess.normalize_whitespace(value)
        yield result
def main():
    printlog("Corporization: {0}".format(datetime.now()))

@ -222,8 +197,8 @@ def main():
    printlog("Add texts to textacy-corpus")
    de_corpus.add_texts(
        cleanTextstream(csv_to_textStream(path2de_csv, content_collumn_name)),
        cleanDictstream(csv_to_DictStream(path2de_csv, metaliste))
        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2de_csv, metaliste)
    )
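As context, not part of the commit: add_texts pairs the two generators positionally, so the text stream and the metadata stream must walk the CSV in the same row order. A minimal sketch with made-up ticket rows standing in for the CSV-backed generators:

# Hypothetical stand-ins for ticketcsv_to_textStream / ticket_csv_to_DictStream
def texts():
    yield "Drucker druckt nicht mehr"   # Description of row 1
    yield "Passwort vergessen"          # Description of row 2

def metadata():
    yield {"TicketNumber": "INC0001", "Subject": "Drucker"}
    yield {"TicketNumber": "INC0002", "Subject": "Passwort"}

# document i receives metadata dict i, exactly as de_corpus.add_texts(...) pairs them
for text, meta in zip(texts(), metadata()):
    print(meta["TicketNumber"], "->", text)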

init.py (119 changed lines)

@ -26,6 +26,7 @@ with open(config_ini) as f:
config.read_file(f)
"""
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"
# config logging
@ -80,7 +81,7 @@ def create_lemma_dict(lemmalist):
return lemma_dict
"""
def build_thesaurus(path2lexicalentries, path2synsets):
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
@ -134,6 +135,94 @@ def build_thesaurus(path2lexicalentries, path2synsets):
return thesaurus
#todo thesaurus in dictionary
"""
def build_thesaurus(path2lexicalentries):#, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    #synroot = syntree.getroot()

    word2synsets = {}
    template = {"w1": ["s1", "s2"]}

    for ro in lexroot:
        for elem in ro:
            if elem.tag == "LexicalEntry":
                lex_dictlist = [subentry.attrib for subentry in elem]

                synlist = []
                string = "WORD"

                for lex_dict in lex_dictlist:
                    if "synset" in lex_dict.keys():
                        synset = lex_dict["synset"]
                        synlist.append(synset)

                    if 'writtenForm' in lex_dict.keys():
                        string = (lex_dict["writtenForm"])

                        # replaceRockDots
                        string = re.sub(r'[ß]', "ss", string)
                        string = re.sub(r'[ö]', "oe", string)
                        string = re.sub(r'[ü]', "ue", string)
                        string = re.sub(r'[ä]', "ae", string)

                        # remove all periods
                        string = re.sub(r'[.]', "", string)

                        # remove anything in parentheses
                        string = re.sub(r"\((.*)\)", " ", string)

                        # normalize longer runs of whitespace
                        string = textacy.preprocess.normalize_whitespace(string)

                        string = string.lower().strip()

                word2synsets[string] = synlist

    synset2Words = {}
    template = {"s1": ["w1","w2"]}

    for word,synset in word2synsets.items():
        for syn in synset:
            if syn not in synset2Words.keys():
                synset2Words[syn] = [word]
            else:
                synset2Words[syn].append(word)

    # sort by the number of words in the strings
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))

    thesaurus = {}
    thesaurus_template = {"w1" : "mainsyn"}

    for word,synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
        except:
            pass  # skip words whose first synset has no entry

    return thesaurus

"""
for r in synroot:
    for element in r:
        if element.tag == "Synset":
            synset = []

            attrib = element.attrib
            id = attrib["id"]

            if id not in synset2Words.keys():
                synset2Words[id] = "WORD"
"""
def create_stopwordlist():
@ -151,7 +240,7 @@ def create_stopwordlist():
de_stop_words3 = list(map(replaceRockDots(),list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))
de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("stopwords-de.txt"))))
de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt"))))
de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))
@ -172,34 +261,29 @@ def words(text): return re.findall(r'\w+', text.lower())
##################################################################################################
# goal: dictionaries for thesaurus, correct-word list and lemmas as loadable .json
# goal: dictionaries for thesaurus, correct-word list and lemmas as loadable files
# plus a clean stop-word list and a noun list
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl"
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list.pkl"
path2thesauruslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list.pkl"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl"
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
@ -235,11 +319,10 @@ def main():
printlog("Build and save Thesaurus")
THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries, path2synsets=synsets)
print(THESAURUS[0:10])
THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries)
save_obj(THESAURUS, path2thesauruslist)
save_obj(THESAURUS, path2thesaurusdict)


@ -38,6 +38,45 @@ csv.field_size_limit(sys.maxsize)
import pickle
def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path ):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)

def load_corpus(corpus_path, corpus_name, lang="de"):
    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    # load parser
    parserpath = corpus_path + str(lang) + '_parser'
    parser = spacy.load(parserpath)

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

print(load_corpus(corpus_path,corpus_name))
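A brief usage note, not part of the diff: save_obj/load_obj round-trip arbitrary Python objects through pickle, which is how the thesaurus dict built in init.py is reloaded as THESAURUS a few lines further down. A minimal, self-contained sketch (the file path is made up):

import pickle

def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)

# hypothetical round trip with a small dict in place of the real thesaurus
save_obj({"rechner": "rechner", "computer": "rechner"}, "/tmp/thesaurus_demo")
print(load_obj("/tmp/thesaurus_demo")["computer"])  # -> "rechner"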
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
@ -63,36 +102,31 @@ logging.basicConfig(filename=logfile, level=logging.INFO)
# THESAURUS
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
THESAURUS = load_obj(path2thesaurusdict)
# thesauruspath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv"
# thesauruspath = config.get("filepath","thesauruspath")
# THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
# SPELLCHECKING
parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')
DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")
"""
de_stop_words= set(
list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
list(textacy.fileio.read_file_lines("stopwords-de.txt"))
)
LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))
VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
"""
de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german')))
@ -126,15 +160,7 @@ specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORE
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)
import pickle
def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path ):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)

def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
@ -238,21 +264,6 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):
yield metadata
def save_corpus(corpus, corpus_path, corpus_name, parser=DE_PARSER):
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
############# filter tokens
@ -751,6 +762,51 @@ def filterTokens(tokens, funclist):
return tokens
def cleanString(string):
    # replaceRockDots
    string = re.sub(r'[ß]', "ss", string)
    string = re.sub(r'[ö]', "oe", string)
    string = re.sub(r'[ü]', "ue", string)
    string = re.sub(r'[ä]', "ae", string)

    # normalize longer runs of whitespace
    string = textacy.preprocess.normalize_whitespace(string)

    return(string)
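A side note rather than part of the commit: since cleanString runs over every text and metadata value, the four re.sub calls could be collapsed into one translation table. A minimal sketch of that alternative (lowercase umlauts/ß only, matching the regexes above):

# hypothetical faster variant of the "rock dots" replacement, not from the commit
ROCKDOTS = str.maketrans({"ß": "ss", "ö": "oe", "ü": "ue", "ä": "ae"})

def replace_rockdots_fast(s):
    return s.translate(ROCKDOTS)

print(replace_rockdots_fast("größe prüfung"))  # -> "groesse pruefung"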
def normalizeTextStream(textstream,clean=False):
    """
    :param textstream: string-gen
    :param parser: spacy-parser
    :yield: string-gen
    """

    for txt in textstream:
        if clean:
            yield cleanString(txt)
        else:
            yield textacy.preprocess.normalize_whitespace(txt)

def nomalizeDictstream(dictstream, clean=False):
    """
    :param dictstream: dict-gen
    :param parser: spacy-parser
    :yield: dict-gen
    """

    for dic in dictstream:
        result = {}

        for key, value in dic.items():
            if clean:
                result[key] = cleanString(value)
            else:
                result[key] = textacy.preprocess.normalize_whitespace(value)
        yield result
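For context, an assumption about the intended call site rather than code from the diff: these generators wrap the raw CSV streams before they reach the corpus, so whitespace is collapsed and umlauts/ß are flattened on the way in. A small sketch using the functions defined directly above, with made-up ticket rows:

# hypothetical stand-ins for the CSV-backed generators
def raw_texts():
    yield "Die Größe   stimmt nicht."

def raw_dicts():
    yield {"Subject": "Drucker   defekt", "TicketNumber": "INC0001"}

texts = normalizeTextStream(raw_texts(), clean=True)
metas = nomalizeDictstream(raw_dicts(), clean=True)

for t, m in zip(texts, metas):
    print(m["Subject"], "|", t)  # -> "Drucker defekt | Die Groesse stimmt nicht."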
custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
"hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb", "helfen",

testra.py (126 changed lines)

@ -5,6 +5,7 @@ import json
import spacy
import textacy
from functools import reduce
start = time.time()
@ -52,6 +53,8 @@ corpus.add_texts(
print(corpus)
"""
import pickle
def save_obj(obj, path):
@ -63,31 +66,122 @@ def load_obj(path ):
return pickle.load(f)
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
lemmalist = list(map(textacy.preprocess.normalize_whitespace,
                     list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))))
lemma_dict = {}

for line in lemmalist:
    lem_word_pair = line.split()
    lemma = lem_word_pair[0].strip().lower()
    word = lem_word_pair[1].strip().lower()
    lemma_dict[word] = lemma

print(lemma_dict["abbekomme"])

save_obj(lemma_dict, "test_dictionies")

loaded = load_obj("test_dictionies")

print(loaded["abbekomme"])

def build_thesaurus(path2lexicalentries):#, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    #synroot = syntree.getroot()

    word2synsets = {}
    template = {"w1": ["s1", "s2"]}

    for ro in lexroot:
        for elem in ro:
            if elem.tag == "LexicalEntry":
                lex_dictlist = [subentry.attrib for subentry in elem]

                synlist = []
                string = "WORD"

                for lex_dict in lex_dictlist:
                    if "synset" in lex_dict.keys():
                        synset = lex_dict["synset"]
                        synlist.append(synset)

                    if 'writtenForm' in lex_dict.keys():
                        string = (lex_dict["writtenForm"])

                        # replaceRockDots
                        string = re.sub(r'[ß]', "ss", string)
                        string = re.sub(r'[ö]', "oe", string)
                        string = re.sub(r'[ü]', "ue", string)
                        string = re.sub(r'[ä]', "ae", string)

                        # remove all periods
                        string = re.sub(r'[.]', "", string)

                        # remove anything in parentheses
                        string = re.sub(r"\((.*)\)", " ", string)

                        # normalize longer runs of whitespace
                        string = textacy.preprocess.normalize_whitespace(string)

                        string = string.lower().strip()

                word2synsets[string] = synlist

    synset2Words = {}
    template = {"s1": ["w1","w2"]}

    for word,synset in word2synsets.items():
        for syn in synset:
            if syn not in synset2Words.keys():
                synset2Words[syn] = [word]
            else:
                synset2Words[syn].append(word)

    # sort by the number of words in the strings
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))

    thesaurus = {}
    thesaurus_template = {"w1" : "mainsyn"}

    for word,synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
        except:
            pass  # skip words whose first synset has no entry

    return thesaurus
"""
for r in synroot:
for element in r:
if element.tag == "Synset":
synset = []
attrib = element.attrib
id = attrib["id"]
if id not in synset2Words.keys():
synset2Words[id] = "WORD"
"""
def load_corpus(corpus_path, corpus_name, lang="de"):
    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    # load parser
    parserpath = corpus_path + str(lang) + '_parser'
    parser = spacy.load(parserpath)

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

# todo: load corpus from file; idea: load the stringstore and vocab
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

print(load_corpus(corpus_path, corpus_name))
"""
from postal.parser import parse_address
@ -101,12 +195,6 @@ address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder-
print(parse_address(address))
"""
"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"