# topicModelingTickets/init.py
# -*- coding: utf-8 -*-
from datetime import datetime
import time
import logging
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
# todo configuration file ?
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
config.read_file(f)
"""
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"
# config logging
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
logging.basicConfig(filename=logfile, level=logging.INFO)
DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")
def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss",
                                 (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
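# Note: replaceRockDots() returns a callable rather than doing the substitution itself.
# Hedged usage example (the input word is illustrative):
#   replaceRockDots()("Straße")  ->  "strasse"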
def printlog(string, level="INFO"):
    """Print to stdout and write to the log file at the given level."""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)
def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(path):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)
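# Hedged usage example for the pickle helpers (the path is illustrative):
#   save_obj({"a": 1}, "/tmp/example") writes /tmp/example.pkl,
#   load_obj("/tmp/example") returns {"a": 1} again.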
def create_lemma_dict(lemmalist):
    lemma_dict = {}
    for line in lemmalist:
        lem_word_pair = line.split()
        lemma = lem_word_pair[0].strip().lower()
        word = lem_word_pair[1].strip().lower()
        lemma_dict[word] = lemma
    return lemma_dict
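# Hedged example of the expected input (assumption: lemmas.txt holds one whitespace-separated
# "lemma word" pair per line; the example pairs below are hypothetical):
#   create_lemma_dict(["sein ist", "sein war"])  ->  {"ist": "sein", "war": "sein"}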
"""
2017-10-11 17:16:04 +02:00
def build_thesaurus(path2lexicalentries, path2synsets):
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
lexroot = lextree.getroot()
synroot = syntree.getroot()
thesaurus = []
for r in synroot:
for element in r:
if element.tag == "Synset":
sysnet = []
attrib = element.attrib
id = attrib["id"]
for ro in lexroot:
for elem in ro:
if elem.tag == "LexicalEntry":
subs_dicts = [subentry.attrib for subentry in elem]
# <class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]
dic = {k: v for x in subs_dicts for k, v in x.items()} # to one dict
if "synset" in dic.keys():
if dic["synset"] == id:
string = (dic["writtenForm"])
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# alle punkte raus
string = re.sub(r'[.]', "", string)
# alles in klammern raus
string = re.sub(r"\((.*)\)", " ", string)
# längeres leerzeichen normalisieren
string = textacy.preprocess.normalize_whitespace(string)
sysnet.append(string.lower().strip())
# nach anzhal der wörter in den strings sortieren
sysnet.sort(key=lambda x: len(x.split()))
if len(sysnet) != 0:
# todo warum sind manche leer?
thesaurus.append(sysnet)
return thesaurus
#todo thesaurus in dictionary
"""
def build_thesaurus(path2lexicalentries):  # , path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    # syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
    lexroot = lextree.getroot()
    # synroot = syntree.getroot()
    word2synsets = {}
    template = {"w1": ["s1", "s2"]}  # shape of the mapping built below
    for ro in lexroot:
        for elem in ro:
            if elem.tag == "LexicalEntry":
                lex_dictlist = [subentry.attrib for subentry in elem]
                synlist = []
                string = "WORD"
                for lex_dict in lex_dictlist:
                    if "synset" in lex_dict.keys():
                        synset = lex_dict["synset"]
                        synlist.append(synset)
                    if 'writtenForm' in lex_dict.keys():
                        string = (lex_dict["writtenForm"])
                        # replaceRockDots
                        string = re.sub(r'[ß]', "ss", string)
                        string = re.sub(r'[ö]', "oe", string)
                        string = re.sub(r'[ü]', "ue", string)
                        string = re.sub(r'[ä]', "ae", string)
                        # remove all periods
                        string = re.sub(r'[.]', "", string)
                        # remove everything in parentheses
                        string = re.sub(r"\((.*)\)", " ", string)
                        # normalize longer runs of whitespace
                        string = textacy.preprocess.normalize_whitespace(string)
                        string = string.lower().strip()
                word2synsets[string] = synlist
    synset2Words = {}
    template = {"s1": ["w1", "w2"]}  # shape of the reverse mapping
    for word, synset in word2synsets.items():
        for syn in synset:
            if syn not in synset2Words.keys():
                synset2Words[syn] = [word]
            else:
                synset2Words[syn].append(word)
    # sort the synset lists by the number of words in each string
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))
    thesaurus = {}
    thesaurus_template = {"w1": "mainsyn"}  # shape of the result
    for word, synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym
        except (KeyError, IndexError):
            pass
    return thesaurus
"""
for r in synroot:
for element in r:
if element.tag == "Synset":
synset = []
attrib = element.attrib
id = attrib["id"]
if id not in synset2Words.keys():
synset2Words[id] = "WORD"
"""
def create_stopwordlist():
    de_stop_words1 = list(map(replaceRockDots(),
                              list(map(textacy.preprocess.normalize_whitespace,
                                       textacy.fileio.read_file_lines(
                                           "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")))))
    de_stop_words2 = list(map(replaceRockDots(), list(set(nltk_stopwords.words('german')))))
    de_stop_words3 = list(map(replaceRockDots(), list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))
    de_stop_words4 = list(map(replaceRockDots(), list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt"))))
    de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))
    return de_stop_words
#todo en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(nltk_stopwords.words('english'))))
########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download
def words(text): return re.findall(r'\w+', text.lower())
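# Hedged sketch (not part of the original pipeline): once the Counter built in main() is loaded
# again via load_obj(path2wordlist), it could feed a Norvig-style frequency lookup as described
# at the URL above. The name "word_probability" and its parameters are illustrative, not from the source.
def word_probability(word, word_counts):
    """Relative frequency of `word` in the corpus counts (0.0 for unseen words)."""
    total = sum(word_counts.values())
    return word_counts[word] / total if total else 0.0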
##################################################################################################
# goal: dictionaries for the thesaurus, the correct-word list and the lemmas as loadable files
# plus a clean stop word list and a noun list
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"
def main():
    start = time.time()
    printlog("Init: {0}".format(datetime.now()))
    printlog("create and save lemma_dict")
    LEMMAS = list(
        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))
    lemma_dict = create_lemma_dict(LEMMAS)
    save_obj(lemma_dict, path2lemmadict)
    printlog("Build and save Wordlist for Spellchecking")
    WORDS = Counter(words(open(path2words).read()))
    save_obj(WORDS, path2wordlist)
    printlog("Build and save Thesaurus")
    THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries)
    save_obj(THESAURUS, path2thesaurusdict)
    printlog("Build and save stop word list")
    de_stop_words = create_stopwordlist()
    save_obj(de_stop_words, path2stopwordlist)
    printlog("Build and save noun list")
    NOUNS = list(textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(
        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
    NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
    save_obj(NOUNS, path2NOUNSlist)
    printlog("Build and save first names list")
    VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))
    save_obj(VORNAMEN, path2firstnameslist)
    end = time.time()
    printlog("Time Elapsed Preprocessing: {0} min".format((end - start) / 60))
if __name__ == "__main__":
    main()