# topicModelingTickets/init.py


# -*- coding: utf-8 -*-
from miscellaneous import *  # expected to provide re, textacy, ConfigParser, printlog, save_obj, list_from_files, replaceRockDots
from stop_words import get_stop_words
import csv
import sys
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import time
from datetime import datetime
import os
csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
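
# Illustrative sketch of the expected config.ini layout. The section and key
# names are taken from the config.get(...) calls further down in this file;
# the values shown here are hypothetical placeholders:
#
#     [thesaurus]
#     input = deWordNet.xml
#     pickle_file = thesaurus_dict.pkl
#
#     [lemmatization]
#     input = lemma_list.txt
#     pickle_file = lemma_dict.pkl
#
#     [de_stopwords]
#     input1 = de_stopwords_1.txt
#     input2 = de_stopwords_2.txt
#     input3 = de_stopwords_3.txt
#     pickle_file = de_stopwords_list.pkl
#
# The [spellchecking], [nouns] and [firstnames] sections follow the same
# input / pickle_file pattern.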


def create_lemma_dict(path2lemmalist):
    """
    Creates a dict out of a file like:
        l1 w1
        l1 w2
        l2 w1
        l2 w2
    Result will be used as lemma_dict["word"] --> lemma

    :param path2lemmalist: str
    :return: dictionary
    """
    lemmalist = list(map(textacy.preprocess.normalize_whitespace,
                         list(textacy.fileio.read_file_lines(path2lemmalist))))

    lemma_dict = {}
    for line in lemmalist:
        lem_word_pair = line.split()
        lemma = lem_word_pair[0].strip().lower()
        word = lem_word_pair[1].strip().lower()
        lemma_dict[word] = lemma

    return lemma_dict
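
# Illustrative usage sketch (hypothetical file name and contents): a lemma list
# with the two whitespace-separated columns
#     gehen ging
#     gehen gegangen
# yields a dict that maps each inflected form to its lemma:
#     lemma_dict = create_lemma_dict("lemma_list.txt")
#     lemma_dict["ging"]       # -> "gehen"
#     lemma_dict["gegangen"]   # -> "gehen"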


def build_thesaurus_dict(path2wordnet, returnall=False):
    """
    Creates a dict out of the deWordNet
    https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml

    Result will be used as thesaurus["word"] --> main synonym

    :param path2wordnet: str
    :param returnall: bool; if True, also return word2synsets and synset2Words
    :return: dictionaries: thesaurus (plus word2synsets, synset2Words if returnall)
    """
    lextree = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8"))
    lexroot = lextree.getroot()

    word2synsets = {}
    template = {"w1": ["s1", "s2"]}

    for ro in lexroot:
        for elem in ro:
            if elem.tag == "LexicalEntry":
                lex_dictlist = [subentry.attrib for subentry in elem]

                synlist = []
                string = "WORD"

                for lex_dict in lex_dictlist:
                    if "synset" in lex_dict.keys():
                        synset = lex_dict["synset"]
                        synlist.append(synset)

                    if 'writtenForm' in lex_dict.keys():
                        string = (lex_dict["writtenForm"])

                        if string == "Kennwort":
                            pass  # leftover debug hook

                        # replaceRockDots: transliterate umlauts and ß
                        string = re.sub(r'[ß]', "ss", string)
                        string = re.sub(r'[ö]', "oe", string)
                        string = re.sub(r'[ü]', "ue", string)
                        string = re.sub(r'[ä]', "ae", string)

                        # remove all dots
                        string = re.sub(r'[.]', "", string)

                        # remove everything in parentheses
                        string = re.sub(r"\((.*)\)", " ", string)

                        # normalize longer runs of whitespace
                        string = textacy.preprocess.normalize_whitespace(string)

                        string = string.lower().strip()

                word2synsets[string] = synlist

    synset2Words = {}
    template = {"s1": ["w1", "w2"]}

    for word, synset in word2synsets.items():
        if word != '':
            for syn in synset:
                if syn not in synset2Words.keys():
                    synset2Words[syn] = [word]
                else:
                    synset2Words[syn].append(word)

    # sort by the number of words in the strings
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))

    thesaurus = {}
    thesaurus_template = {"w1": "mainsyn"}

    for word, synset in word2synsets.items():
        try:
            # assumption: the first synonym is the main synonym
            # todo: look for (Hauptform) instead?
            thesaurus[word] = synset2Words[synset[0]][0]
        except (KeyError, IndexError):
            # word has no synset, or its synset has no recorded words
            pass

    if returnall:
        return thesaurus, word2synsets, synset2Words
    else:
        return thesaurus
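
# Illustrative sketch of the intermediate structures (hypothetical entries,
# assuming "fahrrad" is encountered before "rad" while parsing):
#     word2synsets = {"fahrrad": ["s123"], "rad": ["s123", "s456"]}
#     synset2Words = {"s123": ["fahrrad", "rad"], "s456": ["rad"]}
#     thesaurus    = {"fahrrad": "fahrrad", "rad": "fahrrad"}
# i.e. each written form maps to the first word recorded for its first synset.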


def create_stopword_lists(*paths):
    """
    Creates stopword lists from:
        spacy
        nltk
        stop_words

    :param paths: additional filepaths where each file looks like
        w1
        w2
        w3
    filenames must look like de_stopwords_1.txt, en_stopwords_2.txt
    :return: lists: de_stopwords, en_stopwords
    """

    ## GERMAN

    # from packages
    de_stop_words1 = list(get_stop_words("de"))
    de_stop_words2 = list(nltk_stopwords.words('german'))
    de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)

    # from files
    de_filepaths = []
    for path in paths:
        if os.path.basename(path).split("_")[0] == 'de' and os.path.basename(path).split("_")[1] == 'stopwords':
            de_filepaths.append(path)

    de_stop_words4 = list_from_files(*de_filepaths)

    # combine everything
    # replaceRockDots() is assumed (defined in miscellaneous) to return a callable
    # that transliterates umlauts and ß, matching its use as a map function here
    de_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
                                                             de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))

    ## ENGLISH

    # from packages
    en_stop_words1 = list(get_stop_words("en"))
    en_stop_words2 = list(nltk_stopwords.words('english'))
    en_stop_words3 = list(__import__("spacy.en", globals(), locals(), ['object']).STOP_WORDS)

    # from files
    en_filepaths = [path for path in paths if
                    os.path.basename(path).split("_")[0] == 'en' and
                    os.path.basename(path).split("_")[1] == 'stopwords']

    en_stop_words4 = list_from_files(*en_filepaths)

    # combine everything
    en_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
                                                             en_stop_words1 + en_stop_words2 + en_stop_words3 + en_stop_words4)))))

    return de_stop_words, en_stop_words
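
# Illustrative usage sketch (hypothetical file names): the filename convention
# checked above routes "de_stopwords_*.txt" files into the German list and
# "en_stopwords_*.txt" files into the English one:
#     de_stop_words, en_stop_words = create_stopword_lists("de_stopwords_1.txt",
#                                                          "en_stopwords_2.txt")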


def build_words_for_spellchecking(path2words):
    """
    Creates a word Counter for spellchecking
    http://norvig.com/spell-correct.html

    http://wortschatz.uni-leipzig.de/en/download
    http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_news_2015_1M.tar.gz
    :return: Counter
    """
    def words(text):
        return re.findall(r'\w+', text.lower())

    return Counter(words(open(path2words).read()))
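
# Illustrative usage sketch (hypothetical file name): the Counter maps
# lowercased tokens to their corpus frequency, which a Norvig-style corrector
# uses to rank candidate corrections:
#     word_counts = build_words_for_spellchecking("deu_news_2015_1M-sentences.txt")
#     word_counts["der"]   # -> frequency of "der" in the corpus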


##################################################################################################

# THESAURUS
path2wordnet = FILEPATH + config.get("thesaurus", "input")
path2thesaurus_dict = FILEPATH + config.get("thesaurus", "pickle_file")

# SPELLCHECKING
path2words_file = FILEPATH + config.get("spellchecking", "input")
path2wordlist = FILEPATH + config.get("spellchecking", "pickle_file")

# LEMMA
path2lemma_file = FILEPATH + config.get("lemmatization", "input")
path2lemmadict = FILEPATH + config.get("lemmatization", "pickle_file")

# NOUNS
nouns1 = FILEPATH + config.get("nouns", "input1")
nouns2 = FILEPATH + config.get("nouns", "input2")
path2nouns_list = FILEPATH + config.get("nouns", "pickle_file")

# FIRST NAMES
firstnames_txt = FILEPATH + config.get("firstnames", "input")
path2firstnameslist = FILEPATH + config.get("firstnames", "pickle_file")

# STOPWORDS
stop1 = FILEPATH + config.get("de_stopwords", "input1")
stop2 = FILEPATH + config.get("de_stopwords", "input2")
stop3 = FILEPATH + config.get("de_stopwords", "input3")
path2stopwordlist = FILEPATH + config.get("de_stopwords", "pickle_file")


def main():
    start = time.time()
    printlog("Init: {0}".format(datetime.now()))

    printlog("create and save lemma_dict")
    lemma_dict = create_lemma_dict(path2lemma_file)
    save_obj(lemma_dict, path2lemmadict)

    printlog("Build and save wordlist for spellchecking")
    words = build_words_for_spellchecking(path2words_file)
    save_obj(words, path2wordlist)

    printlog("Build and save thesaurus")
    thesaurus = build_thesaurus_dict(path2wordnet)
    save_obj(thesaurus, path2thesaurus_dict)

    printlog("Build and save stopword list")
    # create_stopword_lists returns (german, english); only the German list is pickled here
    de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3)
    save_obj(de_stop_words, path2stopwordlist)

    printlog("Build and save noun list")
    nouns = list_from_files(nouns1, nouns2)
    save_obj(nouns, path2nouns_list)

    printlog("Build and save first-names list")
    vornamen = list_from_files(firstnames_txt)
    save_obj(vornamen, path2firstnameslist)

    end = time.time()
    printlog("Time Elapsed Initialization: {0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()