topicModelingTickets/init.py

# -*- coding: utf-8 -*-

from miscellaneous import *
from stop_words import get_stop_words
import csv
import sys
import xml.etree.ElementTree as ET

from nltk.corpus import stopwords as nltk_stopwords

from collections import Counter
import time
from datetime import datetime
import os

# Raise the csv module's maximum field size to sys.maxsize.
csv.field_size_limit(sys.maxsize)
# Directory that contains this file, with a trailing slash, used as the base
# for all relative resource paths below.
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"


# load config: config.ini is expected to sit next to this file
config_ini = FILEPATH + "config.ini"

# NOTE(review): ConfigParser is not imported explicitly in this file; it is
# presumably provided by the star import from miscellaneous - confirm.
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


def create_lemma_dict(path2lemmalist):
    """
    Build a word --> lemma lookup out of a txt file a la:

    l1 w1
    l1 w2
    l2 w1
    l2 w2

    The first token of every line is taken as the lemma, the second as the
    word; both are lower-cased. Any further tokens on a line are ignored.
    If a word occurs on several lines, the last line wins.
    Lines with fewer than two tokens (e.g. blank lines) are skipped instead
    of aborting the whole run with an IndexError.

    Result will be used as lemma_dict[word] --> lemma

    :param path2lemmalist: str
    :return: dictionary
    """
    lemma_dict = {}

    # NOTE(review): textacy is not imported explicitly in this file; it is
    # presumably provided by the star import from miscellaneous - confirm.
    for raw_line in textacy.fileio.read_file_lines(path2lemmalist):
        lem_word_pair = textacy.preprocess.normalize_whitespace(raw_line).split()

        # a usable line needs at least "<lemma> <word>"
        if len(lem_word_pair) < 2:
            continue

        # split() already removed all surrounding whitespace from the tokens
        lemma = lem_word_pair[0].lower()
        word = lem_word_pair[1].lower()

        lemma_dict[word] = lemma

    return lemma_dict


def build_thesaurus_dict(path2wordnet, returnall=False):
    """
    Creates a dict out of the deWordNet
    https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml

    Result will be used as thesaurus[word] --> main_synonym

    Processing steps:
      1. word2synsets: every LexicalEntry's normalized writtenForm is mapped
         to the list of synset ids found on its child elements.
      2. synset2Words: the inverse mapping, synset id --> list of words.
         Each list is sorted by number of tokens (fewer first); words marked
         as "(hauptform)" are moved to the front with the marker removed.
      3. thesaurus: every word is mapped to the first word of its first
         synset. Words without any synset are left out.

    :param path2wordnet: str
    :param returnall: bool    if True, also return word2synsets, synset2Words
    :return: dictionary thesaurus, or the tuple
             (thesaurus, word2synsets, synset2Words) if returnall is True
    """
    lexroot = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8")).getroot()

    # Build word2synsets, shaped like {"w1": ["s1", "s2"]}
    word2synsets = {}

    for ro in lexroot:
        for elem in ro:
            if elem.tag != "LexicalEntry":
                continue

            synlist = []
            # placeholder key used when an entry carries no writtenForm
            string = "WORD"

            for subentry in elem:
                lex_dict = subentry.attrib

                if "synset" in lex_dict:
                    synlist.append(lex_dict["synset"])

                # if several children carry a writtenForm, the last one wins
                if "writtenForm" in lex_dict:
                    string = _normalize_written_form(lex_dict["writtenForm"])

            if string != '':
                word2synsets[string] = synlist

    # Build synset2Words, shaped like {"s1": ["w1", "w2"]}
    synset2Words = {}

    for word, synsets in word2synsets.items():
        for syn in synsets:
            synset2Words.setdefault(syn, []).append(word)

    # Sort: fewer tokens first, then pull main forms to the front.
    for words in synset2Words.values():
        words.sort(key=lambda w: len(w.split()))

        main_forms = [_strip_parentheses(w) for w in words if "(hauptform)" in w]
        if main_forms:
            others = [w for w in words if "(hauptform)" not in w]
            # Rebuild in place instead of removing/inserting while iterating.
            # Reversal keeps the original outcome: each main form was pushed
            # to index 0 in turn, so the last one found ends up first.
            words[:] = main_forms[::-1] + others

    # word --> [synset1, synset2, .. ] --> synset1 --> [syn1, syn2, ... ] --> syn1 / mainsyn
    thesaurus = {}

    for word, synsets in word2synsets.items():
        # words without any synset have no synonym to map to
        if not synsets:
            continue

        # first synset is chosen for practical reasons; its first word is
        # assumed to be the main synonym (the main form, if one was marked)
        first_syn = synset2Words[synsets[0]][0]

        # drop the "(hauptform)" marker from the key
        thesaurus[_strip_parentheses(word)] = first_syn

    if returnall:
        return thesaurus, word2synsets, synset2Words
    else:
        return thesaurus


# Transliteration of German special characters to ASCII; "." is removed.
_WRITTEN_FORM_TABLE = str.maketrans({
    "ß": "ss",
    "ö": "oe",
    "Ö": "Oe",
    "ü": "ue",
    "Ü": "Ue",
    "ä": "ae",
    "Ä": "Ae",
    ".": None,
})


def _strip_parentheses(text):
    """Remove everything from the first "(" to the last ")" and trim the result."""
    return re.sub(r"\((.*)\)", " ", text).strip()


def _normalize_written_form(written_form):
    """Normalize a writtenForm: transliterate, drop parentheses, flag main forms."""
    string = written_form.translate(_WRITTEN_FORM_TABLE)

    # remember whether the entry was flagged as main form ("Hauptform")
    # before the parenthesized part is thrown away
    is_main_form = "auptform" in string

    string = re.sub(r"\((.*)\)", " ", string)
    if is_main_form:
        string = string + " (hauptform)"

    # collapse runs of whitespace and trim both ends
    return " ".join(string.split())


def create_stopword_lists(*paths):
    """
    Build one German and one English stopword list, each merged from:
        the stop_words package
        nltk
        spacy
        additional files passed in via paths

    Every entry is whitespace-normalized and passed through the function
    returned by replaceRockDots_lambda(); duplicates are removed, so the
    order of the returned lists is not defined.

    :param paths: additional filepaths where each file looks like
        w1
        w2
        w3
    filenames must be a la de_stopwords_1.txt, en_stopwords_2.txt
    (the part before the first "_" selects the language, the second part
    must be "stopwords"; other files are ignored)

    :return: lists: de_stopwords, en_stopwords
    """

    def collect(lang, nltk_name, spacy_module):
        # from packages
        from_stop_words = list(get_stop_words(lang))
        from_nltk = list(nltk_stopwords.words(nltk_name))
        from_spacy = list(__import__(spacy_module, globals(), locals(), ['object']).STOP_WORDS)

        # from files: keep only "<lang>_stopwords_*" files
        selected = []
        for path in paths:
            name_parts = os.path.basename(path).split("_")
            if name_parts[0] == lang and name_parts[1] == 'stopwords':
                selected.append(path)
        from_files = list_from_files(*selected)

        # combine everything
        fix_rock_dots = replaceRockDots_lambda()
        cleaned = [textacy.preprocess.normalize_whitespace(entry)
                   for entry in from_stop_words + from_nltk + from_spacy + from_files]
        return list(set(map(fix_rock_dots, cleaned)))

    ##  GERMAN
    de_stop_words = collect('de', 'german', "spacy.de")

    ##  ENGLISH
    en_stop_words = collect('en', 'english', "spacy.en")

    return de_stop_words, en_stop_words


def build_words_for_spellchecking(path2words):
    """
    create word-Counter for spellchecking

    The file is read completely, lower-cased and split into runs of word
    characters; the Counter holds the number of occurrences per word.

    http://norvig.com/spell-correct.html
    http://wortschatz.uni-leipzig.de/en/download

    http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_news_2015_1M.tar.gz

    :param path2words: str    path to a plain-text corpus file
    :return: Counter
    """
    # context manager so the file handle is closed deterministically
    with open(path2words) as corpus:
        text = corpus.read()

    return Counter(re.findall(r'\w+', text.lower()))


##################################################################################################


def main():
    """
    Build all lookup resources and save each one via save_obj.

    In order: thesaurus, lemma dict, spellchecking word counter, German and
    English stopword lists, noun list, first-name list. Input and output
    file names are read from the module-level config and resolved relative
    to the "ressources/" directory next to this file. Progress and the total
    run time are reported through logprint.
    """
    started = time.time()
    logprint("Init: {0}".format(datetime.now()))

    ressources_path = FILEPATH + "ressources/"

    def resource(section, option):
        # configured file name, resolved inside the resources directory
        return ressources_path + config.get(section, option)

    # THESAURUS
    logprint("Build and save Thesaurus")
    thesaurus = build_thesaurus_dict(resource("thesaurus", "input"))
    save_obj(thesaurus, resource("thesaurus", "pickle_file"))

    # LEMMA
    logprint("create and save lemma_dict")
    lemma_dict = create_lemma_dict(resource("lemmatization", "input"))
    save_obj(lemma_dict, resource("lemmatization", "pickle_file"))

    # SPELLCHECKING
    logprint("Build and save Wordlist for Spellchecking")
    word_counter = build_words_for_spellchecking(resource("spellchecking", "input"))
    save_obj(word_counter, resource("spellchecking", "pickle_file"))

    # STOPWORDS
    logprint("Build and save stoppwortliste")
    stopword_files = [
        resource("de_stopwords", "input1"),
        resource("de_stopwords", "input2"),
        resource("de_stopwords", "input3"),
    ]
    de_stop_words, en_stop_words = create_stopword_lists(*stopword_files)
    save_obj(de_stop_words, resource("de_stopwords", "pickle_file"))
    save_obj(en_stop_words, resource("en_stopwords", "pickle_file"))

    # NOUNS
    logprint("Build and save nomenliste")
    nouns = list_from_files(
        resource("nouns", "input"),
        resource("nouns", "input1"),
        resource("nouns", "input2"),
    )
    save_obj(nouns, resource("nouns", "pickle_file"))

    # FIRST NAMES
    logprint("Build and save firstnameslist")
    firstnames = list_from_files(resource("firstnames", "input"))
    save_obj(firstnames, resource("firstnames", "pickle_file"))

    elapsed = time.time() - started
    logprint("Time Elapsed Initialization:{0} min".format(elapsed / 60))


# Run the initialization only when this file is executed as a script.
if __name__ == "__main__":
    main()
refactoring. 2017-10-11 17:16:04 +02:00			`# -- coding: utf-8 --`

weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`from miscellaneous import *`
aufgeräumt 2017-10-16 14:01:38 +02:00			`from stop_words import get_stop_words`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`import csv`
			`import sys`
			`import xml.etree.ElementTree as ET`
aufgeräumt 2017-10-16 14:01:38 +02:00
refactoring. 2017-10-11 17:16:04 +02:00			`from nltk.corpus import stopwords as nltk_stopwords`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00
refactoring. 2017-10-11 17:16:04 +02:00			`from collections import Counter`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`import time`
			`from datetime import datetime`
			`import os`

refactoring. 2017-10-11 17:16:04 +02:00			`csv.field_size_limit(sys.maxsize)`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"`
refactoring. 2017-10-11 17:16:04 +02:00

aufgeräumt 2017-10-16 14:01:38 +02:00			`# load config`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`config_ini = FILEPATH + "config.ini"`
refactoring. 2017-10-11 17:16:04 +02:00
			`config = ConfigParser.ConfigParser()`
			`with open(config_ini) as f:`
			`config.read_file(f)`



aufgeräumt 2017-10-16 14:01:38 +02:00			`def create_lemma_dict(path2lemmalist):`
			`"""`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`Creates a dict out of a txt file a la:`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`l1 w1`
			`l1 w2`
			`l2 w1`
			`l2 w2`
refactoring. 2017-10-11 17:16:04 +02:00
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`Result will be used as lemma_dict[word] --> lemma`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`:param path2lemmalist: str`
			`:return: dictionary`
			`"""`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`file_gen = textacy.fileio.read_file_lines(path2lemmalist)`
			`lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(file_gen)))`
refactoring. 2017-10-11 17:16:04 +02:00
			`lemma_dict = {}`

			`for line in lemmalist:`
			`lem_word_pair = line.split()`

			`lemma = lem_word_pair[0].strip().lower()`

			`word = lem_word_pair[1].strip().lower()`

			`lemma_dict[word] = lemma`

			`return lemma_dict`



aufgeräumt 2017-10-16 14:01:38 +02:00			`def build_thesaurus_dict(path2wordnet,returnall=False):`
			`"""`
			`Creates a dict out of the deWordNet`
			`https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml`
refactoring. 2017-10-11 17:16:04 +02:00
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`Result will be used as thesaurus[word] --> main_synonym`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`:param path2wordnet: str`
aufgeräumt 2017-10-16 14:01:38 +02:00			`:param returnall: bool if True, also return , word2synsets, synset2Words`
			`:return: dictionaries: thesaurus`
			`"""`
			`lextree = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8"))`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00
			`lexroot = lextree.getroot()`

refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`# Build word2synsets`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00			`word2synsets = {}`
			`template = {"w1": ["s1", "s2"]}`

			`for ro in lexroot:`
			`for elem in ro:`
			`if elem.tag == "LexicalEntry":`
			`lex_dictlist = [subentry.attrib for subentry in elem]`

preprocessing überarbeitet 2017-12-08 11:06:07 +01:00			`# idee technischer thesaurus`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00
			`synlist = []`
			`string = "WORD"`

			`for lex_dict in lex_dictlist:`
			`if "synset" in lex_dict.keys():`

			`synset = lex_dict["synset"]`
			`synlist.append(synset)`

			`if 'writtenForm' in lex_dict.keys():`
			`string = (lex_dict["writtenForm"])`

aufgeräumt 2017-10-16 14:01:38 +02:00
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00			`# replaceRockDots`
			`string = re.sub(r'[ß]', "ss", string)`
			`string = re.sub(r'[ö]', "oe", string)`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`string = re.sub(r'[Ö]', "Oe", string)`

thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00			`string = re.sub(r'[ü]', "ue", string)`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`string = re.sub(r'[Ü]', "Ue", string)`

thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00			`string = re.sub(r'[ä]', "ae", string)`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`string = re.sub(r'[Ä]', "ae", string)`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00
			`# alle punkte raus`
			`string = re.sub(r'[.]', "", string)`

			`# alles in klammern raus`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`if "auptform" in string:`
			`string = re.sub(r"\((.*)\)", " ", string)`
			`string = string + " (hauptform)" # evtl. als hauptform merken`
			`else:`
			`string = re.sub(r"\((.*)\)", " ", string)`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00
			`# längeres leerzeichen normalisieren`
			`string = textacy.preprocess.normalize_whitespace(string)`

refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`string = string.strip()#.lower()`

			`if string != '':`
			`word2synsets[string] = synlist`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00

aufgeräumt 2017-10-16 14:01:38 +02:00
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`# Build synset2Words`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00			`synset2Words = {}`
			`template = {"s1": ["w1","w2"]}`

			`for word,synset in word2synsets.items():`
aufgeräumt 2017-10-16 14:01:38 +02:00			`if word != '':`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00

aufgeräumt 2017-10-16 14:01:38 +02:00			`for syn in synset:`
			`if syn not in synset2Words.keys():`
			`synset2Words[syn] = [word]`
			`else:`
			`synset2Words[syn].append(word)`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`# Sortieren`
			`for words in synset2Words.values():`
			`words.sort(key=lambda w: len(w.split())) # nach anzhal der wörter in den strings (weniger nach vorne)`
			`for w in words:`
			`if "(hauptform)" in w:`
			`to_insert = re.sub(r"\((.*)\)", " ", w).strip()`

			`words.remove(w)`
			`words.insert(0, to_insert) # Hauptform evtl. nach vorne`



thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00
			`thesaurus = {}`
			`thesaurus_template = {"w1" : "mainsyn"}`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`# word --> [synset1, synset2, .. ] --> synset1 --> [syn1, syn2, ... ] --> syn1 / mainsyn`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`for word,synsets in word2synsets.items(): #word , [synset1, synset2, .. ]`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00			`try:`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`if "Passwort" in word:`
			`x=2`

			`first_synset = synsets[0] #erstes synset wählen . praktischer Grund`

			`syns = synset2Words[first_synset] # [syn1, syn2, ... ]`

			`first_syn = syns[0] # erstes synonym (evtl. Hauptform) wählen`

			`word = re.sub(r"\((.*)\)", " ", word).strip() #(hautpform weg)`



			`thesaurus[word] = first_syn #Ann.: erstes synonym ist das Hauptsynonym`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00			`except:`
			`pass`


refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`if returnall:`
			`return thesaurus, word2synsets, synset2Words`
			`else:`
			`return thesaurus`
thesaurus erstellung luafzeit verbessert 2017-10-12 15:57:56 +02:00

refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`def create_stopword_lists(*paths):`
			`"""`
			`creates a list of stoppwords from:`
			`spacy`
			`nltk`
			`stop_words`

			`:param paths: list of additional filepaths where each file looks like`
			`w1`
			`w2`
			`w3`
			`filenames must be a la de_stopwords_1.txt, en_stopwords_2.txt`

			`:return: lists: de_stopwords, en_stopwords`
			`"""`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`## GERMAN`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`# from packages`
			`de_stop_words1 = list(get_stop_words("de"))`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`de_stop_words2 = list(nltk_stopwords.words('german'))`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`#from files`
			`de_filepaths = []`
			`for path in paths:`
			`if os.path.basename(path).split("_")[0] == 'de' and os.path.basename(path).split("_")[`
			`1] == 'stopwords':`
			`de_filepaths.append(path)`
refactoring. 2017-10-11 17:16:04 +02:00

aufgeräumt 2017-10-16 14:01:38 +02:00			`de_stop_words4 = list_from_files(*de_filepaths)`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`#combine everything`
preprocessing überarbeitet 2017-12-08 11:06:07 +01:00			`de_stop_words = list(set(map(replaceRockDots_lambda(), list(map(textacy.preprocess.normalize_whitespace,`
			`de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))`
refactoring. 2017-10-11 17:16:04 +02:00


aufgeräumt 2017-10-16 14:01:38 +02:00			`## ENGLISH`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`# from packages`
			`en_stop_words1 = list(get_stop_words("en"))`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`en_stop_words2 = list(nltk_stopwords.words('english'))`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`en_stop_words3 = list(__import__("spacy.en", globals(), locals(), ['object']).STOP_WORDS)`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`# from files`
			`en_filepaths = [path for path in paths if`
			`os.path.basename(path).split("_")[0] == 'en' and os.path.basename(path).split("_")[`
			`1] == 'stopwords']`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`en_stop_words4 = list_from_files(*en_filepaths)`
refactoring. 2017-10-11 17:16:04 +02:00

aufgeräumt 2017-10-16 14:01:38 +02:00			`# combine everything`
preprocessing überarbeitet 2017-12-08 11:06:07 +01:00			`en_stop_words = list(set(map(replaceRockDots_lambda(), list(map(textacy.preprocess.normalize_whitespace,`
			`en_stop_words1 + en_stop_words2 + en_stop_words3 + en_stop_words4)))))`
refactoring. 2017-10-11 17:16:04 +02:00

aufgeräumt 2017-10-16 14:01:38 +02:00			`return de_stop_words, en_stop_words`
refactoring. 2017-10-11 17:16:04 +02:00


aufgeräumt 2017-10-16 14:01:38 +02:00			`def build_words_for_spellchecking(path2words):`
			`"""`
			`create word-Counter for spellchecking`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`http://norvig.com/spell-correct.html`
			`http://wortschatz.uni-leipzig.de/en/download`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_news_2015_1M.tar.gz`
			`:return: Counter`
			`"""`
			`def words(text): return re.findall(r'\w+', text.lower())`
refactoring. 2017-10-11 17:16:04 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`return Counter(words(open(path2words).read()))`
refactoring. 2017-10-11 17:16:04 +02:00


aufgeräumt 2017-10-16 14:01:38 +02:00			`##################################################################################################`
refactoring. 2017-10-11 17:16:04 +02:00





refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`def main():`
			`start = time.time()`
			`logprint("Init: {0}".format(datetime.now()))`
refactoring. 2017-10-11 17:16:04 +02:00
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00			`ressources_path = FILEPATH + "ressources/"`
refactoring. 2017-10-11 17:16:04 +02:00

preprocessing abgeschlossen 2017-10-18 17:37:20 +02:00
refactoring. 2017-10-11 17:16:04 +02:00


refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`# THESAURUS`
			`logprint("Build and save Thesaurus")`

			`path2wordnet = ressources_path + config.get("thesaurus", "input")`
			`thesaurus = build_thesaurus_dict(path2wordnet)`

			`path2thesaurus_dict = ressources_path + config.get("thesaurus", "pickle_file")`
			`save_obj(thesaurus, path2thesaurus_dict)`





refactoring. 2017-10-11 17:16:04 +02:00

refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00


			`# LEMMA`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`logprint("create and save lemma_dict")`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`path2lemma_file = ressources_path + config.get("lemmatization", "input")`
aufgeräumt 2017-10-16 14:01:38 +02:00			`lemma_dict = create_lemma_dict(path2lemma_file)`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`path2lemmadict = ressources_path + config.get("lemmatization", "pickle_file")`
aufgeräumt 2017-10-16 14:01:38 +02:00			`save_obj(lemma_dict, path2lemmadict)`
refactoring. 2017-10-11 17:16:04 +02:00

refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00











			`# SPELLCHECKING`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`logprint("Build and save Wordlist for Spellchecking")`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`path2words_file = ressources_path + config.get("spellchecking", "input")`
aufgeräumt 2017-10-16 14:01:38 +02:00			`words = build_words_for_spellchecking(path2words_file)`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`path2words_counter = ressources_path + config.get("spellchecking", "pickle_file")`
			`save_obj(words, path2words_counter)`

refactoring. 2017-10-11 17:16:04 +02:00






refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00


			`# STOPWORDS`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`logprint("Build and save stoppwortliste")`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`stop1 = ressources_path + config.get("de_stopwords", "input1")`
			`stop2 = ressources_path + config.get("de_stopwords", "input2")`
			`stop3 = ressources_path + config.get("de_stopwords", "input3")`
preprocessing abgeschlossen 2017-10-18 17:37:20 +02:00			`de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3)`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`path2stopwordlist_de = ressources_path + config.get("de_stopwords", "pickle_file")`
preprocessing abgeschlossen 2017-10-18 17:37:20 +02:00			`save_obj(de_stop_words, path2stopwordlist_de)`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`path2stopwordlist_en = ressources_path + config.get("en_stopwords", "pickle_file")`
preprocessing abgeschlossen 2017-10-18 17:37:20 +02:00			`save_obj(en_stop_words, path2stopwordlist_en)`
refactoring. 2017-10-11 17:16:04 +02:00


refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00






			`# NOMEN`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`logprint("Build and save nomenliste")`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`nouns0 = ressources_path + config.get("nouns", "input")`
			`nouns1 = ressources_path + config.get("nouns", "input1")`
			`nouns2 = ressources_path + config.get("nouns", "input2")`
			`nouns = list_from_files(nouns0,nouns1,nouns2)`

			`path2nouns_list = ressources_path + config.get("nouns", "pickle_file")`
aufgeräumt 2017-10-16 14:01:38 +02:00			`save_obj(nouns, path2nouns_list)`
refactoring. 2017-10-11 17:16:04 +02:00

refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00






			`# VORNAMEN`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`logprint("Build and save firstnameslist")`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`firstnames_txt = ressources_path + config.get("firstnames", "input")`
aufgeräumt 2017-10-16 14:01:38 +02:00			`vornamen = list_from_files(firstnames_txt)`
refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00
			`path2firstnameslist = ressources_path + config.get("firstnames", "pickle_file")`
aufgeräumt 2017-10-16 14:01:38 +02:00			`save_obj(vornamen, path2firstnameslist)`
refactoring. 2017-10-11 17:16:04 +02:00

refactoring. jetzt kommt der umbau cleanedcoprus --> doctermmatrix --LDA & labaled_lines.txt --> LLDA 2017-12-11 12:10:40 +01:00









refactoring. 2017-10-11 17:16:04 +02:00			`end = time.time()`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60))`
refactoring. 2017-10-11 17:16:04 +02:00


			`if __name__ == "__main__":`
			`main()`