textcleaning macht keinen spass

2017-09-18 16:08:11 +02:00 · 2017-09-18 16:08:11 +02:00 · 20d9eed5b3
parent 092052dfe1
commit 20d9eed5b3
5 changed files with 21469 additions and 55 deletions
--- a/backup.py
+++ b/backup.py
@ -0,0 +1,383 @@
+# -*- coding: utf-8 -*-
+
+
+############# misc
+
+def printlog(string, level="INFO"):
+    """Print ``string`` and forward it to the ``logging`` module.
+
+    :param string: message to print and log
+    :param level: "INFO", "DEBUG" or "WARNING"; with any other value the
+                  message is printed but not logged
+    """
+    print(string)
+    dispatch = (
+        ("INFO", logging.info),
+        ("DEBUG", logging.debug),
+        ("WARNING", logging.warning),
+    )
+    for level_name, emit in dispatch:
+        if level == level_name:
+            emit(string)
+            break
+printlog("Load functions")
+
+def compose(*functions):
+    """Chain ``functions`` right-to-left: ``compose(f, g)(x) == f(g(x))``.
+
+    Called without arguments it returns the identity function.
+    """
+    def composed(x):
+        # the last function given is applied first
+        for fn in reversed(functions):
+            x = fn(x)
+        return x
+    return composed
+
+def get_calling_function():
+    """finds the calling function in many decent cases.
+    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
+
+    Looks at the frame of whoever called this helper and tries several
+    lookups for a function object whose ``__code__`` is the code object
+    executing in that frame.
+
+    :return: the function object found for the caller
+    :raises AttributeError: if no lookup yields a matching function
+    """
+    fr = sys._getframe(1)   # inspect.stack()[1][0]
+    co = fr.f_code  # code object running in the caller's frame
+    # lookups are tried in order; a KeyError/AttributeError moves on to the next one
+    for get in (
+        lambda:fr.f_globals[co.co_name],
+        lambda:getattr(fr.f_locals['self'], co.co_name),
+        lambda:getattr(fr.f_locals['cls'], co.co_name),
+        lambda:fr.f_back.f_locals[co.co_name], # nested
+        lambda:fr.f_back.f_locals['func'],  # decorators
+        lambda:fr.f_back.f_locals['meth'],
+        lambda:fr.f_back.f_locals['f'],
+        ):
+        try:
+            func = get()
+        except (KeyError, AttributeError):
+            pass
+        else:
+            # accept the candidate only if it really is the code being executed
+            if func.__code__ == co:
+                return func
+    raise AttributeError("func not found")
+
+
+def printRandomDoc(textacyCorpus):
+    """Print and log the size of ``textacyCorpus`` and one random document.
+
+    :param textacyCorpus: indexable corpus whose items expose ``text`` and
+                          ``metadata``
+
+    For an empty corpus only the size is reported.
+    """
+    import random
+    print()
+
+    corpus_size = len(textacyCorpus)
+    printlog("len(textacyCorpus) = %i" % corpus_size)
+    if corpus_size:
+        # randrange(n) draws from 0..n-1, so the last document can be picked too
+        randIndex = random.randrange(corpus_size)
+        doc = textacyCorpus[randIndex]
+        printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, doc.text, doc.metadata))
+
+    print()
+
+#############  load xml
+def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
+    """
+    generates strings from XML
+
+    Parses the file as UTF-8 and walks the children of the root element; for
+    each of them the text of every direct child tagged ``main_textfield`` is
+    yielded.
+
+    :param path2xml: path of the XML file
+    :param main_textfield: tag name of the field holding the main text
+    :yields strings
+    """
+    utf8_parser = ET.XMLParser(encoding="utf-8")
+    tickets = ET.parse(path2xml, utf8_parser).getroot()
+
+    for ticket in tickets:
+        yield from (field.text for field in ticket if field.tag == main_textfield)
+
+def generateMetadatafromTicketXML(path2xml, leave_out=('Description',)):
+    """
+    generates one metadata dict per ticket from XML
+
+    :param path2xml: path of the XML file (parsed as UTF-8)
+    :param leave_out: tag names to skip; any container supporting ``in``.
+                      The default is an immutable tuple so it cannot be
+                      altered between calls.
+    :yields dicts mapping each remaining field tag to its text
+    """
+    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
+
+    for ticket in tree.getroot():
+        yield {field.tag: field.text for field in ticket if field.tag not in leave_out}
+
+
+#############  load csv
+
+def csv_to_contentStream(path2csv: str, content_collumn_name: str):
+    """
+    Yield one cell per data row of a ";"-delimited CSV file.
+
+    The first row is read as the header. If several header cells equal
+    ``content_collumn_name`` the last one wins; if none does, column 0 is used.
+
+    :param path2csv: string
+    :param content_collumn_name: string
+    :return: string-generator
+    """
+    rows = iter(textacy.fileio.read_csv(path2csv, delimiter=";"))  # ,encoding='utf8')
+    header = next(rows, None)
+    if header is None:
+        return
+
+    wanted_index = 0  # default value
+    for position, title in enumerate(header):
+        if title == content_collumn_name:
+            wanted_index = position
+
+    for row in rows:
+        yield row[wanted_index]
+
+def csv_to_metaStream(path2csv: str, metalist: [str]):
+    """
+    Yield one metadata dict per data row of a ";"-delimited CSV file.
+
+    The first row is read as the header. Each key of ``metalist`` is looked
+    up in the header by name, so every key is paired with its own column
+    regardless of the order of ``metalist``. Keys that do not occur in the
+    header are left out of the yielded dicts.
+
+    :param path2csv: string
+    :param metalist: list of strings
+    :return: dict-generator
+    """
+    rows = iter(textacy.fileio.read_csv(path2csv, delimiter=";"))  # ,encoding='utf8')
+    header = next(rows, None)
+    if header is None:
+        return
+    header = list(header)
+
+    # e.g. {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}
+    column_of = {key: header.index(key) for key in metalist if key in header}
+
+    for row in rows:
+        yield {key: row[column] for key, column in column_of.items()}
+
+
+
+############################################    Preprocessing   ##############################################
+
+
+#############  on str-gen
+
+def processTokens(tokens, funclist, parser):
+    """Apply every function of ``funclist`` to ``tokens``, in order.
+
+    How a function is applied is chosen from the text of its
+    ``__annotations__``: 'bool' filters the tokens, 'str' maps each token to
+    a string and re-parses the joined result, 'spacy.tokens.doc.Doc' hands a
+    freshly parsed doc to the function and re-parses what it returns.
+    Anything else only triggers a warning.
+    """
+    # in:tokenlist, funclist
+    # out: tokenlist
+    for f in funclist:
+        # idea: sort funclist so that all string methods run first, then parse, then work on tokens, then possibly on the whole Doc
+
+        if 'bool' in str(f.__annotations__):
+            tokens = list(filter(f, tokens))
+
+        elif 'str' in str(f.__annotations__):
+            tokens = list(map(f, tokens))  # plain text
+            doc = parser(" ".join(tokens))  # parse again
+            tokens = [tok for tok in doc]  # tokens only
+
+        elif 'spacy.tokens.doc.Doc' in str(f.__annotations__):
+            #todo feels hacky
+            doc = parser(" ".join(tok.lower_ for tok in tokens))  # parsed
+            tokens = f(doc)
+            doc = parser(" ".join(tokens))  # parsed
+            tokens = [tok for tok in doc]  # tokens only
+        else:
+            warnings.warn("Unknown Annotation while preprocessing. Function: {0}".format(str(f)))
+
+    return tokens
+
+def processTextstream(textstream, funclist, parser=DE_PARSER):
+    """
+    Parse every text with ``parser.pipe``, run the tokens through
+    processTokens and yield the surviving tokens as one lower-cased,
+    space-joined string per text.
+
+    :param textstream: string-gen
+    :param funclist: [func]
+    :param parser: spacy-parser
+    :return: string-gen
+    """
+    # input:str-stream output:str-stream
+    for parsed in parser.pipe(textstream):
+        kept = processTokens(list(parsed), funclist, parser)
+        yield " ".join(tok.lower_ for tok in kept)
+
+def processDictstream(dictstream, funcdict, parser=DE_PARSER):
+    """
+    Clean selected values of every dict in ``dictstream``.
+
+    :param dictstream: dict-gen
+    :param funcdict: maps a key to the function list applied to its value, e.g.
+                    clean_in_meta = {
+                        "Solution":funclist,
+                        ...
+                    }
+                    Values of keys not listed here are passed through unchanged.
+    :param parser: spacy-parser
+    :return: dict-gen
+    """
+    for record in dictstream:
+        cleaned = {}
+        for key, value in record.items():
+            if key not in funcdict:
+                cleaned[key] = value
+                continue
+
+            tokens = processTokens(list(parser(value)), funcdict[key], parser)
+            cleaned[key] = " ".join(tok.lower_ for tok in tokens)
+        yield cleaned
+
+
+#############  return bool
+
+def keepPOS(pos_list) -> bool:
+    """Build a predicate that is true for tokens whose ``pos_`` is in ``pos_list``.
+
+    The predicate receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    has_wanted_pos = lambda token: token.pos_ in pos_list
+    has_wanted_pos.__annotations__ = inherited
+    return has_wanted_pos
+
+def removePOS(pos_list)-> bool:
+    """Build a predicate that is false for tokens whose ``pos_`` is in ``pos_list``.
+
+    The predicate receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    lacks_unwanted_pos = lambda token: not (token.pos_ in pos_list)
+    lacks_unwanted_pos.__annotations__ = inherited
+    return lacks_unwanted_pos
+
+def removeWords(words, keep=None)-> bool:
+    """Build a predicate that is false for tokens whose ``lower_`` is a blocked word.
+
+    :param words: words to remove; copied, the caller's collection is not modified
+    :param keep: optional iterable of words that must not be removed even
+                 if they occur in ``words``
+    :return: predicate carrying this factory's annotations
+    """
+    # a private set keeps the caller's list intact and makes the lookup per token cheap
+    blocked = set(words)
+    if hasattr(keep, '__iter__'):
+        blocked.difference_update(keep)
+
+    ret = lambda tok :  tok.lower_ not in blocked
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+def keepENT(ent_list) -> bool:
+    """Build a predicate that is true for tokens whose ``ent_type_`` is in ``ent_list``.
+
+    The predicate receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    has_wanted_entity = lambda token: token.ent_type_ in ent_list
+    has_wanted_entity.__annotations__ = inherited
+    return has_wanted_entity
+
+def removeENT(ent_list) -> bool:
+    """Build a predicate that is false for tokens whose ``ent_type_`` is in ``ent_list``.
+
+    The predicate receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    lacks_unwanted_entity = lambda token: not (token.ent_type_ in ent_list)
+    lacks_unwanted_entity.__annotations__ = inherited
+    return lacks_unwanted_entity
+
+def remove_words_containing_Numbers() -> bool:
+    """Build a predicate that is false for tokens whose ``lower_`` contains a digit.
+
+    The predicate receives this factory's annotations.
+    """
+    # raw string: '\d' in a normal literal is an invalid escape sequence
+    ret = lambda tok: not bool(re.search(r'\d', tok.lower_))
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+
+def remove_words_containing_specialCharacters() -> bool:
+    """Build a predicate that is false for tokens whose ``lower_`` contains one of the listed special characters.
+
+    The predicate receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    is_plain = lambda token: re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', token.lower_) is None
+    is_plain.__annotations__ = inherited
+    return is_plain
+
+
+def remove_words_containing_topLVL() -> bool:
+    """Build a predicate that is false for tokens whose ``lower_`` contains a dot followed by 2-3 lower-case letters.
+
+    The predicate receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    has_no_domain_suffix = lambda token: re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', token.lower_) is None
+    has_no_domain_suffix.__annotations__ = inherited
+    return has_no_domain_suffix
+
+
+_LEMMA_TABLES = {}  # filepath -> {inflected form: lemma}, filled on first use
+
+
+def _load_lemma_table(filepath):
+    """Read the lemma file once and return its form -> lemma mapping."""
+    table = _LEMMA_TABLES.get(filepath)
+    if table is None:
+        table = {}
+        for line in textacy.fileio.read_file_lines(filepath=filepath):
+            fields = line.split()
+            if len(fields) < 2:
+                continue  # blank or malformed line
+            # setdefault keeps the first entry of a form, like the linear scan did
+            table.setdefault(fields[1].lower(), fields[0].lower())
+        _LEMMA_TABLES[filepath] = table
+    return table
+
+
+def lemmatizeWord(word,filepath=LEMMAS):
+    """http://www.lexiconista.com/datasets/lemmatization/
+
+    Return the lower-cased lemma of ``word``. Each line of the lemma file is
+    read as ``lemma form``; the file is parsed on the first call per
+    ``filepath`` and served from memory afterwards.
+
+    :param word: word to look up (case-insensitive)
+    :param filepath: path of the lemma list
+    :return: the lemma, or ``word.lower()`` if nothing was found
+    """
+    lowered = word.lower()
+    return _load_lemma_table(filepath).get(lowered, lowered)
+
+def lemmatize() -> str:
+    """Build a mapper that returns ``lemmatizeWord(tok.lower_)`` for a token.
+
+    The mapper receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    to_lemma = lambda token: lemmatizeWord(token.lower_)
+    to_lemma.__annotations__ = inherited
+    return to_lemma
+
+
+#############  return strings
+
+# Precompiled patterns for the string-returning cleaners below (all case-insensitive).
+# "@" followed by 1-15 letters, digits or underscores
+mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
+# local-part "@" domain "." suffix of at least two letters
+emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+# anchored with ^...$, so it only matches when the whole string looks like a URL
+urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+# a dot plus 2-3 letters, optionally followed by a second such group
+topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
+# any single character of the listed punctuation/symbol set
+specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
+# the German sharp s
+hardSFinder = re.compile(r'[ß]', re.IGNORECASE)
+
+
+
+def replaceEmails(replace_with="EMAIL") -> str:
+    """Build a mapper that substitutes ``replace_with`` for every emailFinder match in ``tok.lower_``.
+
+    The mapper receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    substitute = lambda token: emailFinder.sub(replace_with, token.lower_)
+    substitute.__annotations__ = inherited
+    return substitute
+
+def replaceURLs(replace_with="URL") -> str:
+    """Build a mapper that passes ``tok.lower_`` through textacy's ``replace_urls``.
+
+    The mapper receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    substitute = lambda token: textacy.preprocess.replace_urls(token.lower_, replace_with=replace_with)
+    #ret = lambda tok: urlFinder.sub(replace_with,tok.lower_)
+    substitute.__annotations__ = inherited
+    return substitute
+
+def replaceSpecialChars(replace_with=" ") -> str:
+    """Build a mapper that substitutes ``replace_with`` for every specialFinder match in ``tok.lower_``.
+
+    The mapper receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    substitute = lambda token: specialFinder.sub(replace_with, token.lower_)
+    substitute.__annotations__ = inherited
+    return substitute
+
+
+def replaceTwitterMentions(replace_with="TWITTER_MENTION") -> str:
+    """Build a mapper that substitutes ``replace_with`` for every mentionFinder match in ``tok.lower_``.
+
+    The mapper receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    substitute = lambda token: mentionFinder.sub(replace_with, token.lower_)
+    substitute.__annotations__ = inherited
+    return substitute
+
+def replaceNumbers(replace_with="NUMBER") -> str:
+    """Build a mapper that passes ``tok.lower_`` through textacy's ``replace_numbers``.
+
+    The mapper receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    substitute = lambda token: textacy.preprocess.replace_numbers(token.lower_, replace_with=replace_with)
+    substitute.__annotations__ = inherited
+    return substitute
+
+def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
+    """Build a mapper that passes ``tok.lower_`` through textacy's ``replace_phone_numbers``.
+
+    The mapper receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    substitute = lambda token: textacy.preprocess.replace_phone_numbers(token.lower_, replace_with=replace_with)
+    substitute.__annotations__ = inherited
+    return substitute
+
+def replaceHardS(replace_with="ss") -> str:
+    """Build a mapper that substitutes ``replace_with`` for every hardSFinder match in ``tok.lower_``.
+
+    The mapper receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    substitute = lambda token: hardSFinder.sub(replace_with, token.lower_)
+    substitute.__annotations__ = inherited
+    return substitute
+
+
+def fixUnicode() -> str:
+    """Build a mapper that passes ``tok.lower_`` through textacy's ``fix_bad_unicode`` with NFC normalization.
+
+    The mapper receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    repair = lambda token: textacy.preprocess.fix_bad_unicode(token.lower_, normalization=u'NFC')
+    repair.__annotations__ = inherited
+    return repair
+
+
+
+
+
+
+
+
+
+
+def resolveAbbreviations():
+    """Placeholder, not implemented yet; currently returns None."""
+    pass #todo
+
+#todo remove words with len < 2 (resolve abbreviations first, especially "tu" and "fh") and with len > 35 or 50 (e.g. "Reiserücktrittskostenversicherung")
+
+#############  return docs
+
+def keepUniqeTokens() -> spacy.tokens.Doc:
+    """Build a doc-level function returning each distinct ``tok.lower_`` once.
+
+    The strings are returned as a list in order of first occurrence, so the
+    result is the same on every run (a plain ``set`` has no stable order).
+    The function receives this factory's annotations.
+    """
+    def unique_lowercase(doc):
+        seen = set()
+        ordered = []
+        for tok in doc:
+            text = tok.lower_
+            if text not in seen:
+                seen.add(text)
+                ordered.append(text)
+        return ordered
+
+    unique_lowercase.__annotations__ = get_calling_function().__annotations__
+    return unique_lowercase
+
+def lower() -> spacy.tokens.Doc:
+    """Build a doc-level function returning the list of ``tok.lower_`` for every token.
+
+    The function receives this factory's annotations.
+    """
+    inherited = get_calling_function().__annotations__
+    lowercase_all = lambda parsed: [token.lower_ for token in parsed]
+    lowercase_all.__annotations__ = inherited
+    return lowercase_all
+
+
+################################################################################################################
--- a/spell.py
+++ b/spell.py
@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+# https://github.com/norvig/pytudes/blob/master/spell.py
+
+"""Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html
+
+Copyright (c) 2007-2016 Peter Norvig
+MIT license: www.opensource.org/licenses/mit-license.php
+"""
+
+################ Spelling Corrector
+
+import re
+from collections import Counter
+import spacy
+import textacy
+def words(text):
+    "All runs of word characters in `text`, lower-cased."
+    lowered = text.lower()
+    return re.findall(r'\w+', lowered)
+
+
+
+
+# Word frequencies of the reference text; the context manager closes the file
+# handle as soon as it has been read.
+with open('bigo.txt') as corpus_file:
+    WORDS = Counter(words(corpus_file.read()))
+x=0
+def P(word, N=sum(WORDS.values())):
+    "Relative frequency of `word`: its count in WORDS divided by N."
+    occurrences = WORDS[word]
+    return occurrences / N
+
+
+def correction(word):
+    "The candidate for `word` with the highest P."
+    options = candidates(word)
+    return max(options, key=P)
+
+
+def candidates(word):
+    "Known words closest to `word`: itself, else one edit away, else two edits away, else [word]."
+    exact = known([word])
+    if exact:
+        return exact
+    one_edit = known(edits1(word))
+    if one_edit:
+        return one_edit
+    two_edits = known(edits2(word))
+    if two_edits:
+        return two_edits
+    return [word]
+
+
+def known(words):
+    "The set of those `words` that are keys of WORDS."
+    return {candidate for candidate in words if candidate in WORDS}
+
+
+def edits1(word):
+    "Set of strings reachable from `word` by one delete, transpose, replace or insert (letters a-z)."
+    alphabet = 'abcdefghijklmnopqrstuvwxyz'
+    found = set()
+    for cut in range(len(word) + 1):
+        head, tail = word[:cut], word[cut:]
+        for ch in alphabet:
+            found.add(head + ch + tail)              # insert
+        if not tail:
+            continue
+        found.add(head + tail[1:])                   # delete
+        for ch in alphabet:
+            found.add(head + ch + tail[1:])          # replace
+        if len(tail) > 1:
+            found.add(head + tail[1] + tail[0] + tail[2:])   # transpose
+    return found
+
+
+def edits2(word):
+    "Lazily produced strings that are two edits1 steps away from `word`."
+    first_round = edits1(word)
+    return (twice for once in first_round for twice in edits1(once))
+
--- a/test.py
+++ b/test.py
@ -196,12 +196,14 @@ def processTokens(tokens, funclist, parser):
    # in:tokenlist, funclist
    # out: tokenlist
    for f in funclist:
+        # idee: funclist sortieren,s.d. erst alle string-methoden ausgeführt werden, dann wird geparesed, dann wird auf tokens gearbeitet, dann evtl. auf dem ganzen Doc
+
        if 'bool' in str(f.__annotations__):
            tokens = list(filter(f, tokens))

        elif 'str' in str(f.__annotations__):
            tokens = list(map(f, tokens))  # purer text
-            doc = parser(" ".join(tokens))  # geparsed
+            doc = parser(" ".join(tokens))  # neu parsen
            tokens = [tok for tok in doc]  # nur tokens

        elif 'spacy.tokens.doc.Doc' in str(f.__annotations__):
@ -320,6 +322,14 @@ def remove_words_containing_specialCharacters() -> bool:
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

+
+def remove_words_containing_topLVL() -> bool:
+    ret = lambda tok: not bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', tok.lower_))
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+
 def lemmatizeWord(word,filepath=LEMMAS):
    """http://www.lexiconista.com/datasets/lemmatization/"""
    for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
@ -334,58 +344,16 @@ def lemmatize() -> str:
    return ret


-
-
-
-
-def normalizeSynonyms(default_return_first_Syn=False) -> str:
-    ret = lambda tok : getFirstSynonym(tok.lower_, default_return_first_Syn=default_return_first_Syn)
-
-    ret.__annotations__ = get_calling_function().__annotations__
-    return ret
-
-def getFirstSynonym(word, thesaurus=THESAURUS, default_return_first_Syn=False):
-    if not isinstance(word, str):
-        return str(word)
-
-    word = word.lower()
-
-    # durch den thesaurrus iterieren
-    for syn_block in thesaurus:  # syn_block ist eine liste mit Synonymen
-
-        for syn in syn_block:
-            syn = syn.lower()
-            if re.match(r'\A[\w-]+\Z', syn):  # falls syn einzelwort ist
-                if word == syn:
-                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
-            else:  # falls es ein satz ist
-                if word in syn:
-                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
-    return str(word)  # zur Not, das ursrpüngliche Wort zurückgeben
-
-def getHauptform(syn_block, word, default_return_first_Syn=False):
-    for syn in syn_block:
-        syn = syn.lower()
-
-        if "hauptform" in syn and len(syn.split(" ")) <= 2:
-            # nicht ausgeben, falls es in Klammern steht#todo gibts macnmal?? klammern aus
-            for w in syn.split(" "):
-                if not re.match(r'\([^)]+\)', w):
-                    return w
-
-    if default_return_first_Syn:
-        # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
-        for w in syn_block:
-            if not re.match(r'\([^)]+\)', w):
-                return w
-    return word  # zur Not, das ursrpüngliche Wort zurückgeben
-
-
 #############  return strings

 mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
 emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
 urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
+specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
+hardSFinder = re.compile(r'[ß]', re.IGNORECASE)
+
+

 def replaceEmails(replace_with="EMAIL") -> str:
    ret = lambda tok : emailFinder.sub(replace_with, tok.lower_)
@ -400,6 +368,13 @@ def replaceURLs(replace_with="URL") -> str:
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

+def replaceSpecialChars(replace_with=" ") -> str:
+    ret = lambda tok: specialFinder.sub(replace_with,tok.lower_)
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+
 def replaceTwitterMentions(replace_with="TWITTER_MENTION") -> str:
    ret = lambda tok : mentionFinder.sub(replace_with,tok.lower_)

@ -418,7 +393,11 @@ def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

+def replaceHardS(replace_with="ss") -> str:
+    ret = lambda tok: hardSFinder.sub(replace_with,tok.lower_)

+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret


 def fixUnicode() -> str:
@ -428,11 +407,20 @@ def fixUnicode() -> str:
    return ret


+
+
+
+
+
+
+
+
 def resolveAbbreviations():
    pass #todo

+#todo wörter mit len < 2 entfernen( vorher abkürzungen (v.a. tu und fh) auflösen) und > 35 oder 50 ("Reiserücktrittskostenversicherung)

-#############  return docs #todo das stimmt nicht so ganz, da kommt kein doc raus, sondern n tokenset
+#############  return docs

 def keepUniqeTokens() -> spacy.tokens.Doc:
    ret = lambda doc: (set([tok.lower_ for tok in doc]))
@ -462,7 +450,6 @@ printlog("CSV: {0}".format(path2csv))
 ticketcorpus = textacy.Corpus(DE_PARSER)


-#idee ß zu ss ändern? prinzipiell?

 """
 vllt kategorien in unterkategorien aufteilen 
@ -503,20 +490,26 @@ clean_in_meta = {
 printlog("Start Preprocessing")

 clean_in_content=[
+    replaceHardS(),
+    replaceSpecialChars(),

-    removePOS(["SPACE","PUNCT","NUM"]),
+    remove_words_containing_topLVL(),
    remove_words_containing_Numbers(),
    remove_words_containing_specialCharacters(),

+    #removePOS(["SPACE","PUNCT","NUM"]),
+    #removeENT("PERSON"),
+
+    #keepPOS(["NOUN"]),
+
+
    #replaceURLs(),
    #replaceEmails(),
    #fixUnicode(),

-    #lemmatize(),
-    #removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),
+    lemmatize(),
+    removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),

-    #removeENT("PERSON"),
-    #keepPOS(["NOUN"]),
    #keepUniqeTokens(),
    #keepENT(config.get("preprocessing","ents2keep"))

--- a/testo.py
+++ b/testo.py
@ -0,0 +1,419 @@
+# -*- coding: utf-8 -*-
+
+
+import time
+start = time.time()
+
+import logging
+
+import csv
+import functools
+import os.path
+import re
+import subprocess
+import time
+import xml.etree.ElementTree as ET
+import sys
+import spacy
+import textacy
+from scipy import *
+from textacy import Vectorizer
+import warnings
+import configparser as ConfigParser
+import sys
+
+
+csv.field_size_limit(sys.maxsize)
+
+
+
+
+# Load the configuration file
+# NOTE(review): absolute, user-specific path; the script fails wherever this file does not exist
+config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
+
+config = ConfigParser.ConfigParser()
+with open(config_ini) as f:
+    config.read_file(f)
+
+
+
+
+# config logging
+logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
+
+
+
+# thesaurus rows, read completely into memory from a ";"-delimited file
+thesauruspath = config.get("filepath","thesauruspath")
+THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
+
+
+DE_PARSER = spacy.load("de") #todo language detection; idea: separate corpora for different languages
+# STOP_WORDS of the spacy language module named after the parser's language
+de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
+
+
+# path of the lemma list used by lemmatizeWord
+LEMMAS=config.get("filepath","lemmas")
+
+# lines of vornamen.txt, used by remove_first_names
+VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))
+
+
+# character class of punctuation/symbols treated as "special characters"
+regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
+# a dot plus 2-3 lower-case letters, optionally followed by a second such group
+regex_topLvl = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
+
+
+# precompiled, case-insensitive patterns
+mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
+emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+# anchored with ^...$, so it only matches when the whole string looks like a URL
+urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
+specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
+hardSFinder = re.compile(r'[ß]', re.IGNORECASE)
+
+
+def printlog(string, level="INFO"):
+    """Print ``string`` and forward it to the ``logging`` module.
+
+    :param string: message to print and log
+    :param level: "INFO", "DEBUG" or "WARNING"; with any other value the
+                  message is printed but not logged
+    """
+    print(string)
+    dispatch = (
+        ("INFO", logging.info),
+        ("DEBUG", logging.debug),
+        ("WARNING", logging.warning),
+    )
+    for level_name, emit in dispatch:
+        if level == level_name:
+            emit(string)
+            break
+printlog("Load functions")
+
+def compose(*functions):
+    """Chain ``functions`` right-to-left: ``compose(f, g)(x) == f(g(x))``.
+
+    Called without arguments it returns the identity function.
+    """
+    def composed(x):
+        # the last function given is applied first
+        for fn in reversed(functions):
+            x = fn(x)
+        return x
+    return composed
+
+def get_calling_function():
+    """finds the calling function in many decent cases.
+    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
+
+    Looks at the frame of whoever called this helper and tries several
+    lookups for a function object whose ``__code__`` is the code object
+    executing in that frame.
+
+    :return: the function object found for the caller
+    :raises AttributeError: if no lookup yields a matching function
+    """
+    fr = sys._getframe(1)   # inspect.stack()[1][0]
+    co = fr.f_code  # code object running in the caller's frame
+    # lookups are tried in order; a KeyError/AttributeError moves on to the next one
+    for get in (
+        lambda:fr.f_globals[co.co_name],
+        lambda:getattr(fr.f_locals['self'], co.co_name),
+        lambda:getattr(fr.f_locals['cls'], co.co_name),
+        lambda:fr.f_back.f_locals[co.co_name], # nested
+        lambda:fr.f_back.f_locals['func'],  # decorators
+        lambda:fr.f_back.f_locals['meth'],
+        lambda:fr.f_back.f_locals['f'],
+        ):
+        try:
+            func = get()
+        except (KeyError, AttributeError):
+            pass
+        else:
+            # accept the candidate only if it really is the code being executed
+            if func.__code__ == co:
+                return func
+    raise AttributeError("func not found")
+
+
+def printRandomDoc(textacyCorpus):
+    """Print and log the size of ``textacyCorpus`` and one random document.
+
+    :param textacyCorpus: indexable corpus whose items expose ``text`` and
+                          ``metadata``
+
+    For an empty corpus only the size is reported.
+    """
+    import random
+    print()
+
+    corpus_size = len(textacyCorpus)
+    printlog("len(textacyCorpus) = %i" % corpus_size)
+    if corpus_size:
+        # randrange(n) draws from 0..n-1, so the last document can be picked too
+        randIndex = random.randrange(corpus_size)
+        doc = textacyCorpus[randIndex]
+        printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, doc.text, doc.metadata))
+
+    print()
+
+
+
+def csv_to_contentStream(path2csv: str, content_collumn_name: str):
+    """
+    Yield one cell per data row of a ";"-delimited CSV file.
+
+    The first row is read as the header. If several header cells equal
+    ``content_collumn_name`` the last one wins; if none does, column 0 is used.
+
+    :param path2csv: string
+    :param content_collumn_name: string
+    :return: string-generator
+    """
+    rows = iter(textacy.fileio.read_csv(path2csv, delimiter=";"))  # ,encoding='utf8')
+    header = next(rows, None)
+    if header is None:
+        return
+
+    wanted_index = 0  # default value
+    for position, title in enumerate(header):
+        if title == content_collumn_name:
+            wanted_index = position
+
+    for row in rows:
+        yield row[wanted_index]
+
+
+
+#############  return bool
+
+def keepPOS(pos_list):
+    """Return a predicate that is true for tokens whose ``pos_`` is in ``pos_list``."""
+    def has_wanted_pos(tok):
+        return tok.pos_ in pos_list
+    return has_wanted_pos
+
+
+def removePOS(pos_list):
+    """Return a predicate that is false for tokens whose ``pos_`` is in ``pos_list``."""
+    def lacks_unwanted_pos(tok):
+        return tok.pos_ not in pos_list
+    return lacks_unwanted_pos
+
+
+def removeWords(words, keep=None):
+    """Return a predicate that is false for tokens whose ``lower_`` is a blocked word.
+
+    :param words: words to remove; copied, the caller's collection is not modified
+    :param keep: optional iterable of words that must not be removed even
+                 if they occur in ``words``
+    """
+    # a private set keeps the caller's list intact and makes the lookup per token cheap
+    blocked = set(words)
+    if hasattr(keep, '__iter__'):
+        blocked.difference_update(keep)
+
+    return lambda tok :  tok.lower_ not in blocked
+
+
+def keepENT(ent_list):
+    """Return a predicate that is true for tokens whose ``ent_type_`` is in ``ent_list``."""
+    def has_wanted_entity(tok):
+        return tok.ent_type_ in ent_list
+    return has_wanted_entity
+
+
+def removeENT(ent_list):
+    """Return a predicate that is false for tokens whose ``ent_type_`` is in ``ent_list``."""
+    def lacks_unwanted_entity(tok):
+        return tok.ent_type_ not in ent_list
+    return lacks_unwanted_entity
+
+
+def remove_words_containing_Numbers():
+    """Return a predicate that is false for tokens whose ``lower_`` contains a digit."""
+    # raw string: '\d' in a normal literal is an invalid escape sequence
+    return lambda tok: not bool(re.search(r'\d', tok.lower_))
+
+
+def remove_words_containing_specialCharacters():
+    """Return a predicate that is false for tokens whose ``lower_`` matches ``regex_specialChars``."""
+    def is_plain(tok):
+        return re.search(regex_specialChars, tok.lower_) is None
+    return is_plain
+
+
+def remove_long_words(max_length=35):
+    """Return a predicate that is false for tokens longer than ``max_length`` characters.
+
+    :param max_length: longest ``lower_`` length that is still kept
+    """
+    return lambda tok: not len(tok.lower_) > max_length
+
+def remove_short_words(min_length=2):
+    """Return a predicate that is false for tokens shorter than ``min_length`` characters.
+
+    :param min_length: shortest ``lower_`` length that is still kept
+    """
+    return lambda tok: not len(tok.lower_) < min_length
+
+def remove_first_names():
+    """Return a predicate that is false for tokens whose ``lower_`` equals a lower-cased entry of VORNAMEN."""
+    # lower-case the name list once per factory call instead of once per token
+    first_names = {name.lower() for name in VORNAMEN}
+    return lambda tok: tok.lower_ not in first_names
+
+############# strings
+
+def stringcleaning(stringstream, funclist):
+    """Lazily yield every string of ``stringstream`` after piping it through ``funclist`` in order."""
+    for raw in stringstream:
+        yield functools.reduce(lambda text, step: step(text), funclist, raw)
+
+
+def remove_words_containing_topLVL():
+    """Return a string cleaner dropping every whitespace-separated word that matches ``regex_topLvl``.
+
+    The remaining words are lower-cased and joined with single spaces.
+    """
+    def drop_domain_words(string):
+        survivors = []
+        for word in string.split():
+            if not re.search(regex_topLvl, word):
+                survivors.append(word.lower())
+        return " ".join(survivors)
+    return drop_domain_words
+
+
+def replaceSpecialChars(replace_with=" "):
+    """Return a string cleaner that lower-cases its input and substitutes ``replace_with`` for every ``regex_specialChars`` match."""
+    def substitute(string):
+        lowered = string.lower()
+        return re.sub(regex_specialChars, replace_with, lowered)
+    return substitute
+
+
+def replaceNumbers(replace_with="NUMBER"):
+    """Return a string cleaner that lower-cases its input and passes it through textacy's ``replace_numbers``."""
+    def substitute(string):
+        lowered = string.lower()
+        return textacy.preprocess.replace_numbers(lowered, replace_with=replace_with)
+    return substitute
+
+
+def replacePhonenumbers(replace_with="PHONENUMBER"):
+    """Return a string cleaner that lower-cases its input and passes it through textacy's ``replace_phone_numbers``."""
+    def substitute(string):
+        lowered = string.lower()
+        return textacy.preprocess.replace_phone_numbers(lowered, replace_with=replace_with)
+    return substitute
+
+
+def replaceHardS(replace_with="ss"):
+    """Return a string cleaner that lower-cases its input and substitutes ``replace_with`` for every "ß"."""
+    def substitute(string):
+        lowered = string.lower()
+        return re.sub(r'[ß]', replace_with, lowered)
+    return substitute
+
+
+def fixUnicode():
+    """Return a string cleaner that lower-cases its input and passes it through textacy's ``fix_bad_unicode`` with NFC normalization."""
+    def repair(string):
+        lowered = string.lower()
+        return textacy.preprocess.fix_bad_unicode(lowered, normalization=u'NFC')
+    return repair
+
+
+_LEMMA_TABLES = {}  # filepath -> {inflected form: lemma}, filled on first use
+
+
+def _load_lemma_table(filepath):
+    """Read the lemma file once and return its form -> lemma mapping."""
+    table = _LEMMA_TABLES.get(filepath)
+    if table is None:
+        table = {}
+        for line in textacy.fileio.read_file_lines(filepath=filepath):
+            fields = line.split()
+            if len(fields) < 2:
+                continue  # blank or malformed line
+            # setdefault keeps the first entry of a form, like the linear scan did
+            table.setdefault(fields[1].lower(), fields[0].lower())
+        _LEMMA_TABLES[filepath] = table
+    return table
+
+
+def lemmatizeWord(word,filepath=LEMMAS):
+    """http://www.lexiconista.com/datasets/lemmatization/
+
+    Return the lower-cased lemma of ``word``. Each line of the lemma file is
+    read as ``lemma form``; the file is parsed on the first call per
+    ``filepath`` and served from memory afterwards.
+
+    :param word: word to look up (case-insensitive)
+    :param filepath: path of the lemma list
+    :return: the lemma, or ``word.lower()`` if nothing was found
+    """
+    lowered = word.lower()
+    return _load_lemma_table(filepath).get(lowered, lowered)
+
+
+def lemmatize():
+    """Return a function mapping a token to ``lemmatizeWord(tok.lower_)``."""
+    #todo https://alpha.spacy.io/docs/usage/adding-languages#lemmatizer
+    def to_lemma(tok):
+        return lemmatizeWord(tok.lower_)
+    return to_lemma
+
+
+def processTextstream(textstream, string_funclist, tok_funclist,parser=DE_PARSER):
+    """
+    Clean the raw strings with stringcleaning, parse them with
+    ``parser.pipe``, filter the tokens with processTokens and yield the
+    surviving tokens as one lower-cased, space-joined string per text.
+
+    :param textstream: string-gen
+    :param string_funclist: [func] applied to each raw string before parsing
+    :param tok_funclist: [func] applied to the parsed tokens
+    :param parser: spacy-parser
+    :return: string-gen
+    """
+    cleaned_strings = stringcleaning(textstream, string_funclist)
+
+    for parsed in parser.pipe(cleaned_strings):
+        kept = processTokens(list(parsed), tok_funclist, parser)
+        yield " ".join(tok.lower_ for tok in kept)
+
+
+def processTokens(tokens, funclist, parser):
+    """Filter ``tokens`` through every predicate of ``funclist`` in order.
+
+    :param tokens: token list
+    :param funclist: predicates; a token survives only if all accept it
+    :param parser: not used by this function
+    :return: token list
+    """
+    remaining = tokens
+    for predicate in funclist:
+        remaining = list(filter(predicate, remaining))
+    return remaining
+
+
+# cleaners applied to each raw string before parsing, in this order
+string_comp=[
+    replaceHardS(),
+    remove_words_containing_topLVL(),
+    replaceSpecialChars(),
+]
+
+# token predicates applied after parsing, in this order; a token is kept only
+# if every predicate accepts it
+tok_comp=[
+    removeENT(["PERSON"]),
+    remove_words_containing_Numbers(),
+    #keepPOS(["NOUN"]),
+    removePOS(["PUNCT","SPACE","NUM"]),
+    removeWords(de_stop_words),
+
+    # length filters
+    remove_long_words(),
+    remove_short_words(),
+
+    remove_first_names()
+]
+
+
+
+"""
+pipe=[
+
+    ##String
+    
+    fixUnicode(),
+    replaceHardS(),
+    resolveAbbrivations(),
+    
+    remove_words_containing_topLVL(),
+    
+    replaceSpecialChars(" "), (mit Leerzeichen erstzen, dadruch werden Terme wie 8203;verfügung getrennt
+    
+    remove_words_containing_Numbers(),
+
+
+
+    ##spacyParse
+    
+    removeENT("PERSON"),
+    keepPOS(["NOUN"]),
+
+    #ODER
+        
+    lemmatize(),
+    removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),
+
+
+    # evtl.
+    spellCorrection(),
+    keepUniqeTokens(),
+
+]
+
+
+
+
+"""
+
+
+
+
+# CSV export to read; the path is relative to the working directory
+path2csv = "M42-Export/Tickets_med.csv"
+
+ticketcorpus = textacy.Corpus(DE_PARSER)
+
+
+## add files to textacy-corpus,
+printlog("add texts to textacy-corpus")
+# the "Description" column is cleaned with string_comp/tok_comp before being added
+ticketcorpus.add_texts(
+    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp,tok_funclist=tok_comp)
+)
+
+# print ten randomly chosen documents as a spot check
+for i in range(10):
+    printRandomDoc(ticketcorpus)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# sample strings for trying out the string cleaners
+words = [
+    "uniaccount",
+    "nr54065467",
+    "nr54065467",
+    "455a33c5",
+    "tvt?=",
+    "tanja.saborowski@tu-dortmund.de",
+    "-",
+    "m-sw1-vl4053.itmc.tu-dortmund.de",
+    "------problem--------"
+]
+
+topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
+specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
+
+# stringcleaning takes a stream of strings and yields lazily, so the whole list
+# is passed once and each cleaned string is printed (not the generator object)
+for w, cleaned in zip(words, stringcleaning(words, string_comp)):
+    print(cleaned)
+    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
+    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
+    #result = specialFinder.sub(" ", w)
+    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
+
+    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
+
+"""
+
+spracherkennung
+alles nach grüße ist irrelevant außer PS:
+
+vllt kategorien in unterkategorien aufteilen 
+
+allg: 
+utf-korregieren, 
+
+emails, urls, nummern raus 
+vllt sogar alles, was ebend jenes enthält (oder auf .toplvldomain bzw. sonderzeichen enthält oder alles was ein @ enthält
+
+sinnvoller wörter von müll trennen: 8203;verfügung -> bei sonderzeichen wörter trennen
+
+abkürzungen raus: m.a, o.ä.     
+
+wörter korrigieren
+
+sinnlose bsp: nr54065467  455a33c5   tvt?=      ------problem--------
+
+"""
+
+
+
+
+
--- a/vornamen.txt
+++ b/vornamen.txt