textcleaning is still no fun

jannis.grundmann 2017-09-19 14:42:38 +02:00
parent 20d9eed5b3
commit a380b57bfc
3 changed files with 139 additions and 41 deletions

View File

@@ -456,7 +456,7 @@ maybe split categories into subcategories
 general:
 fix utf encoding, split words at special characters
-remove names
+remove names, addresses after greetings
 remove emails, urls, numbers
 maybe even remove everything that contains any of those (or ends in a .toplvldomain, contains special characters, or anything that contains an @)
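A rough sketch of the word-level filtering these notes describe (hypothetical helper, not part of this commit; the top-level-domain regex is the same one used later in the script):

import re

regex_topLvl = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'

def drop_suspicious_words(text):
    # drop every word that contains an "@", a top-level domain, or a digit
    keep = []
    for w in text.split():
        if "@" in w:
            continue
        if re.search(regex_topLvl, w, re.IGNORECASE):
            continue
        if any(ch.isdigit() for ch in w):
            continue
        keep.append(w)
    return " ".join(keep)

print(drop_suspicious_words("bitte an tanja.saborowski@tu-dortmund.de unter nr54065467 melden"))
# -> "bitte an unter melden"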

View File

@@ -187,6 +187,13 @@ def remove_short_words():
 def remove_first_names():
     return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]

+# if a word is not in the vocab: first check whether a known subword is inside it;
+# if so, check whether junk comes before or after it and split the word if necessary

 ############# strings
 def stringcleaning(stringstream, funclist):
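A quick sketch of the splitting idea in the new comment above (hypothetical; assumes a vocab set of known words is available):

def split_on_known_subword(token, vocab):
    # if the token itself is known, keep it as is
    if token in vocab:
        return [token]
    # otherwise scan for a known subword (length >= 4) and drop the junk around it
    for start in range(len(token)):
        for end in range(len(token), start + 3, -1):
            if token[start:end] in vocab:
                return [token[start:end]]
    return [token]

vocab = {"problem", "drucker"}
print(split_on_known_subword("------problem--------", vocab))   # -> ['problem']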
@@ -196,6 +203,10 @@ def stringcleaning(stringstream, funclist):
         yield string

+def seperate_words_on_regex(regex=regex_specialChars):
+    return lambda string: " ".join(re.compile(regex).split(string))

 def remove_words_containing_topLVL():
     return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w) ])
@@ -212,10 +223,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER"):
     return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)

-def replaceHardS(replace_with="ss"):
+def replaceSharpS(replace_with="ss"):
     return lambda string: re.sub(r'[ß]',replace_with,string.lower())

+def replaceRockDots():
+    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))

 def fixUnicode():
     return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
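Illustrative spot-check of the renamed and newly added helpers (not part of the commit):

print(replaceSharpS()("Straße"))             # -> "strasse"
print(replaceRockDots()("Grüße aus Köln"))   # -> "gruesse aus koeln"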
@@ -230,28 +246,36 @@ def lemmatizeWord(word,filepath=LEMMAS):
 def lemmatize():
     #todo https://alpha.spacy.io/docs/usage/adding-languages#lemmatizer
-    return lambda tok: lemmatizeWord(tok.lower_)
+    return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])

-def processTextstream(textstream, string_funclist, tok_funclist,parser=DE_PARSER):
+def processTextstream(textstream, string_funclist, tok_funclist, parser=DE_PARSER, single_doc_func=None):
     """
     :param textstream: string-gen
     :param funclist: [func]
     :param parser: spacy-parser
     :return: string-gen
     """
+    # first the string-level methods
     pipe = parser.pipe(stringcleaning(textstream,string_funclist))
+    tokens=[]
     for doc in pipe:
         tokens = [tok for tok in doc]
-        tokens = processTokens(tokens,tok_funclist,parser)
+        # then the token-level ones
+        tokens = processTokens(tokens,tok_funclist)
-        yield " ".join([tok.lower_ for tok in tokens])
+        if single_doc_func is not None:
+            yield single_doc_func(parser(" ".join([tok.lower_ for tok in tokens])))
+        else:
+            yield " ".join([tok.lower_ for tok in tokens])

-def processTokens(tokens, funclist, parser):
+def processTokens(tokens, funclist):
     # in:tokenlist, funclist
     # out: tokenlist
     for f in funclist:
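A minimal usage sketch of the reworked pipeline (illustrative only; DE_PARSER, string_comp, tok_comp and lemmatize() are the ones defined in this script):

texts = ["Grüße, der Drucker m-sw1-vl4053.itmc.tu-dortmund.de druckt nichts mehr."]

for cleaned in processTextstream(texts,
                                 string_funclist=string_comp,
                                 tok_funclist=tok_comp,
                                 single_doc_func=lemmatize()):
    print(cleaned)   # one cleaned (and lemmatized) string per input text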
@@ -261,25 +285,33 @@ def processTokens(tokens, funclist, parser):
     return tokens

 string_comp=[
-    replaceHardS(),
+    fixUnicode(),
+    replaceRockDots(),
     remove_words_containing_topLVL(),
-    replaceSpecialChars(),
+    seperate_words_on_regex()
 ]

 tok_comp=[
-    removeENT(["PERSON"]),
+    #removeENT(["PERSON"]),
     remove_words_containing_Numbers(),
-    #keepPOS(["NOUN"]),
     removePOS(["PUNCT","SPACE","NUM"]),
     removeWords(de_stop_words),
     remove_long_words(),
     remove_short_words(),
-    remove_first_names()
+    remove_first_names(),
+    #keepPOS(["NOUN"]),
 ]

+single_doc_func = lemmatize()

 """
@@ -332,7 +364,7 @@ ticketcorpus = textacy.Corpus(DE_PARSER)
 ## add files to textacy-corpus,
 printlog("add texts to textacy-corpus")
 ticketcorpus.add_texts(
-    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp,tok_funclist=tok_comp)
+    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp, tok_funclist=tok_comp, single_doc_func=single_doc_func)
 )

 for i in range(10):
@@ -362,33 +394,6 @@ for i in range(10):
-words = [
-    "uniaccount",
-    "nr54065467",
-    "nr54065467",
-    "455a33c5,"
-    "tvt?=",
-    "tanja.saborowski@tu-dortmund.de",
-    "-",
-    "m-sw1-vl4053.itmc.tu-dortmund.de",
-    "------problem--------"
-]
-
-topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
-specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
-
-for w in words:
-    print(stringcleaning(w,string_comp))
-    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
-    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
-    #result = specialFinder.sub(" ", w)
-    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
-    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))

 """

testra.py (new file, 93 additions)
View File

@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
import corenlp as corenlp
import os
import re
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser
filepath = "lemmatization-de.txt"
blob = TextBlobDE(str(textacy.fileio.read_file("teststring.txt")),parser=PatternParser(pprint=True, lemmata=True))
print(blob.parse())
# first column to {lemma : id}, second column to {word : id}
"""http://www.lexiconista.com/datasets/lemmatization/"""

lemma2id = {}
word2id = {}

for id,line in enumerate(list(textacy.fileio.read_file_lines(filepath=filepath))):

    lemma = line.split()[0].strip().lower()
    if lemma not in lemma2id:
        lemma2id[lemma] = id

    word = line.split()[1].strip().lower()
    word2id[word] = lemma2id[lemma]   # map the inflected form to the id of its lemma
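A possible lookup on top of the two dicts (hypothetical, not in the commit): resolve an inflected form to its lemma by going word -> shared id -> lemma.

id2lemma = {i: lemma for lemma, i in lemma2id.items()}

def lemma_of(word):
    word = word.strip().lower()
    # unknown words fall back to themselves
    return id2lemma.get(word2id.get(word, -1), word)

print(lemma_of("ging"))   # -> "gehen", assuming that pair is in lemmatization-de.txt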
"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'

def stringcleaning(stringstream, funclist):
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string

def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))

words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5,"
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------"
]

topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
    print(s.strip())

    #print(stringcleaning(w,string_comp))
    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
    #result = specialFinder.sub(" ", w)
    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""