text cleaning is still no fun
parent 20d9eed5b3
commit a380b57bfc
test.py (2 lines changed)
@@ -456,7 +456,7 @@ maybe split the categories into subcategories
general:
fix the UTF errors, split words at special characters
-remove names
+remove names, addresses after greetings

remove e-mails, URLs, numbers
maybe even remove everything that contains one of those (or ends in a .topleveldomain, contains special characters, or contains an @)
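The e-mail/URL/number step maps fairly directly onto the older textacy.preprocess helpers that testo.py below already uses (replace_phone_numbers, fix_bad_unicode); a minimal sketch of such a cleaner, only as an illustration of the idea, not part of this commit:

import textacy

def remove_contact_data():
    # replace e-mails, URLs, phone numbers and remaining number tokens with nothing
    def clean(string):
        s = textacy.preprocess.replace_emails(string.lower(), replace_with="")
        s = textacy.preprocess.replace_urls(s, replace_with="")
        s = textacy.preprocess.replace_phone_numbers(s, replace_with="")
        s = textacy.preprocess.replace_numbers(s, replace_with="")
        return s
    return clean

# could then be appended to string_comp like the other string-level cleaners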
testo.py (85 lines changed)
@@ -187,6 +187,13 @@ def remove_short_words():

def remove_first_names():
    return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]

# if a word is not in the vocab, first check whether a known subword is contained in it;
# if so, check whether there is junk before or after it and split it off if necessary
# (a rough sketch of this idea follows after this hunk)

############# strings


def stringcleaning(stringstream, funclist):
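The splitting idea in the comment above could look roughly like this; vocab and the helper name are hypothetical placeholders, not part of this commit:

import re

def split_unknown_word(word, vocab):
    # hypothetical: if `word` is unknown, look for a known subword inside it
    # and drop the junk before/after it
    if word in vocab:
        return [word]
    for known in vocab:
        m = re.search(re.escape(known), word)
        if m:
            before, after = word[:m.start()], word[m.end():]
            return [p for p in (before, known, after) if p and (p in vocab or p == known)]
    return [word]

# e.g. split_unknown_word("nr54065467problem", {"problem"}) -> ["problem"]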
@@ -196,6 +203,10 @@ def stringcleaning(stringstream, funclist):
        yield string


def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))


def remove_words_containing_topLVL():
    return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w)])
@@ -212,10 +223,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER"):
    return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)


-def replaceHardS(replace_with="ss"):
+def replaceSharpS(replace_with="ss"):
    return lambda string: re.sub(r'[ß]',replace_with,string.lower())


def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))


def fixUnicode():
    return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
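For reference, these replacers fold the German special characters into plain ASCII digraphs; a quick check of what they produce, assuming the definitions above:

print(replaceSharpS()("Straße"))         # -> strasse
print(replaceRockDots()("Grüße, Köln"))  # -> gruesse, koeln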
@@ -230,28 +246,36 @@ def lemmatizeWord(word,filepath=LEMMAS):

def lemmatize():
    #todo https://alpha.spacy.io/docs/usage/adding-languages#lemmatizer
-    return lambda tok: lemmatizeWord(tok.lower_)
+    return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])


-def processTextstream(textstream, string_funclist, tok_funclist,parser=DE_PARSER):
+def processTextstream(textstream, string_funclist, tok_funclist, parser=DE_PARSER, single_doc_func=None):
    """
    :param textstream: string-gen
    :param funclist: [func]
    :param parser: spacy-parser
    :return: string-gen
    """
    # first the string-level functions
    pipe = parser.pipe(stringcleaning(textstream, string_funclist))

    tokens = []
    for doc in pipe:

        tokens = [tok for tok in doc]

-        tokens = processTokens(tokens,tok_funclist,parser)
+        # then the token-level ones
+        tokens = processTokens(tokens,tok_funclist)

-        yield " ".join([tok.lower_ for tok in tokens])
+        if single_doc_func is not None:
+            yield single_doc_func(parser(" ".join([tok.lower_ for tok in tokens])))
+        else:
+            yield " ".join([tok.lower_ for tok in tokens])


-def processTokens(tokens, funclist, parser):
+def processTokens(tokens, funclist):
    # in: tokenlist, funclist
    # out: tokenlist
    for f in funclist:
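With the new single_doc_func hook the filtered tokens are joined back into a string, re-parsed, and the resulting doc is handed to the hook in one piece, so lemmatize() now sees whole documents instead of single tokens. A rough usage sketch (the sample input is made up):

cleaned = processTextstream(
    iter(["Sehr geehrte Damen und Herren, mein Uniaccount geht nicht."]),
    string_funclist=string_comp,
    tok_funclist=tok_comp,
    single_doc_func=lemmatize(),  # gets the re-parsed spaCy doc, returns a string
)
for text in cleaned:
    print(text)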
@@ -261,25 +285,33 @@ def processTokens(tokens, funclist, parser):
    return tokens




string_comp = [
-    replaceHardS(),
    fixUnicode(),
    replaceRockDots(),
    remove_words_containing_topLVL(),
    replaceSpecialChars(),
    seperate_words_on_regex()
]

tok_comp = [
-    removeENT(["PERSON"]),
+    #removeENT(["PERSON"]),
    remove_words_containing_Numbers(),
    #keepPOS(["NOUN"]),
    removePOS(["PUNCT","SPACE","NUM"]),
    removeWords(de_stop_words),

    remove_long_words(),
    remove_short_words(),

-    remove_first_names()
+    remove_first_names(),
+
+    #keepPOS(["NOUN"]),
]

single_doc_func = lemmatize()




"""
@@ -332,7 +364,7 @@ ticketcorpus = textacy.Corpus(DE_PARSER)

## add files to textacy-corpus,
printlog("add texts to textacy-corpus")
ticketcorpus.add_texts(
-    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp,tok_funclist=tok_comp)
+    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp, tok_funclist=tok_comp, single_doc_func=single_doc_func)
)

for i in range(10):
@@ -362,33 +394,6 @@ for i in range(10):




-words = [
-    "uniaccount",
-    "nr54065467",
-    "nr54065467",
-    "455a33c5,"
-    "tvt?=",
-    "tanja.saborowski@tu-dortmund.de",
-    "-",
-    "m-sw1-vl4053.itmc.tu-dortmund.de",
-    "------problem--------"
-]
-
-topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
-specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
-
-for w in words:
-    print(stringcleaning(w,string_comp))
-    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
-    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
-    #result = specialFinder.sub(" ", w)
-    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
-
-    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
-
-"""
@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
import corenlp as corenlp
import os
import re
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser


filepath = "lemmatization-de.txt"
blob = TextBlobDE(str(textacy.fileio.read_file("teststring.txt")), parser=PatternParser(pprint=True, lemmata=True))

print(blob.parse())
# first column to {lemma : id}, second column to {word : id}


"""http://www.lexiconista.com/datasets/lemmatization/"""

lemma2id = {}
word2id = {}

for id, line in enumerate(list(textacy.fileio.read_file_lines(filepath=filepath))):

    lemma = line.split()[0].strip().lower()
    if lemma not in lemma2id:
        lemma2id[lemma] = id

    word = line.split()[1].strip().lower()

    # map the inflected form to the id of its lemma
    word2id[word] = lemma2id[lemma]
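One way to sanity-check the two dicts is to invert lemma2id and look a word's lemma back up; this is only a sketch around the code above, not part of the original file:

id2lemma = {i: lemma for lemma, i in lemma2id.items()}

def lemma_of(word):
    # fall back to the word itself if it is not in the list
    return id2lemma.get(word2id.get(word.lower(), -1), word)

print(lemma_of("ging"))  # expected: "gehen", if that pair is in lemmatization-de.txt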
"""
|
||||
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
|
||||
|
||||
|
||||
def stringcleaning(stringstream, funclist):
|
||||
for string in stringstream:
|
||||
for f in funclist:
|
||||
string = f(string)
|
||||
yield string
|
||||
|
||||
|
||||
def seperate_words_on_regex(regex=regex_specialChars):
|
||||
return lambda string: " ".join(re.compile(regex).split(string))
|
||||
|
||||
|
||||
words = [
|
||||
"uniaccount",
|
||||
"nr54065467",
|
||||
"nr54065467",
|
||||
"455a33c5,"
|
||||
"tvt?=",
|
||||
"tanja.saborowski@tu-dortmund.de",
|
||||
"-",
|
||||
"m-sw1-vl4053.itmc.tu-dortmund.de",
|
||||
"------problem--------"
|
||||
]
|
||||
|
||||
|
||||
|
||||
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
|
||||
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
|
||||
|
||||
for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
|
||||
print(s.strip())
|
||||
|
||||
#print(stringcleaning(w,string_comp))
|
||||
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
|
||||
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
|
||||
#result = specialFinder.sub(" ", w)
|
||||
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
|
||||
|
||||
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
|
||||
"""
|