From 05b4f514d5323426e28bcdbbbef75556abbc2676 Mon Sep 17 00:00:00 2001
From: "jannis.grundmann" <jannis.grundmann@tu-dortmund.de>
Date: Tue, 5 Sep 2017 11:52:39 +0200
Subject: [PATCH] spacy-pipeline / python function-composing versucht

---
 preprocessing.py |   4 +-
 test.py          | 124 +++++++++++++++++++++++++++++++++++++++++++++++
 textCleaning.py  |  57 +++++++++++++++-------
 3 files changed, 166 insertions(+), 19 deletions(-)
 create mode 100644 test.py

diff --git a/preprocessing.py b/preprocessing.py
index f33836a..9fb59fd 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -406,6 +406,7 @@ def printRandomDoc(textacyCorpus):
     print()
 
 ####################'####################'####################'####################'####################'##############
+# todo config-file
 
 import de_core_news_md
 DATAPATH = "ticketSamples.xml"
@@ -421,8 +422,7 @@ custom_words = ["grüßen", "fragen"]
 
 ####################'####################'####################'####################'####################'##############
 
-#todo joar diese pipe halt und vllt ne config-file
-
+#todo https://spacy.io/docs/usage/customizing-pipeline
 
 ## files to textacy-corpus
 textacyCorpus = textacy.Corpus(PARSER)
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..08db3a2
--- /dev/null
+++ b/test.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+import spacy
+import textacy
+from spacy.tokens import Doc
+
+# -*- coding: utf-8 -*-
+import re
+import spacy
+import functools
+
+import textacy
+
+
+class TextCleaner:
+
+    def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
+        """
+        :param parser: spacy-parser
+        :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
+        :param customClass_symbols:[str]
+        :param customClass_words:[str]
+        :param customClassPOS:[str]
+        :param keep4All: [str]
+        """
+        if thesaurus is None:
+            DATAPATH_thesaurus = "openthesaurus.csv"
+
+            ## NOTE: the list() is important - a generator is consumed while running, so the same synonyms would not be returned on later lookups
+            self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
+        else:
+            self.thesaurus = thesaurus
+
+        self.parser = parser
+
+
+
+        self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
+        self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
+        self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+        self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+
+
+
+        # to remove
+        self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||",
+                   ";", ":",
+                   "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
+        self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else [])
+
+
+
+        # to keep
+        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
+        self.pos2keep = ["NOUN"]  # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen  http://aclweb.org/anthology/U15-1013
+
+        self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
+        self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])
+
+
+        keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep
+
+
+        # modify those to remove with those to keep
+        for sym in keep:
+            try:
+                self.symbols.remove(sym)
+            except ValueError:
+                pass
+        for sym in keep:
+            try:
+                self.stop_words.remove(sym)
+            except ValueError:
+                pass
+
+
+    def loadString(self,string):
+        self.currentDoc = self.parser(string)
+
+
+    def removeWhitespace(self, string):
+        return " ".join([tok.text for tok in self.currentDoc if not tok.is_space])
+
+
+    def removePunctuation(self, string, custom_symbols=None, keep=None):
+        symbols = self.symbols + (custom_symbols if custom_symbols is not None else [])
+        if hasattr(keep, '__iter__'):
+            for k in keep:
+                try:
+                    symbols.remove(k)
+                except ValueError:
+                    pass
+
+        return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols])
+
+
+# TODO: function composition does not work as intended yet, see https://mathieularose.com/function-composition-in-python/
+parser = spacy.load('de')
+cleaner = TextCleaner(parser)
+corpus = textacy.Corpus(parser)
+
+
+def compose(self,*functions):
+    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
+
+def composeo(*functions):
+    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
+
+#pipeline = compose(functools.partial(removeWhitespace,lemmatize=True))#, cleaner.normalizeSynonyms)
+
+pipeline = composeo(cleaner.removePunctuation, cleaner.removeWhitespace,  cleaner.loadString)
+
+def pipe1(string):
+    cleaner.loadString(string)
+    string = cleaner.removeWhitespace(string)
+    string = cleaner.removePunctuation(string)
+    return string
+
+
+string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n           Dieses Ticket wird geschlossen \n            \n test"
+print(pipe1(string))
+corpus.add_text(pipeline(string))
+
+print(corpus[0].text)
+
diff --git a/textCleaning.py b/textCleaning.py
index a014728..ef6a819 100644
--- a/textCleaning.py
+++ b/textCleaning.py
@@ -8,14 +8,14 @@ import textacy
 
 class TextCleaner:
 
-    def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None,  keep4Class=None):
+    def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
         """
         :param parser: spacy-parser
         :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
         :param customClass_symbols:[str]
         :param customClass_words:[str]
         :param customClassPOS:[str]
-        :param keep4Class: [str]
+        :param keep4All: [str]
         """
         if thesaurus is None:
             DATAPATH_thesaurus = "openthesaurus.csv"
@@ -48,11 +48,11 @@ class TextCleaner:
         self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
         self.pos2keep = ["NOUN"]  # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen  http://aclweb.org/anthology/U15-1013
 
-        self.entities2keep = self.entities2keep + (keep4Class if keep4Class is not None else [])
-        self.pos2keep = self.pos2keep + (keep4Class if keep4Class is not None else [])
+        self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
+        self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])
 
 
-        keep = (keep4Class if hasattr(keep4Class, '__iter__') else []) + self.pos2keep + self.entities2keep
+        keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep
 
 
         # modify those to remove with those to keep
@@ -60,18 +60,25 @@ class TextCleaner:
             try:
                 self.symbols.remove(sym)
             except ValueError:
-                try:
-                    self.stop_words.remove(sym)
-                except ValueError:
-                    pass
+                pass
+        for sym in keep:
+            try:
+                self.stop_words.remove(sym)
+            except ValueError:
+                pass
 
 
         # idee self.currentDoc = spacy.Doc für jeden String aber nicht füpr jede methode
+    def loadString(self,string):
+        self.currentDoc = self.parser(string)
 
+    """   
     def removeWhitespace(self, string):
         string = self.whitespaceFinder.sub(" ", string)
         return string
-
+    """
+    def removeWhitespace(self, string):
+        return string
 
     def removePunctuation(self, string, custom_symbols=None, keep=None):
 
@@ -225,11 +232,7 @@ class TextCleaner:
 
 
 
-cleaner = TextCleaner(parser=spacy.load('de'))
-
-string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n           Dieses Ticket wird geschlossen \n            \n test"
-
-
+"""
 #################################################################################################################
 
 #todo funzt irgendwie nich wie's soll: https://mathieularose.com/function-composition-in-python/
@@ -239,7 +242,27 @@ def compose(self,*functions):
 pipeline = compose(functools.partial(cleaner.keepPOSandENT,lemmatize=True))#, cleaner.normalizeSynonyms)
 
 #################################################################################################################
-print(cleaner.removePunctuation(string))
-print(cleaner.keepPOSandENT(string))
+"""
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+