From 05b4f514d5323426e28bcdbbbef75556abbc2676 Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Tue, 5 Sep 2017 11:52:39 +0200 Subject: [PATCH] spacy-pipeline / python funciton-composing versucht --- preprocessing.py | 4 +- test.py | 124 +++++++++++++++++++++++++++++++++++++++++++++++ textCleaning.py | 57 +++++++++++++++------- 3 files changed, 166 insertions(+), 19 deletions(-) create mode 100644 test.py diff --git a/preprocessing.py b/preprocessing.py index f33836a..9fb59fd 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -406,6 +406,7 @@ def printRandomDoc(textacyCorpus): print() ####################'####################'####################'####################'####################'############## +# todo config-file import de_core_news_md DATAPATH = "ticketSamples.xml" @@ -421,8 +422,7 @@ custom_words = ["grüßen", "fragen"] ####################'####################'####################'####################'####################'############## -#todo joar diese pipe halt und vllt ne config-file - +#todo https://spacy.io/docs/usage/customizing-pipeline ## files to textacy-corpus textacyCorpus = textacy.Corpus(PARSER) diff --git a/test.py b/test.py new file mode 100644 index 0000000..08db3a2 --- /dev/null +++ b/test.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +import spacy +import textacy +from spacy.tokens import Doc + +# -*- coding: utf-8 -*- +import re +import spacy +import functools + +import textacy + + +class TextCleaner: + + def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None): + """ + :param parser: spacy-parser + :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...] + :param customClass_symbols:[str] + :param customClass_words:[str] + :param customClassPOS:[str] + :param keep4All: [str] + """ + if thesaurus is None: + DATAPATH_thesaurus = "openthesaurus.csv" + + ## !!!!!! list wichtig, da sonst nicht die gleichen Synonyme zurückgegeben werden, weil ein generator während der laufzeit pickt + self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")) + else: + self.thesaurus = thesaurus + + self.parser = parser + + + + self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) + self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) + self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) + self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + + + + # to remove + self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||", + ";", ":", + "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else []) + self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else []) + + + + # to keep + self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"] + self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + + self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else []) + self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else []) + + + keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep + + + # modify those to remove with those to keep + for sym in keep: + try: + self.symbols.remove(sym) + except ValueError: + pass + for sym in keep: + try: + self.stop_words.remove(sym) + except ValueError: + pass + + + def loadString(self,string): + self.currentDoc = self.parser(string) + + + def removeWhitespace(self, string): + return " ".join([tok.text for tok in self.currentDoc if not tok.is_space]) + + + def removePunctuation(self, string, custom_symbols=None, keep=None): + symbols = self.symbols + (custom_symbols if custom_symbols is not None else []) + if hasattr(keep, '__iter__'): + for k in keep: + try: + symbols.remove(k) + except ValueError: + pass + + return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols]) + + +#todo funzt irgendwie nich wie's soll: https://mathieularose.com/function-composition-in-python/ +parser = spacy.load('de') +cleaner = TextCleaner(parser) +corpus = textacy.Corpus(parser) + + +def compose(self,*functions): + return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x) + +def composeo(*functions): + return functools.reduce(lambda f, g: lambda x: f(g(x)), functions) + +#pipeline = compose(functools.partial(removeWhitespace,lemmatize=True))#, cleaner.normalizeSynonyms) + +pipeline = composeo(cleaner.removePunctuation, cleaner.removeWhitespace, cleaner.loadString) + +def pipe1(string): + cleaner.loadString(string) + string = cleaner.removeWhitespace(string) + string = cleaner.removePunctuation(string) + return string + + +string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" +print(pipe1(string)) +corpus.add_text(pipeline(string)) + +print(corpus[0].text) + diff --git a/textCleaning.py b/textCleaning.py index a014728..ef6a819 100644 --- a/textCleaning.py +++ b/textCleaning.py @@ -8,14 +8,14 @@ import textacy class TextCleaner: - def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4Class=None): + def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None): """ :param parser: spacy-parser :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...] :param customClass_symbols:[str] :param customClass_words:[str] :param customClassPOS:[str] - :param keep4Class: [str] + :param keep4All: [str] """ if thesaurus is None: DATAPATH_thesaurus = "openthesaurus.csv" @@ -48,11 +48,11 @@ class TextCleaner: self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"] self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 - self.entities2keep = self.entities2keep + (keep4Class if keep4Class is not None else []) - self.pos2keep = self.pos2keep + (keep4Class if keep4Class is not None else []) + self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else []) + self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else []) - keep = (keep4Class if hasattr(keep4Class, '__iter__') else []) + self.pos2keep + self.entities2keep + keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep # modify those to remove with those to keep @@ -60,18 +60,25 @@ class TextCleaner: try: self.symbols.remove(sym) except ValueError: - try: - self.stop_words.remove(sym) - except ValueError: - pass + pass + for sym in keep: + try: + self.stop_words.remove(sym) + except ValueError: + pass # idee self.currentDoc = spacy.Doc für jeden String aber nicht füpr jede methode + def loadString(self,string): + self.currentDoc = self.parser(string) + """ def removeWhitespace(self, string): string = self.whitespaceFinder.sub(" ", string) return string - + """ + def removeWhitespace(self, string): + return string def removePunctuation(self, string, custom_symbols=None, keep=None): @@ -225,11 +232,7 @@ class TextCleaner: -cleaner = TextCleaner(parser=spacy.load('de')) - -string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" - - +""" ################################################################################################################# #todo funzt irgendwie nich wie's soll: https://mathieularose.com/function-composition-in-python/ @@ -239,7 +242,27 @@ def compose(self,*functions): pipeline = compose(functools.partial(cleaner.keepPOSandENT,lemmatize=True))#, cleaner.normalizeSynonyms) ################################################################################################################# -print(cleaner.removePunctuation(string)) -print(cleaner.keepPOSandENT(string)) +""" + + + + + + + + + + + + + + + + + + + + +