diff --git a/preprocessing.py b/preprocessing.py
index 9fb59fd..89b6317 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -422,7 +422,6 @@
 custom_words = ["grüßen", "fragen"]

 ####################'####################'####################'####################'####################'##############
-#todo https://spacy.io/docs/usage/customizing-pipeline

 ## files to textacy-corpus
 textacyCorpus = textacy.Corpus(PARSER)
diff --git a/test.py b/test.py
index 08db3a2..9560698 100644
--- a/test.py
+++ b/test.py
@@ -32,15 +32,17 @@ class TextCleaner:
         self.parser = parser
-
-
-        self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
+        #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
         self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
         self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
         self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+
+        # to keep
+        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"
+        self.pos2keep = ["NOUN"]  # ,"NUM","VERB","ADJ"  # for topic modeling keep only nouns: http://aclweb.org/anthology/U15-1013
-
+        """
+        # to remove
         self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||",
                         ";", ":",
@@ -48,11 +50,6 @@ class TextCleaner:
         self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS) + (customClass_words if customClass_words is not None else [])
-
-        # to keep
-        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"
-        self.pos2keep = ["NOUN"]  # ,"NUM","VERB","ADJ"  # for topic modeling keep only nouns: http://aclweb.org/anthology/U15-1013
-
         self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
         self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])
@@ -71,7 +68,7 @@ class TextCleaner:
                 self.stop_words.remove(sym)
             except ValueError:
                 pass
-
+        """

     def loadString(self,string):
         self.currentDoc = self.parser(string)
@@ -93,32 +90,124 @@ class TextCleaner:
         return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols])

-#todo somehow doesn't work the way it should: https://mathieularose.com/function-composition-in-python/
+def cleanDoc(doc, toDelete=None, toKeep=None):
+    """
+    :param doc: spacy Doc
+    :param toDelete: [str] pos_ , ent_type_ or tag_ values
+    :return: str joined token list
+    """
+    #keep
+    tokenlist = []
+    for tok in doc:
+        if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep:
+            tokenlist.append(tok.text)
+
+    #delete (note: this rebuilds from doc and discards the keep result above)
+    tokenlist = [tok.text for tok in doc if tok.pos_ not in toDelete and tok.ent_type_ not in toDelete and tok.tag_ not in toDelete]
+
+    result = " ".join(tokenlist)
+    return result  #problem: returns a str, not a Doc, and is therefore not composable
+
+
+def keepinDoc(doc, toKeep=None):
+    """
+    :param doc: spacy Doc
+    :param toKeep: [str]
+    :return: str joined token list
+    """
+    return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep])
+
+
+#todo https://mathieularose.com/function-composition-in-python/

 parser = spacy.load('de')
 cleaner = TextCleaner(parser)
-corpus = textacy.Corpus(parser)
+corpus_raw = textacy.Corpus(parser)
+corpus_clean = textacy.Corpus(parser)
+
+def foo(doc, toKeep=None):
+    # keep only tokens whose POS, entity type or fine-grained tag is whitelisted
+    words = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
+    spaces = [True] * len(words)
+
+    return Doc(doc.vocab, words=words, spaces=spaces)  # assumes: from spacy.tokens import Doc
+
+def foo2(doc, toDelete=None):  #, toKeep=None):
+    """
+    :param doc: spacy Doc
+    :param toDelete: [str] pos_ , ent_type_ or tag_ values
+    :return: Doc without the matching tokens
+    """
+    #keep
+    #tokenlist = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
+
+    #delete: a token survives only if none of its labels are blacklisted
+    words = [tok.text for tok in doc if tok.pos_ not in toDelete and tok.ent_type_ not in toDelete and tok.tag_ not in toDelete]
+    spaces = [True] * len(words)
+
+    return Doc(doc.vocab, words=words, spaces=spaces)
+"""
 def compose(self,*functions):
     return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)

 def composeo(*functions):
     return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
+"""

-#pipeline = compose(functools.partial(removeWhitespace,lemmatize=True))#, cleaner.normalizeSynonyms)
+def double(a):
+    return a*2

-pipeline = composeo(cleaner.removePunctuation, cleaner.removeWhitespace, cleaner.loadString)
+def add(a, b):
+    return a+b

+def compose(*functions):
+    def compose2(f, g):
+        return lambda x: f(g(x))
+    return functools.reduce(compose2, functions, lambda x: x)
+
+
+#pipeline = compose(removeFromDoc, cleaner.removeWhitespace, cleaner.loadString)
+"""
 def pipe1(string):
     cleaner.loadString(string)
     string = cleaner.removeWhitespace(string)
     string = cleaner.removePunctuation(string)
     return string
+"""
+
+def cleaningPipe(spacy_pipe, composition):
+    # lazily stream docs from parser.pipe() through the composed cleaners
+    for doc in spacy_pipe:
+        yield composition(doc)
+
+
+pipeline = compose(
+    functools.partial(foo2, toDelete=["PUNCT", "SPACE"]),
+    functools.partial(foo, toKeep=["NOUN"]))

 string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"

-print(pipe1(string))
-corpus.add_text(pipeline(string))
-print(corpus[0].text)
+doc = parser(string)
+
+#print(removeFromDoc(doc,toDelete=["PUNCT"]))
+
+print(pipeline(doc).text)  # pass the Doc itself, not doc.text
+
+
+for txt in cleaningPipe(parser.pipe([string]),pipeline):
+    print(txt)
+"""
+corpus_raw.add_text(string)
+for doc in parser.pipe([string]):
+    doc.text = removeFromDoc(doc, toDelete=["PUNCT"])
+"""
+
+#corpus_clean.add_texts(cleaningPipe(parser.pipe([string]),pipeline))
+#print(corpus_raw[0].text)
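
Note (not part of the diff): the snippet below restates the Doc-to-Doc composition idea that test.py is converging on, as one self-contained sketch. It is illustrative only; the helper names filter_doc, keep_only, drop_all and cleaning_pipe are invented here, and it assumes a German spaCy model is available for spacy.load('de'). One caveat the diff hits implicitly: Doc(vocab, words=...) builds an unannotated Doc, so any filter that relies on pos_/ent_type_/tag_ must sit rightmost in compose(), where it runs first on the fully annotated document.

import functools

import spacy
from spacy.tokens import Doc


def filter_doc(doc, predicate):
    # Rebuild a Doc from the surviving tokens so the result can be fed
    # into the next Doc -> Doc stage (a plain string would end the chain).
    words = [tok.text for tok in doc if predicate(tok)]
    return Doc(doc.vocab, words=words, spaces=[True] * len(words))


def keep_only(labels):
    # Whitelist filter over POS, entity type and fine-grained tag.
    return lambda doc: filter_doc(
        doc, lambda t: t.pos_ in labels or t.ent_type_ in labels or t.tag_ in labels)


def drop_all(labels):
    # Blacklist filter: a token survives only if none of its labels match.
    return lambda doc: filter_doc(
        doc, lambda t: t.pos_ not in labels
        and t.ent_type_ not in labels
        and t.tag_ not in labels)


def compose(*functions):
    # Right-to-left composition, as in the diff: compose(f, g)(x) == f(g(x)).
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)


def cleaning_pipe(spacy_pipe, composition):
    # Lazily stream Docs through the composed cleaners.
    for doc in spacy_pipe:
        yield composition(doc)


if __name__ == "__main__":
    parser = spacy.load('de')
    # keep_only runs first (rightmost), while annotations still exist;
    # drop_all then sees the rebuilt, unannotated Doc and is a no-op here.
    pipeline = compose(drop_all(["PUNCT", "SPACE"]), keep_only(["NOUN"]))
    for doc in cleaning_pipe(parser.pipe(["Frau Hinrichs überdenkt die Situation."]), pipeline):
        print(doc.text)

Returning a Doc from every stage is what makes the stages composable and lets the same composition be reused with parser.pipe() over a whole corpus, which is exactly the problem flagged by the "not composable" comment on cleanDoc.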