# -*- coding: utf-8 -*-

import re
import functools

import spacy
import textacy
from spacy.tokens import Doc


class TextCleaner:

    def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
        """
        :param parser: spacy parser
        :param thesaurus: [[syn1, syn2, ...], [syn1, syn2, ...], ...]
        :param customClass_symbols: [str]
        :param customClass_words: [str]
        :param keep4All: [str]
        """
        if thesaurus is None:
            DATAPATH_thesaurus = "openthesaurus.csv"

            # list() is important here: a generator would be consumed during runtime,
            # so repeated lookups would not return the same synonyms
            self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
        else:
            self.thesaurus = thesaurus

        self.parser = parser

        #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
        self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
        self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
        self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)

        # to keep
        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
        self.pos2keep = ["NOUN"]  # , "NUM"]  # ,"VERB","ADJ"]  # for topic modeling keep only nouns http://aclweb.org/anthology/U15-1013

        # NOTE: the block below is currently disabled; removePunctuation() relies on
        # self.symbols being defined here
        """
        # to remove
        self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||",
                        ";", ":", "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
        self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS) + (customClass_words if customClass_words is not None else [])

        self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
        self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])

        keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep

        # modify those to remove with those to keep
        for sym in keep:
            try:
                self.symbols.remove(sym)
            except ValueError:
                pass
        for sym in keep:
            try:
                self.stop_words.remove(sym)
            except ValueError:
                pass
        """

    def loadString(self, string):
        # parse the string once; the remove* methods below operate on self.currentDoc
        self.currentDoc = self.parser(string)

    def removeWhitespace(self, string):
        return " ".join([tok.text for tok in self.currentDoc if not tok.is_space])

    def removePunctuation(self, string, custom_symbols=None, keep=None):
        symbols = self.symbols + (custom_symbols if custom_symbols is not None else [])
        if hasattr(keep, '__iter__'):
            for k in keep:
                try:
                    symbols.remove(k)
                except ValueError:
                    pass

        return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols])


def cleanDoc(doc, toDelete=None, toKeep=None):
    """
    :param doc: spacy Doc
    :param toDelete: [str] pos_, ent_type_ or tag_ labels to remove
    :param toKeep: [str] pos_, ent_type_ or tag_ labels to keep
    :return: str joined tokens
    """
    toDelete = toDelete if toDelete is not None else []
    toKeep = toKeep if toKeep is not None else []

    # keep only tokens whose pos_, ent_type_ or tag_ is listed in toKeep (if given)
    if toKeep:
        tokens = [tok for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
    else:
        tokens = list(doc)

    # delete tokens whose pos_, ent_type_ or tag_ is listed in toDelete
    tokenlist = [tok.text for tok in tokens
                 if tok.pos_ not in toDelete and tok.ent_type_ not in toDelete and tok.tag_ not in toDelete]

    result = " ".join(tokenlist)
    return result

# problem: returns a plain string, not a Doc, and is therefore not composable


def keepinDoc(doc, toKeep=None):
    """
    :param doc: spacy Doc
    :param toKeep: [str]
    :return: str joined tokens
    """
    return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep])


# todo https://mathieularose.com/function-composition-in-python/
parser = spacy.load('de')

cleaner = TextCleaner(parser)

corpus_raw = textacy.Corpus(parser)
corpus_clean = textacy.Corpus(parser)


def foo(doc, toKeep=None):
    # keep only tokens whose pos_, ent_type_ or tag_ is listed in toKeep and
    # return them as a new (unparsed) Doc, so the step stays composable
    words = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
    spaces = [True] * len(words)

    return Doc(doc.vocab, words=words, spaces=spaces)


def foo2(doc, toDelete=None):  # , toKeep=None):
    """
    :param doc: spacy Doc
    :param toDelete: [str] pos_, ent_type_ or tag_ labels to remove
    :return: spacy Doc
    """
    # keep
    #words = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]

    # delete: drop tokens whose pos_, ent_type_ or tag_ is listed in toDelete
    words = [tok.text for tok in doc
             if tok.pos_ not in toDelete and tok.ent_type_ not in toDelete and tok.tag_ not in toDelete]
    spaces = [True] * len(words)

    return Doc(doc.vocab, words=words, spaces=spaces)


"""
def compose(self, *functions):
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)

def composeo(*functions):
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
"""


def double(a):
    return a * 2


def add(a, b):
    return a + b


def compose(*functions):
    # right-to-left function composition: compose(f, g)(x) == f(g(x))
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)


#pipeline = compose(removeFromDoc, cleaner.removeWhitespace, cleaner.loadString)

"""
def pipe1(string):
    cleaner.loadString(string)
    string = cleaner.removeWhitespace(string)
    string = cleaner.removePunctuation(string)
    return string
"""


def cleaningPipe(spacy_pipe, composition):
    for doc in spacy_pipe:
        yield composition(doc)


pipeline = compose(
    functools.partial(foo2, toDelete=["PUNCT", "SPACE"]),
    functools.partial(foo, toKeep=["NOUN"]))

string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"

doc = parser(string)

#print(removeFromDoc(doc, toDelete=["PUNCT"]))
print(pipeline(doc))

for txt in cleaningPipe(parser.pipe([string]), pipeline):
    print(txt)

"""
corpus_raw.add_text(string)
for doc in parser.pipe([string]):
    doc.text = removeFromDoc(doc, toDelete=["PUNCT"])
"""

#corpus_clean.add_texts(cleaningPipe(parser.pipe([string]), pipeline))
#print(corpus_raw[0].text)
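
# Quick sanity check for compose(): arguments are applied right-to-left, so the
# partially applied add(3, .) runs before double(): double(add(3, 2)) == 10.
assert compose(double, functools.partial(add, 3))(2) == 10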