# -*- coding: utf-8 -*-
import functools
import importlib
import re

import spacy
import textacy
from spacy.tokens import Doc


class TextCleaner:
    """Strip whitespace tokens, punctuation and custom symbols from text.

    The cleaner keeps the most recently parsed document in ``currentDoc`` so
    that consecutive cleaning steps on the same text do not re-run the parser.
    """

    def __init__(self, parser, thesaurus=None, customClass_symbols=None,
                 customClass_words=None, keep4All=None):
        """
        :param parser: spaCy parser; called on strings, and its ``lang``
            attribute selects the stop-word module
        :param thesaurus: [[syn1, syn2, ...], [syn1, syn2, ...], ...];
            read from "openthesaurus.csv" when omitted
        :param customClass_symbols: [str] additional symbols to remove
        :param customClass_words: [str] additional stop words
        :param keep4All: [str] entries that are added to the kept POS tags and
            entity types and taken out of the symbol and stop-word lists
        """
        if thesaurus is None:
            DATAPATH_thesaurus = "openthesaurus.csv"
            # list() is important: a generator would be consumed while the
            # program runs, so later lookups would not see the same synonyms.
            self.thesaurus = list(
                textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
        else:
            self.thesaurus = thesaurus

        self.parser = parser

        # Most recently parsed document and the text it was built from.
        self.currentDoc = None
        self._currentText = None

        self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
        self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
        self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
        self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)

        keep_all = self._asList(keep4All)

        # to remove
        self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|",
                        "||", ";", ":", "…", "’s", "'s", ".", "(", ")", "[", "]",
                        "#"] + self._asList(customClass_symbols)
        stop_word_module = importlib.import_module("spacy." + self.parser.lang)
        self.stop_words = list(stop_word_module.STOP_WORDS) + self._asList(customClass_words)

        # to keep ("PERSON" is currently not part of the kept entity types)
        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] + keep_all
        # For topic modelling only nouns are kept, see
        # http://aclweb.org/anthology/U15-1013
        self.pos2keep = ["NOUN"] + keep_all

        # Everything that must be kept is taken out of the removal lists.
        # Filtering drops every occurrence; list.remove() would only drop the
        # first one and the symbol list contains "." twice.
        keep = set(keep_all + self.pos2keep + self.entities2keep)
        self.symbols = [sym for sym in self.symbols if sym not in keep]
        self.stop_words = [word for word in self.stop_words if word not in keep]

    @staticmethod
    def _asList(values):
        """Return *values* as a new list; None -> [], a bare str -> [str]."""
        if values is None:
            return []
        if isinstance(values, str):
            return [values]
        return list(values)

    def _docFor(self, string):
        """Return the parsed document for *string*.

        The cached document is reused when *string* is None or equal to the
        text that was parsed last; any other text is parsed and cached.

        :raises ValueError: if *string* is None and nothing has been loaded
        """
        # getattr keeps this working on instances whose __init__ was bypassed.
        doc = getattr(self, "currentDoc", None)
        if string is None:
            if doc is None:
                raise ValueError(
                    "no text loaded: call loadString() or pass a string")
            return doc
        if doc is None or getattr(self, "_currentText", None) != string:
            self.loadString(string)
        return self.currentDoc

    def loadString(self, string):
        """Parse *string*, cache the result in ``currentDoc`` and return *string*.

        Returning the text lets this method act as the first stage of a
        composed pipeline.
        """
        self.currentDoc = self.parser(string)
        self._currentText = string
        return string

    def removeWhitespace(self, string):
        """Return the tokens of *string* joined by single spaces, without
        whitespace tokens.

        :param string: text to clean; None means "the text loaded last"
        """
        doc = self._docFor(string)
        return " ".join(tok.text for tok in doc if not tok.is_space)

    def removePunctuation(self, string, custom_symbols=None, keep=None):
        """Return the tokens of *string* joined by single spaces, without
        punctuation tokens and without tokens listed as symbols.

        :param string: text to clean; None means "the text loaded last"
        :param custom_symbols: [str] symbols to remove in addition to
            ``self.symbols``
        :param keep: iterable of symbols that must not be removed
        """
        kept = list(keep) if hasattr(keep, '__iter__') else []
        candidates = self.symbols + (list(custom_symbols) if custom_symbols is not None else [])
        symbols = {sym for sym in candidates if sym not in kept}

        doc = self._docFor(string)
        return " ".join(tok.text for tok in doc
                        if not tok.is_punct and tok.text not in symbols)


# Function composition as described in
# https://mathieularose.com/function-composition-in-python/
parser = spacy.load('de')
cleaner = TextCleaner(parser)
corpus = textacy.Corpus(parser)


def compose(*functions):
    """Compose *functions* right to left: compose(f, g)(x) == f(g(x)).

    Without arguments the identity function is returned.
    """
    # NOTE: this used to take a stray leading ``self`` parameter, which
    # silently swallowed the first function.
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)


def composeo(*functions):
    """Compose *functions* right to left; same result as compose()."""
    return compose(*functions)


# Runs right to left: load the text, drop whitespace tokens, drop punctuation.
pipeline = composeo(cleaner.removePunctuation, cleaner.removeWhitespace, cleaner.loadString)


def pipe1(string):
    """Apply the same three cleaning steps as ``pipeline`` one after another."""
    cleaner.loadString(string)
    string = cleaner.removeWhitespace(string)
    string = cleaner.removePunctuation(string)
    return string


string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
print(pipe1(string))
corpus.add_text(pipeline(string))
print(corpus[0].text)