# -*- coding: utf-8 -*-
import re

import spacy
import textacy
import xml.etree.ElementTree as ET

DATAPATH_thesaurus = "openthesaurus.csv"

PARSER = spacy.load('de')


def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):
    """
    https://spacy.io/docs/usage/pos-tagging

    Cleans the text of PUNCT, NUM and SPACE tokens (whitespace, newlines) and of the
    following list of symbols:

        ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't",
         "|","||",";",":","…","’s","'s",".","(",")","[","]","#"]

    POS tags listed in `keep` (e.g. ["NUM"]) are not removed.
    """
    if custom_symbols is None:
        custom_symbols = []
    if keep is None:
        keep = []

    # list of symbols we don't care about
    symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..",
               "n’t", "n't", "|", "||", ";", ":", "…", "’s", "'s", ".", "(", ")",
               "[", "]", "#"] + custom_symbols

    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    # POS tags to drop, minus those the caller wants to keep
    pos = [p for p in ["NUM", "SPACE", "PUNCT"] if p not in keep]

    # collect the remaining tokens
    for tok in spacy_doc:
        if tok.pos_ not in pos:
            tokens.append(tok.text.lower().strip())

    # remove symbols
    tokens = [tok for tok in tokens if tok not in symbols]

    # collapse remaining whitespace and return
    return remove_whitespace(" ".join(tokens))


def cleanText_words(string, parser=PARSER, custom_stopwords=None, custom_words=None,
                    customPreprocessing=cleanText_symbols, lemmatize=False,
                    normalize_synonyms=False):

    # apply preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    if custom_stopwords is None:
        custom_stopwords = []
    if custom_words is None:
        custom_words = []

    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS
    stoplist = list(stop_words) + custom_stopwords

    # replace Twitter mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace e-mail addresses
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace URLs (pattern is anchored, so it only fires if the whole string is a URL)
    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    string = urlFinder.sub("URL", string)

    # replace HTML entities
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    added_entities = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
    added_POS = ["NOUN"]  # , "NUM", "VERB", "ADJ"]
    # for topic modeling keep nouns only, see http://aclweb.org/anthology/U15-1013

    # collect nouns (lemmatized if requested) ...
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            if lemmatize:
                tokens.append(tok.lemma_.lower().strip())
            else:
                tokens.append(tok.text.lower().strip())

        # ... and named entities of the selected types
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())

    # remove stopwords
    tokens = [tok for tok in tokens if tok not in stoplist]

    # remove custom_words
    tokens = [tok for tok in tokens if tok not in custom_words]

    # remove single characters
    tokens = [tok for tok in tokens if len(tok) > 1]

    # remove large strings of whitespace
    # remove_whitespace(" ".join(tokens))
    # idea: expand abbreviations (esp. TU -> Technische Universität)

    # if normalize_synonyms:
    #     tokens = [str(getFirstSynonym(tok, THESAURUS_list)) for tok in tokens]

    return " ".join(tokens)


def remove_whitespace(sentence):
    # collapse newlines and runs of whitespace into single spaces
    whitespaceFinder = re.compile(r'(\r\n|\r|\n|\s)', re.IGNORECASE)
    sentence = whitespaceFinder.sub(" ", sentence)
    return sentence


def cleanText_normalize(string, parser=PARSER, customPreprocessing=cleanText_words, lemmatize=True):
    # apply preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    # the normalization step itself is still missing; for now return the preprocessed string
    return string


string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
print(cleanText_symbols(string=string, parser=PARSER, keep=["NUM"]))

string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
print(cleanText_symbols(string=string, parser=PARSER, keep=None))
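

# ---------------------------------------------------------------------------
# Hedged sketch (not the original author's code): the commented-out
# normalize_synonyms branch in cleanText_words calls getFirstSynonym(tok,
# THESAURUS_list), but neither that helper nor THESAURUS_list is defined in
# this file. One possible implementation is sketched below, assuming
# DATAPATH_thesaurus points to the plain-text OpenThesaurus export, i.e. one
# ';'-separated synonym group per line with '#' comment lines. The name
# load_thesaurus and the file-format assumption are mine; adjust the loader
# if the actual layout differs.

def load_thesaurus(path=DATAPATH_thesaurus):
    """Return a list of synonym groups, each a list of lower-cased words."""
    groups = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # skip empty lines and comment lines
            if not line or line.startswith("#"):
                continue
            groups.append([w.strip().lower() for w in line.split(";")])
    return groups


def getFirstSynonym(word, thesaurus):
    """Return the first entry of the first synonym group containing `word`,
    or `word` itself if no group matches."""
    word = word.lower()
    for group in thesaurus:
        if word in group:
            return group[0]
    return word

# Example usage (assumption): build the group list once, then enable the
# commented-out branch in cleanText_words.
# THESAURUS_list = load_thesaurus()
# tokens = [str(getFirstSynonym(tok, THESAURUS_list)) for tok in tokens]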