From 11e77fad06427eec1d19d9d98fe264d77ecac5c8 Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Fri, 1 Sep 2017 14:27:03 +0200 Subject: [PATCH] textcleaning refactored --- preprocessing.py | 61 +++++------- test.py | 165 ------------------------------- textCleaning.py | 245 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 267 insertions(+), 204 deletions(-) delete mode 100644 test.py create mode 100644 textCleaning.py diff --git a/preprocessing.py b/preprocessing.py index 536f426..f33836a 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -190,73 +190,57 @@ def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_ LANGUAGE = 'de' +#PARSER = de_core_news_md.load() PARSER = spacy.load(LANGUAGE) +from textCleaning import TextCleaner + +cleaner = TextCleaner(parser=PARSER) -def generateTextfromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False): +def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False): import xml.etree.ElementTree as ET tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) - root = tree.getroot() - """ - for subject in root.iter(textfield): - if clean: - yield cleanText(subject.text) - else: - yield subject.text - """ + + for ticket in root: text = "ERROR" for field in ticket: if field.tag == textfield: if clean: - text = cleanText_words(field.text,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) + text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text))) #,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) else: text = field.text yield text -def generateMetadatafromXML(path2xml, textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]): +def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]): import xml.etree.ElementTree as ET tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) root = tree.getroot() - """ - metadata = dict.fromkeys(keys) - - for ticket in root.findall('ticket'): - for key in metadata: - metadata[key] = ticket.find(key).text - - yield metadata - """ for ticket in root: metadata = {} for field in ticket: if field.tag != textfield: if field.tag == "Zusammenfassung": - # idee lösung nur whitespace entfernen, zusammenfassung auch von symbolen befreien - metadata[field.tag] = cleanText_symbols(field.text) + metadata[field.tag] = cleaner.removePunctuation(field.text) elif field.tag == "Loesung": - metadata[field.tag] = remove_whitespace(field.text) + metadata[field.tag] = cleaner.removeWhitespace(field.text) else: metadata[field.tag] = field.text yield metadata + + +""" def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): - """ - https://spacy.io/docs/usage/pos-tagging - cleans text from PUNCT, NUM, whitespaces, newlines, and the following list of symbols: - - ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] - - """ if custom_symbols is not None: custom_symbols = custom_symbols else: @@ -360,18 +344,21 @@ def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=No #remove_whitespace(" ".join(tokens)) - #idee abkürzungen auflösen (v.a. TU -> Technische Universität) + #idee abkürzungen auflösen (v.a. 
TU -> Technische Universität): abkürzungsverezeichnis if normalize_synonyms: tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] return " ".join(set(tokens)) -def remove_whitespace(sentence): +def cleanText_removeWhitespace(sentence): whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) sentence = whitespaceFinder.sub(" ", sentence) return sentence +#todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms + + def getFirstSynonym(word, thesaurus_gen): word = word.lower() @@ -407,7 +394,7 @@ def getHauptform(syn_block, word, default_return_first_Syn=False): if not re.match(r'\([^)]+\)', w): return w return word # zur Not, das ursrpüngliche Wort zurückgeben - +""" def printRandomDoc(textacyCorpus): print() @@ -434,18 +421,14 @@ custom_words = ["grüßen", "fragen"] ####################'####################'####################'####################'####################'############## -#PARSER = de_core_news_md.load() - -THESAURUS_list=list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")) ## !!!!!! list wichtig, da sonst nicht die gleichen Synonyme zurückgegeben werden, weil der generator während der laufzeit pickt - - +#todo joar diese pipe halt und vllt ne config-file ## files to textacy-corpus textacyCorpus = textacy.Corpus(PARSER) print("add texts to textacy-corpus...") -textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH,normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromXML(DATAPATH)) +textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH)) #for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize): diff --git a/test.py b/test.py deleted file mode 100644 index 201e1c9..0000000 --- a/test.py +++ /dev/null @@ -1,165 +0,0 @@ -# -*- coding: utf-8 -*- -import re - -import spacy -import textacy -import xml.etree.ElementTree as ET - - -DATAPATH_thesaurus = "openthesaurus.csv" - - -PARSER = spacy.load('de') - - - -def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): - """ - https://spacy.io/docs/usage/pos-tagging - - cleans text from PUNCT, NUM, whitespaces, newlines, and the following list of symbols: - - ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] - - """ - if custom_symbols is not None: - custom_symbols = custom_symbols - else: - custom_symbols = [] - - if keep is not None: - keep = keep - else: - keep = [] - - # List of symbols we don't care about - symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols - - # parse with spaCy - spacy_doc = parser(string) - tokens = [] - - pos = ["NUM", "SPACE", "PUNCT"] - for p in keep: - pos.remove(p) - - - # append Tokens to a list - for tok in spacy_doc: - if tok.pos_ not in pos: - tokens.append(tok.text.lower().strip()) - - - # remove symbols - tokens = [tok for tok in tokens if tok not in symbols] - - # remove whitespace - remove_whitespace(" ".join(tokens)) - - return " ".join(tokens) - -def cleanText_words(string, parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False): - - # use preprocessing - if 
customPreprocessing is not None: - string = customPreprocessing(string) - - if custom_stopwords is not None: - custom_stopwords = custom_stopwords - else: - custom_stopwords = [] - - if custom_words is not None: - custom_words = custom_words - else: - custom_words = [] - - - # custom stoplist - # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import - stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS - - stoplist =list(stop_words) + custom_stopwords - - # replace twitter - mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) - string = mentionFinder.sub("MENTION", string) - - # replace emails - emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) - string = emailFinder.sub("EMAIL", string) - - # replace urls - urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) - string = urlFinder.sub("URL", string) - - # replace HTML symbols - string = string.replace("&", "and").replace(">", ">").replace("<", "<") - - - - # parse with spaCy - spacy_doc = parser(string) - tokens = [] - - added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] - added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 - - # append Tokens to a list - for tok in spacy_doc: - if tok.pos_ in added_POS: - if lemmatize: - tokens.append(tok.lemma_.lower().strip()) - else: - tokens.append(tok.text.lower().strip()) - - # add entities - if tok.ent_type_ in added_entities: - tokens.append(tok.text.lower()) - - - - # remove stopwords - tokens = [tok for tok in tokens if tok not in stoplist] - - # remove custom_words - tokens = [tok for tok in tokens if tok not in custom_words] - - # remove single characters - tokens = [tok for tok in tokens if len(tok)>1] - - # remove large strings of whitespace - #remove_whitespace(" ".join(tokens)) - - - #idee abkürzungen auflösen (v.a. TU -> Technische Universität) - - #if normalize_synonyms: - # tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] - - return " ".join(tokens) - -def remove_whitespace(sentence): - whitespaceFinder = re.compile(r'(\r\n|\r|\n|\s)', re.IGNORECASE) - sentence = whitespaceFinder.sub(" ", sentence) - return sentence - -def cleanText_normalize(string, parser=PARSER, customPreprocessing=cleanText_words, lemmatize=True): - # use preprocessing - if customPreprocessing is not None: - string = customPreprocessing(string) - - - - - - - - - -string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" -print(cleanText_symbols(string=string, parser=PARSER, keep=["NUM"])) - - -string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" -print(cleanText_symbols(string=string, parser=PARSER, keep=None)) diff --git a/textCleaning.py b/textCleaning.py new file mode 100644 index 0000000..a014728 --- /dev/null +++ b/textCleaning.py @@ -0,0 +1,245 @@ +# -*- coding: utf-8 -*- +import re +import spacy +import functools + +import textacy + + +class TextCleaner: + + def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4Class=None): + """ + :param parser: spacy-parser + :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...] 
+ :param customClass_symbols:[str] + :param customClass_words:[str] + :param customClassPOS:[str] + :param keep4Class: [str] + """ + if thesaurus is None: + DATAPATH_thesaurus = "openthesaurus.csv" + + ## !!!!!! list wichtig, da sonst nicht die gleichen Synonyme zurückgegeben werden, weil ein generator während der laufzeit pickt + self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")) + else: + self.thesaurus = thesaurus + + self.parser = parser + + + + self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) + self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) + self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) + self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + + + + # to remove + self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||", + ";", ":", + "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else []) + self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else []) + + + + # to keep + self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"] + self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + + self.entities2keep = self.entities2keep + (keep4Class if keep4Class is not None else []) + self.pos2keep = self.pos2keep + (keep4Class if keep4Class is not None else []) + + + keep = (keep4Class if hasattr(keep4Class, '__iter__') else []) + self.pos2keep + self.entities2keep + + + # modify those to remove with those to keep + for sym in keep: + try: + self.symbols.remove(sym) + except ValueError: + try: + self.stop_words.remove(sym) + except ValueError: + pass + + + # idee self.currentDoc = spacy.Doc für jeden String aber nicht füpr jede methode + + def removeWhitespace(self, string): + string = self.whitespaceFinder.sub(" ", string) + return string + + + def removePunctuation(self, string, custom_symbols=None, keep=None): + + + symbols = self.symbols + (custom_symbols if custom_symbols is not None else []) + + if hasattr(keep, '__iter__'): + for k in keep: + try: + symbols.remove(k) + except ValueError: + pass + + + # parse with spaCy + doc = self.parser(string) + tokens = [] + + # append Tokens to a list + for tok in doc: + if not tok.is_punct and not tok.is_space and tok.text not in symbols: + tokens.append(tok.text) + + return " ".join(tokens) + + def resolveAbbreviations(self,string): + return string #todo + + + def keepPOSandENT(self, string, customPOS=None, customEnt=None, remove=None): + + pos2keep = self.pos2keep + (customPOS if customPOS is not None else []) + ent = self.entities2keep + (customEnt if customEnt is not None else []) + + if hasattr(remove, '__iter__'): + for k in remove: + try: + ent.remove(k) + except ValueError: + try: + pos2keep.remove(k) + except ValueError: + pass + + # parse with spaCy + spacy_doc = self.parser(string) + tokens = [] + + # append Tokens to a list + for tok in spacy_doc: + + if tok.pos_ in pos2keep: + tokens.append(tok.text) + + if tok.ent_type_ in ent: + tokens.append(tok.text) + + return " ".join(set(tokens)) + + + + def removeWords(self,string, custom_words=None, keep=None, lemmatize=False): + + wordlist = self.stop_words + (custom_words if custom_words is not None else []) + if hasattr(keep, 
'__iter__'):
+            for k in keep:
+                try:
+                    wordlist.remove(k)
+                except ValueError:
+                    pass
+
+
+
+        string = self.urlFinder.sub("URL", string)
+        string = self.emailFinder.sub("EMAIL", string)
+        string = self.mentionFinder.sub("MENTION", string)
+        string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
+
+
+        # parse with spaCy
+        spacy_doc = self.parser(string)
+        tokens = []
+
+        # append Tokens to a list
+        for tok in spacy_doc:
+
+            # do not include stopwords/customwords and single chars
+            if tok.text not in wordlist and len(tok) > 1:
+                if lemmatize:
+                    tokens.append(tok.lemma_)
+                else:
+                    tokens.append(tok.lower_)
+        return " ".join(set(tokens))
+
+
+
+
+
+
+
+    def normalizeSynonyms(self, string, default_return_first_Syn=False):
+        # parse with spaCy
+        spacy_doc = self.parser(string)
+        tokens = []
+
+        tokens = [str(self.getFirstSynonym(tok, self.thesaurus, default_return_first_Syn=default_return_first_Syn)) for tok in spacy_doc]
+
+        return " ".join(set(tokens))
+
+
+
+    def getFirstSynonym(self, word, thesaurus, default_return_first_Syn=False):
+        if not isinstance(word, str):
+            return word
+
+
+        word = word.lower()
+
+
+        # iterate over the thesaurus
+        for syn_block in thesaurus:  # syn_block is a list of synonyms
+
+            for syn in syn_block:
+                syn = syn.lower()
+                if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
+                    if word == syn:
+                        return self.getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)
+                else:  # if it is a phrase
+                    if word in syn:
+                        return self.getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)
+        return word  # as a fallback, return the original word
+
+    def getHauptform(self, syn_block, word, default_return_first_Syn=False):
+
+        for syn in syn_block:
+            syn = syn.lower()
+
+            if "hauptform" in syn and len(syn.split(" ")) <= 2:
+                # do not return it if it is in parentheses
+                for w in syn.split(" "):
+                    if not re.match(r'\([^)]+\)', w):
+                        return w
+
+        if default_return_first_Syn:
+            # if no Hauptform is found, return the first synonym that is not a phrase and not in parentheses
+            for w in syn_block:
+                if not re.match(r'\([^)]+\)', w):
+                    return w
+        return word  # as a fallback, return the original word
+
+
+
+
+cleaner = TextCleaner(parser=spacy.load('de'))
+
+string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
+
+
+#################################################################################################################
+
+#todo: does not quite work as intended yet: https://mathieularose.com/function-composition-in-python/
+def compose(*functions):
+    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
+
+pipeline = compose(cleaner.keepPOSandENT)#, cleaner.normalizeSynonyms)
+
+#################################################################################################################
+print(cleaner.removePunctuation(string))
+print(cleaner.keepPOSandENT(string))
+
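
Usage note (illustrative, not part of the patch): a minimal sketch of how the refactored cleaner is meant to be driven, assuming textCleaning.py and openthesaurus.csv sit next to the calling script and the spaCy/textacy versions the patch targets are installed. It chains the methods in the same order generateTextfromTicketXML now uses (keepPOSandENT -> removeWords -> normalizeSynonyms); the file name example_usage.py is hypothetical.

# example_usage.py (hypothetical) -- mirrors the call chain in preprocessing.py
import spacy
from textCleaning import TextCleaner

parser = spacy.load('de')             # German model, as loaded in the patch
cleaner = TextCleaner(parser=parser)  # falls back to openthesaurus.csv for synonyms

raw = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge."
# keep nouns/entities, drop stopwords and single characters, then map words to their Hauptform
cleaned = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(raw)))
print(cleaned)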