# -*- coding: utf-8 -*-
import functools
import re
import xml.etree.ElementTree as ET

import spacy
import textacy

PARSER = spacy.load('de')

# stop words for the loaded language, imported dynamically (spacy.<lang>.STOP_WORDS)
stop_words = list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)


def compose(*functions):
    """Compose functions right-to-left: compose(f, g)(x) == f(g(x))."""
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)


def cleanTexts(textstream, parser, attr):
    """Input: stream of str. Output: stream of str, with every token removed
    whose coarse POS tag, fine-grained tag, entity type, text or lowercased
    text matches an entry in attr."""
    pipe = parser.pipe(textstream)
    for doc in pipe:
        tokens = [tok.text for tok in doc
                  if tok.pos_ not in attr
                  and tok.tag_ not in attr
                  and tok.ent_type_ not in attr
                  and tok.text not in attr
                  and tok.lower_ not in attr]
        yield " ".join(tokens)


"""
def cleanDoc_lemmatize(doc, parser=PARSER):
    return parser(" ".join([tok.lemma_ for tok in doc]))


def cleanDoc_STOPS(doc, parser=PARSER, stop_words=None, keep=None):
    if stop_words is None:
        stop_words = list(__import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS)
    if hasattr(keep, '__iter__'):
        for k in keep:
            try:
                stop_words.remove(k)
            except ValueError:
                pass
    return parser(" ".join([tok.text for tok in doc if tok.text not in stop_words]))


def cleanDoc_ENT(doc, parser=PARSER, keeponly=False, attr=["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]):
    if keeponly:
        return parser(" ".join([tok.text for tok in doc if tok.ent_type_ in attr]))
    else:
        return parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in attr]))


def cleanDoc_POS(doc, parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
    if keeponly:
        return parser(" ".join([tok.text for tok in doc if tok.pos_ in attr]))
    else:
        return parser(" ".join([tok.text for tok in doc if tok.pos_ not in attr]))
"""


def cleanTexts_POS(spacypipe, keeponly=False, attr=["SPACE", "PUNCT"]):
    """
    :param spacypipe: spaCy pipe (stream of Doc)
    :param keeponly: bool. If True, only tokens whose pos_ is in attr are kept;
                     if False, all tokens whose pos_ is in attr are deleted.
    :param attr: [str] pos_ values
    :yields: stream of strings: full-length cleaned text
    """
    if keeponly:
        for doc in spacypipe:
            yield " ".join([tok.text for tok in doc if tok.pos_ in attr])
    else:
        for doc in spacypipe:
            yield " ".join([tok.text for tok in doc if tok.pos_ not in attr])


def cleanText_POS(text, parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
    """
    :param text: str
    :param keeponly: bool. If True, only tokens whose pos_ is in attr are kept;
                     if False, all tokens whose pos_ is in attr are deleted.
    :param attr: [str] pos_ values
    :return: str
    """
    doc = parser(text)
    if keeponly:
        return " ".join([tok.text for tok in doc if tok.pos_ in attr])
    else:
        return " ".join([tok.text for tok in doc if tok.pos_ not in attr])


def removeWhitespace(string):
    # collapse line breaks and runs of whitespace into single spaces
    return re.sub(r'\s+', ' ', string)


def removeWords(string, words):
    # note: removes the given words as plain substrings, without word boundaries
    big_regex = re.compile('|'.join(map(re.escape, words)))
    return big_regex.sub("", string)


def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung', cleaning_function=None):
    """
    Generates strings from XML.
    :param path2xml: path to the ticket XML file
    :param main_textfield: tag of the field holding the main text
    :param cleaning_function: optional str -> str function applied to each text
    :yields: strings
    """
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()
    for ticket in root:
        text = "ERROR"  # sentinel in case the main text field is missing
        for field in ticket:
            if field.tag == main_textfield:
                if cleaning_function:
                    text = cleaning_function(field.text)
                else:
                    text = field.text
        yield text


def generateMetadatafromTicketXML(path2xml, key_function_pairs_to_clean, leave_out=['Beschreibung']):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()
    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag not in leave_out:
                if field.tag in key_function_pairs_to_clean:
                    metadata[field.tag] = key_function_pairs_to_clean[field.tag](field.text)
                else:
                    metadata[field.tag] = field.text
        yield metadata


# German sample text used for ad-hoc testing
string = "Frau Hinrichs überdenkt die tu Situation a@bc.de und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"

#print(removeWords(string, ["die", "neue"]))

# in: str, out: str
cleanString = compose(
    cleanText_POS,
    functools.partial(textacy.preprocess.replace_emails, replace_with=u'EMAIL')
)

key_function_pairs_to_clean = {
    "Loesung": removeWhitespace,
    "Zusammenfassung": cleanText_POS
}

"""
# in: str-gen, out: str-gen
cleanStream = compose(
    removeSTOP,
    lemmatize,
    cleanEnt
)
"""

# content:  xml -> string cleaning -> pipe -> doc cleaning -> corpus
# metadata: xml ->                    string cleaning       -> corpus

corpus = textacy.Corpus(PARSER)

corpus.add_texts(
    cleanTexts(generateMainTextfromTicketXML("ticketSamples.xml"), PARSER, ["PUNCT", "SPACE", "PERSON"])#,
    #generateMetadatafromTicketXML("ticketSamples.xml", key_function_pairs_to_clean=key_function_pairs_to_clean)
)

print(corpus[0].text)
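

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original pipeline).
# Assumes only the objects defined above: PARSER, cleanString, cleanTexts.
# The sample sentence below is hypothetical German ticket text.
# ---------------------------------------------------------------------------
sample = u"Frau Müller schreibt an a@bc.de und stellt einen neuen Antrag."

# compose() applies right-to-left: e-mail addresses are replaced first,
# then SPACE/PUNCT tokens are stripped by cleanText_POS.
print(cleanString(sample))

# cleanTexts works on streams; wrap a single string in a list to reuse it here.
print(next(cleanTexts([sample], PARSER, ["PUNCT", "SPACE"])))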