# -*- coding: utf-8 -*-
"""Read ticket texts from an XML file, clean them and add them to a textacy corpus.

The script parses ``ticketSamples.xml``, runs every main-text field through
the German spaCy model, drops or rewrites tokens according to the functions
in ``clean_in_content`` and stores the resulting strings in ``corpus``.
"""

import functools
import importlib
import random
import re
import xml.etree.ElementTree as ET

import spacy
import textacy

import de_core_news_md

path2xml = "ticketSamples.xml"

PARSER = de_core_news_md.load()

corpus = textacy.Corpus(PARSER)


def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
    """
    Generate the text of every ``main_textfield`` element found in the XML file.

    The file's root element is iterated child by child (one child per ticket)
    and each ticket's direct children are compared against ``main_textfield``.

    :param path2xml: path of the XML file to parse
    :param main_textfield: tag name of the element holding the main text
    :yields: one string per matching element; an element without text
        yields ``""`` so the output stays aligned with the matching fields
    """
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        for field in ticket:
            if field.tag == main_textfield:
                # ElementTree reports an empty element's text as None, which
                # the parser cannot tokenize; hand on an empty string instead.
                yield field.text if field.text is not None else ""


def printRandomDoc(textacyCorpus):
    """
    Print the corpus size and the text and metadata of one random document.

    :param textacyCorpus: corpus supporting ``len()`` and integer indexing,
        whose items expose ``text`` and ``metadata``
    """
    print()
    print("len(textacyCorpus) = %i" % len(textacyCorpus))

    if len(textacyCorpus) == 0:
        # Nothing to sample from; indexing would raise IndexError.
        print()
        return

    # randrange covers every valid index, including the last one.
    randIndex = random.randrange(len(textacyCorpus))
    randDoc = textacyCorpus[randIndex]
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, randDoc.text, randDoc.metadata))
    print()


def _applyTokenFunctions(tok, funclist):
    """Return the output text for ``tok``, or ``None`` if any function rejects it."""
    text = tok.lower_
    for func in funclist:
        result = func(tok)
        if not result:
            # False, None or an empty replacement string: drop the token.
            return None
        if isinstance(result, str):
            # A string result is a replacement for the token text.
            text = result
    return text


def processTextstream(textstream, funclist, parser=PARSER):
    """
    Parse a stream of strings and yield one cleaned string per input string.

    Every function in ``funclist`` is called with each token, in list order:

    * a falsy result removes the token from the output,
    * a non-empty string result replaces the token's text,
    * any other truthy result keeps the token unchanged.

    Tokens that are kept without replacement are emitted lower-cased.

    :param textstream: iterable of strings
    :param funclist: list of callables taking a token
    :param parser: object whose ``pipe`` method turns strings into documents
    :yields: the surviving token texts of each document joined by spaces
    """
    for doc in parser.pipe(textstream):
        kept = []
        for tok in doc:
            text = _applyTokenFunctions(tok, funclist)
            if text is not None:
                kept.append(text)
        yield " ".join(kept)


def keepPOS(pos_list):
    """Return a predicate that is true for tokens whose ``pos_`` is in ``pos_list``."""
    return lambda tok: tok.pos_ in pos_list


def removePOS(pos_list):
    """Return a predicate that is true for tokens whose ``pos_`` is not in ``pos_list``."""
    return lambda tok: tok.pos_ not in pos_list


def removeWords(words, keep=None):
    """
    Return a predicate that is false for tokens whose lower-cased text is in ``words``.

    The caller's ``words`` collection is left untouched; the words to block
    are copied into a private set.

    :param words: a single word or an iterable of words to remove
    :param keep: a single word or an iterable of words that must not be
        removed even if they occur in ``words``; non-iterable values are ignored
    """
    if isinstance(words, str):
        words = [words]

    if isinstance(keep, str):
        # A bare string would otherwise be iterated character by character.
        keep = [keep]
    elif not hasattr(keep, '__iter__'):
        keep = ()

    blocked = set(words).difference(keep)

    return lambda tok: tok.lower_ not in blocked


emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)


def replaceEmails(replace_with="EMAIL"):
    """
    Return a function mapping a token to its lower-cased text with e-mail
    addresses substituted by ``replace_with``.

    The returned function yields a string, so ``processTextstream`` treats it
    as a text replacement rather than as a filter.
    """
    return lambda tok: emailFinder.sub(replace_with, tok.lower_)


def _loadStopWords(lang):
    """Return the ``STOP_WORDS`` collection of spaCy's module for language code ``lang``."""
    try:
        module = importlib.import_module("spacy." + lang)
    except ImportError:
        # NOTE(review): newer spaCy releases keep the language modules under
        # ``spacy.lang``; confirm against the installed version.
        module = importlib.import_module("spacy.lang." + lang)
    return module.STOP_WORDS


stop_words = list(_loadStopWords(PARSER.lang))

clean_in_content = [
    removePOS(["SPACE"]),
    removePOS(["PUNCT"]),
    removeWords(stop_words, keep=["und"]),
    # The factory has to be called: passing ``replaceEmails`` itself would
    # hand every token to the factory and never touch the text.
    replaceEmails(),
]


## add files to textacy-corpus,
print("add texts to textacy-corpus...")
corpus.add_texts(
    processTextstream(generateMainTextfromTicketXML(path2xml), clean_in_content),
)

printRandomDoc(corpus)

# TODO: https://stackoverflow.com/questions/15200048/how-to-get-the-parameters-type-and-return-type-of-a-function