# -*- coding: utf-8 -*-
"""Helpers that turn the text fields of an XML ticket file into cleaned strings.

The cleaned strings are meant to be added to the module-level textacy
``corpus``, which is built on the German spaCy model loaded below.
"""
import functools
import re
import xml.etree.ElementTree as ET

import de_core_news_md
import spacy
import textacy

path2xml = "ticketSamples.xml"

# The model is loaded once at import time and shared by the corpus and by
# processTextstream's default parser.
PARSER = de_core_news_md.load()
corpus = textacy.Corpus(PARSER)


def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
    """
    Yield the text of every ``main_textfield`` element of every ticket.

    The children of the XML root are treated as tickets and the direct
    children of each ticket as its fields.

    :param path2xml: path of the XML file to parse
    :param main_textfield: tag name of the field whose text is yielded
    :yields: str, one per matching field ("" for a field without text)
    """
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))

    for ticket in tree.getroot():
        for field in ticket:
            if field.tag == main_textfield:
                # ElementTree reports an empty element as text None; emit ""
                # so the consumer always receives a string.
                yield field.text if field.text is not None else ""


def printRandomDoc(textacyCorpus):
    """
    Print the size of the corpus and one randomly chosen document.

    For an empty corpus only the size is printed.

    :param textacyCorpus: indexable corpus whose items expose ``text`` and
        ``metadata``
    """
    import random

    print()

    doc_count = len(textacyCorpus)
    print("len(textacyCorpus) = %i" % doc_count)
    if doc_count:
        # randrange covers every index, including the last one.
        randIndex = random.randrange(doc_count)
        doc = textacyCorpus[randIndex]
        print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, doc.text, doc.metadata))

    print()


def processTextstream(textstream, funclist, parser=PARSER):
    """
    Parse each text and yield it as a cleaned, lower-cased, space-joined string.

    Every function in ``funclist`` is called with each token, in order:

    * a falsy result drops the token (later functions are not called for it),
    * a non-empty ``str`` result replaces the text emitted for the token
      (if several functions return a string, the last one wins),
    * any other truthy result keeps the token unchanged.

    :param textstream: iterable of str
    :param funclist: iterable of callables that take a token
    :param parser: object whose ``pipe`` method turns the texts into
        iterables of tokens
    :yields: str, one per input text
    """
    # Materialise once: the functions are walked again for every token, which
    # would silently exhaust a one-shot iterator after the first token.
    funcs = list(funclist)

    for doc in parser.pipe(textstream):
        words = []
        for tok in doc:
            text = tok.lower_
            for func in funcs:
                result = func(tok)
                if not result:
                    break
                if isinstance(result, str):
                    text = result
            else:
                # No function rejected the token.
                words.append(text)
        yield " ".join(words)


def keepPOS(pos_list):
    """Return a predicate that is true for tokens whose ``pos_`` is in ``pos_list``."""
    return lambda tok: tok.pos_ in pos_list


def removePOS(pos_list):
    """Return a predicate that is true for tokens whose ``pos_`` is not in ``pos_list``."""
    return lambda tok: tok.pos_ not in pos_list


def removeWords(words, keep=None):
    """
    Return a predicate that is true for tokens whose lower-cased text is not
    one of ``words``.

    The caller's ``words`` collection is copied, never modified.

    :param words: a single word or an iterable of words to reject
    :param keep: optional word or iterable of words that must not be
        rejected even if they occur in ``words``
    """
    if isinstance(words, str):
        words = [words]
    # A private set gives constant-time lookups per token and leaves the
    # caller's collection untouched.
    blocked = set(words)

    if isinstance(keep, str):
        # A bare string is one word, not a sequence of characters.
        keep = [keep]
    if hasattr(keep, '__iter__'):
        blocked.difference_update(keep)

    return lambda tok: tok.lower_ not in blocked


emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)


def replaceEmails(replace_with="EMAIL"):
    """
    Return a function that maps a token to its lower-cased text with every
    e-mail address replaced by ``replace_with``.

    The returned function yields a string, not a boolean.
    """
    return lambda tok: emailFinder.sub(replace_with, tok.lower_)

# Stop words are read from the ``STOP_WORDS`` attribute of the module named
# "spacy.<language code of the parser>".
stop_words = list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)

# Per-token functions handed to processTextstream, in order of application.
clean_in_content = [
    removePOS(["SPACE"]),
    removePOS(["PUNCT"]),
    removeWords(stop_words, keep=["und"]),
    # replaceEmails is a factory: call it so the list holds the per-token
    # function rather than the factory itself.
    replaceEmails(),
]


# Add the cleaned ticket texts to the textacy corpus.
print("add texts to textacy-corpus...")
corpus.add_texts(
    processTextstream(generateMainTextfromTicketXML(path2xml), clean_in_content),
)

printRandomDoc(corpus)

# TODO: https://stackoverflow.com/questions/15200048/how-to-get-the-parameters-type-and-return-type-of-a-function