# topicModelingTickets/test.py

# -*- coding: utf-8 -*-
import functools
import re
import xml.etree.ElementTree as ET

import spacy
import textacy

# Path of the ticket XML file that is read at the bottom of this script.
path2xml = "ticketSamples.xml"
import de_core_news_md


# Load the de_core_news_md model once at import time; it is also the default
# parser of processTextstream.
PARSER = de_core_news_md.load()
# Corpus built on that parser; the cleaned ticket texts are added to it below.
corpus = textacy.Corpus(PARSER)


def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
    """
    Generate the text of every ``main_textfield`` element of a ticket XML file.

    The children of the XML root are treated as tickets and each ticket's
    direct children as fields; only fields whose tag equals
    ``main_textfield`` are used.

    :param path2xml: path of the XML file to parse
    :param main_textfield: tag name of the field whose text is yielded
    :yields: one string per matching field; an empty field yields ``""``
    """
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        for field in ticket:
            if field.tag == main_textfield:
                # ElementTree reports the text of an empty element as None;
                # yield "" instead so consumers always receive a str.
                yield field.text or ""


def printRandomDoc(textacyCorpus):
    """
    Print the corpus size and one uniformly chosen document with its metadata.

    For an empty corpus only the size line is printed.

    :param textacyCorpus: corpus supporting ``len()`` and integer indexing;
        each document must expose ``text`` and ``metadata`` attributes
    """
    import random

    size = len(textacyCorpus)
    print()

    print("len(textacyCorpus) = %i" % size)
    if size:
        # randrange picks every index 0..size-1 with equal probability.
        # Scaling random.random() by (size - 1) and truncating can never
        # reach the last index and indexes into an empty corpus.
        randIndex = random.randrange(size)
        doc = textacyCorpus[randIndex]
        print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, doc.text, doc.metadata))

    print()


def processTextstream(textstream, funclist, parser=PARSER):
    """
    Lazily turn a stream of raw texts into a stream of cleaned texts.

    :param textstream: iterable of strings handed to ``parser.pipe``
    :param funclist: predicates applied in order with ``filter``; a token
        survives only if every predicate returns a truthy value for it
    :param parser: object providing ``pipe``; defaults to the module-level PARSER
    :yields: per parsed document, the ``lower_`` forms of the surviving
        tokens joined with single spaces
    """
    for parsed in parser.pipe(textstream):
        # Stack one lazy filter per predicate on top of the document's tokens.
        survivors = functools.reduce(
            lambda remaining, predicate: filter(predicate, remaining),
            funclist,
            list(parsed),
        )
        yield " ".join(token.lower_ for token in survivors)


def keepPOS(pos_list):
    """Build a token predicate that is true when the token's ``pos_`` is in ``pos_list``."""
    def is_kept(tok):
        return tok.pos_ in pos_list

    return is_kept

def removePOS(pos_list):
    """Build a token predicate that is true when the token's ``pos_`` is not in ``pos_list``."""
    def is_not_removed(tok):
        return tok.pos_ not in pos_list

    return is_not_removed

def removeWords(words, keep=None):
    """
    Build a token predicate that rejects tokens whose ``lower_`` form is in ``words``.

    :param words: a word or an iterable of words to remove; the argument
        itself is never modified
    :param keep: optional word or iterable of words that must not be removed
        even if they occur in ``words``
    :return: function mapping a token to True when the token should stay
    """
    # A bare string is one word, not a sequence of characters.
    if isinstance(words, str):
        words = [words]
    if isinstance(keep, str):
        keep = [keep]

    # Work on a private set: the caller's list stays untouched, duplicates of
    # a kept word cannot survive, and each membership test is a hash lookup.
    blocked = set(words)
    if hasattr(keep, '__iter__'):
        blocked.difference_update(keep)

    return lambda tok: tok.lower_ not in blocked

# Case-insensitive pattern for e-mail addresses delimited by word boundaries.
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)


def replaceEmails(replace_with="EMAIL"):
    """
    Build a function that substitutes e-mail addresses in a token's text.

    :param replace_with: replacement passed to ``emailFinder.sub``
    :return: function mapping a token to its ``lower_`` text with every
        match of ``emailFinder`` replaced by ``replace_with``
    """
    def substitute(tok):
        return emailFinder.sub(replace_with, tok.lower_)

    return substitute


# Stop-word list read from the STOP_WORDS attribute of the module "spacy.<PARSER.lang>".
stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)

# Functions handed to processTextstream, which applies each of them to the
# tokens of a document via filter().
clean_in_content=[
    removePOS(["SPACE"]),
    removePOS(["PUNCT"]),
    removeWords(stop_words,keep=["und"]),
    # NOTE(review): replaceEmails is passed uncalled, so filter() invokes it with a
    # token as replace_with and receives a function object (always truthy) -- no
    # token is dropped and no e-mail address is replaced. Looks unintended; confirm.
    replaceEmails
]


## add files to textacy-corpus,
# Stream the ticket texts from the XML file through the cleaning functions and
# add the resulting strings to the corpus.
print("add texts to textacy-corpus...")
corpus.add_texts(
    processTextstream(generateMainTextfromTicketXML(path2xml), clean_in_content),
)

# Print the corpus size and one randomly picked document.
printRandomDoc(corpus)

# TODO: see https://stackoverflow.com/questions/15200048/how-to-get-the-parameters-type-and-return-type-of-a-function
preprocessingpipe verfeinert 2017-09-11 17:29:54 +02:00
			`# -*- coding: utf-8 -*-`
			`import functools`
			`import re`
			`import xml.etree.ElementTree as ET`

			`import spacy`
			`import textacy`

			`path2xml = "ticketSamples.xml"`
			`import de_core_news_md`


			`PARSER = de_core_news_md.load()`
			`corpus = textacy.Corpus(PARSER)`



			`def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):`
			`"""`
			`generates strings from XML`
			`:param path2xml:`
			`:param main_textfield:`
			`:param cleaning_function:`
			`:yields strings`
			`"""`
			`tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))`
			`root = tree.getroot()`

			`for ticket in root:`
			`for field in ticket:`
			`if field.tag == main_textfield:`
			`yield field.text`


			`def printRandomDoc(textacyCorpus):`
			`import random`
			`print()`

			`print("len(textacyCorpus) = %i" % len(textacyCorpus))`
			`randIndex = int((len(textacyCorpus) - 1) * random.random())`
			`print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))`

			`print()`



			`def processTextstream(textstream, funclist, parser=PARSER):`
			`# input:str-stream output:str-stream`
			`pipe = parser.pipe(textstream)`

			`for doc in pipe:`
			`tokens = [tok for tok in doc]`
			`for f in funclist:`
			`tokens = filter(f,tokens)`
			`#tokens = map(funclist,tokens)`
			`yield " ".join([tok.lower_ for tok in tokens])`




			`def keepPOS(pos_list):`
			`return lambda tok : tok.pos_ in pos_list`

			`def removePOS(pos_list):`
			`return lambda tok : tok.pos_ not in pos_list`

			`def removeWords(words, keep=None):`
			`#todo in:str oder str-list`
			`if hasattr(keep, '__iter__'):`
			`for k in keep:`
			`try:`
			`words.remove(k)`
			`except ValueError:`
			`pass`
			`return lambda tok : tok.lower_ not in words`

			`emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)`


			`def replaceEmails(replace_with="EMAIL"):`
			`return lambda tok : emailFinder.sub(replace_with, tok.lower_)`


			`stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)`

			`clean_in_content=[`
			`removePOS(["SPACE"]),`
			`removePOS(["PUNCT"]),`
			`removeWords(stop_words,keep=["und"]),`
			`replaceEmails`
			`]`



			`## add files to textacy-corpus,`
			`print("add texts to textacy-corpus...")`
			`corpus.add_texts(`
			`processTextstream(generateMainTextfromTicketXML(path2xml), clean_in_content),`
			`)`

			`printRandomDoc(corpus)`

			`#todo https://stackoverflow.com/questions/15200048/how-to-get-the-parameters-type-and-return-type-of-a-function`