# topicModelingTickets/old/testo.py
# -*- coding: utf-8 -*-
import functools
import importlib
import re
import xml.etree.ElementTree as ET

import spacy
import textacy

PARSER = spacy.load('de')

# Stop words shipped with the loaded spaCy language package (spacy.de.STOP_WORDS).
stop_words = list(importlib.import_module("spacy." + PARSER.lang).STOP_WORDS)

def compose(*functions):
    """Compose functions right-to-left: compose(f, g)(x) == f(g(x))."""
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)
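
# Usage sketch: the rightmost function is applied first.
#   inc = lambda x: x + 1
#   dbl = lambda x: x * 2
#   compose(inc, dbl)(3)  # -> 7, i.e. inc(dbl(3))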

def cleanTexts(textstream, parser, attr):
    """Input: stream of strings; output: stream of cleaned strings.

    Drops every token whose coarse POS tag, fine-grained tag, entity type,
    text or lowercased text appears in attr.
    """
    pipe = parser.pipe(textstream)
    for doc in pipe:
        tokens = [tok.text for tok in doc
                  if tok.pos_ not in attr
                  and tok.tag_ not in attr
                  and tok.ent_type_ not in attr  # tokens expose ent_type_, not ent_
                  and tok.text not in attr
                  and tok.lower_ not in attr]
        yield " ".join(tokens)
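
# Usage sketch (assumes the 'de' model is installed; tag names depend on that model):
#   for cleaned in cleanTexts(["Hallo , Welt !"], PARSER, ["PUNCT", "SPACE"]):
#       print(cleaned)  # -> "Hallo Welt"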
"""
def cleanDoc_lemmatize(doc,parser=PARSER):
return parser(" ".join([tok.lemma_ for tok in doc ]))
def cleanDoc_STOPS(doc,parser=PARSER, stop_words=None, keep=None):
if stop_words is None:
stop_words = list(__import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS)
if hasattr(keep, '__iter__'):
for k in keep:
try:
stop_words.remove(k)
except ValueError:
pass
return parser(" ".join([tok.text for tok in doc if tok.text not in stop_words]))
def cleanDoc_ENT(doc,parser=PARSER, keeponly=False, attr=["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]):
if keeponly:
return parser(" ".join([tok.text for tok in doc if tok.ent_ in attr]))
else:
return parser(" ".join([tok.text for tok in doc if tok.ent_ not in attr]))
def cleanDoc_POS(doc,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
if keeponly:
return parser(" ".join([tok.text for tok in doc if tok.pos_ in attr]))
else:
return parser(" ".join([tok.text for tok in doc if tok.pos_ not in attr]))
"""

def cleanTexts_POS(spacypipe, keeponly=False, attr=["SPACE", "PUNCT"]):
    """
    :param spacypipe: spaCy pipe (stream of Doc objects)
    :param keeponly: bool. If True, only tokens whose tag is in attr are kept;
                     if False, those tokens are deleted.
    :param attr: [str] coarse POS tags (tok.pos_)
    :yields: stream of strings: full-length cleaned text
    """
    if keeponly:
        for doc in spacypipe:
            yield " ".join([tok.text for tok in doc if tok.pos_ in attr])
    else:
        for doc in spacypipe:
            yield " ".join([tok.text for tok in doc if tok.pos_ not in attr])
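
# Usage sketch: feed it an existing spaCy pipe.
#   pipe = PARSER.pipe(["Hallo , Welt !"])
#   for cleaned in cleanTexts_POS(pipe, attr=["PUNCT", "SPACE"]):
#       print(cleaned)  # -> "Hallo Welt"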

def cleanText_POS(text, parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
    """
    :param text: str
    :param keeponly: bool. If True, only tokens whose tag is in attr are kept;
                     if False, those tokens are deleted.
    :param attr: [str] coarse POS tags (tok.pos_)
    :return: str
    """
    doc = parser(text)
    if keeponly:
        return " ".join([tok.text for tok in doc if tok.pos_ in attr])
    else:
        return " ".join([tok.text for tok in doc if tok.pos_ not in attr])
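
# Usage sketch: with keeponly=True the filter is inverted.
#   cleanText_POS("Hallo , Welt !", keeponly=True, attr=["PUNCT"])  # -> ", !"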

def removeWhitespace(string):
    # Collapse newlines and other whitespace runs into single spaces.
    return re.sub(r'(\r\n|\r|\n|(\s)+)', ' ', string)

def removeWords(string, words):
    # \b anchors ensure whole words are removed, not substrings of longer words.
    big_regex = re.compile(r'\b(?:' + '|'.join(map(re.escape, words)) + r')\b')
    return big_regex.sub("", string)
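
# Usage sketch: matches whole words only, and leaves the surrounding spaces behind.
#   removeWords("die neue Regel", ["die", "neue"])  # -> "  Regel"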

def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung', cleaning_function=None):
    """
    Generates strings from XML, one per ticket.
    :param path2xml: path to the ticket XML file
    :param main_textfield: tag of the field holding the main text ('Beschreibung' = description)
    :param cleaning_function: optional str -> str function applied to each text
    :yields: strings
    """
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()
    for ticket in root:
        text = "ERROR"  # sentinel in case the field is missing
        for field in ticket:
            if field.tag == main_textfield:
                if cleaning_function:
                    text = cleaning_function(field.text)
                else:
                    text = field.text
        yield text
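
# Usage sketch (assumes "ticketSamples.xml" with 'Beschreibung' fields exists on disk):
#   for text in generateMainTextfromTicketXML("ticketSamples.xml"):
#       print(text)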

def generateMetadatafromTicketXML(path2xml, key_function_pairs_to_clean, leave_out=['Beschreibung']):
    """Yields one metadata dict per ticket, skipping the fields listed in leave_out
    and applying the matching cleaning function to every field named in
    key_function_pairs_to_clean."""
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()
    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag not in leave_out:
                if field.tag in key_function_pairs_to_clean:
                    metadata[field.tag] = key_function_pairs_to_clean[field.tag](field.text)
                else:
                    metadata[field.tag] = field.text
        yield metadata
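
# Usage sketch: every field except 'Beschreibung' lands in the dict, cleaned where requested.
#   for meta in generateMetadatafromTicketXML("ticketSamples.xml",
#                                             key_function_pairs_to_clean={"Loesung": removeWhitespace}):
#       print(meta)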

string = "Frau Hinrichs überdenkt die tu Situation a@bc.de und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
#print(removeWords(string, ["die", "neue"]))

# in: str, out: str -- replace e-mail addresses first, then strip unwanted POS tags
cleanString = compose(
    cleanText_POS,
    functools.partial(textacy.preprocess.replace_emails, replace_with=u'EMAIL')
)
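
# Usage sketch: compose applies right-to-left, so "a@bc.de" becomes "EMAIL"
# before cleanText_POS drops the PUNCT/SPACE tokens.
#   cleanString(string)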

key_function_pairs_to_clean = {
    "Loesung": removeWhitespace,       # Loesung = solution
    "Zusammenfassung": cleanText_POS   # Zusammenfassung = summary
}
"""
# in:str-gen out:str-gen
cleanStream = compose(
removeSTOP,
lemmatize,
cleanEnt
)
"""

# content:  xml -> string cleaning -> pipe -> doc cleaning -> corpus
# metadata: xml ->                    string cleaning      -> corpus
corpus = textacy.Corpus(PARSER)
corpus.add_texts(
    cleanTexts(generateMainTextfromTicketXML("ticketSamples.xml"), PARSER, ["PUNCT", "SPACE", "PERSON"])#,
    #generateMetadatafromTicketXML("ticketSamples.xml", key_function_pairs_to_clean=key_function_pairs_to_clean)
)
print(corpus[0].text)