topicModelingTickets/old/testo.py

# -*- coding: utf-8 -*-
import functools
import re

import spacy
import textacy
from spacy.tokens import Doc
from spacy.tagger import Tagger

import xml.etree.ElementTree as ET

# Module-wide spaCy pipeline loaded via the 'de' shortcut; it is the default
# ``parser`` argument of cleanText_POS and is used by the script section below.
PARSER = spacy.load('de')
# Stop-word list taken from the STOP_WORDS attribute of the "spacy.<lang>"
# module. The non-empty fromlist makes __import__ return that submodule
# instead of the top-level ``spacy`` package.
stop_words = list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)

def compose(*functions):
    """Chain single-argument callables into one callable.

    The callables are applied from right to left, so ``compose(f, g)(x)``
    equals ``f(g(x))``. Called without arguments, the result simply returns
    its input unchanged.

    :param functions: callables that each accept one value and return one value
    :return: callable that feeds its argument through all of ``functions``
    """
    def composed(x):
        # The last function given runs first, the first one runs last.
        for func in reversed(functions):
            x = func(x)
        return x

    return composed


def cleanTexts(textstream, parser, attr):
    """Stream texts through ``parser`` and drop every token listed in ``attr``.

    A token is removed when its coarse POS tag (``pos_``), fine-grained tag
    (``tag_``), entity type (``ent_type_``), exact text or lower-cased text
    is contained in ``attr``.

    :param textstream: iterable of str
    :param parser: object whose ``pipe(textstream)`` yields one iterable of
        tokens per text (presumably a spaCy language pipeline)
    :param attr: iterable of hashable values (str) to filter out
    :yields: str, the texts of the remaining tokens joined by single spaces
    """
    # Built once so every per-token membership test is a constant-time lookup.
    excluded = frozenset(attr)

    for doc in parser.pipe(textstream):
        # The entity label is read from ``ent_type_``; spaCy tokens have no
        # ``ent_`` attribute, so reading it raised AttributeError.
        yield " ".join(
            tok.text for tok in doc
            if excluded.isdisjoint(
                (tok.pos_, tok.tag_, tok.ent_type_, tok.text, tok.lower_)
            )
        )


def cleanTexts_POS(spacypipe, keeponly=False, attr=["SPACE", "PUNCT"]):
    """
    Filter the tokens of every document in a stream by their pos_ value.

    :param spacypipe: iterable of documents, each an iterable of tokens with text and pos_
    :param keeponly: bool. If true, only tokens whose pos_ is in attr are kept; otherwise those tokens are dropped
    :param attr: [str] values compared against each token's pos_
    :yields: str, one space-joined cleaned text per document
    """
    for doc in spacypipe:
        keep_listed = bool(keeponly)
        selected = []
        for tok in doc:
            # A token survives when "is listed" matches the requested mode.
            if (tok.pos_ in attr) == keep_listed:
                selected.append(tok.text)
        yield " ".join(selected)

def cleanText_POS(text,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
    """
    Parse a single text and filter its tokens by their pos_ value.

    :param text: str
    :param parser: callable turning a str into an iterable of tokens with text and pos_
    :param keeponly: bool. If true, only tokens whose pos_ is in attr are kept; otherwise those tokens are dropped
    :param attr: [str] values compared against each token's pos_
    :return: str, the texts of the remaining tokens joined by single spaces
    """
    keep_listed = bool(keeponly)
    selected = []
    for tok in parser(text):
        # A token survives when "is listed" matches the requested mode.
        if (tok.pos_ in attr) == keep_listed:
            selected.append(tok.text)
    return " ".join(selected)


def removeWhitespace(string):
    """Collapse every run of whitespace in ``string`` into one space.

    Line breaks count as whitespace, so a newline followed by indentation
    becomes a single space. Leading and trailing whitespace is collapsed
    but not stripped.

    :param string: str
    :return: str with each whitespace run replaced by ' '
    """
    # ``\s+`` alone covers \r\n, \r and \n. Listing them as separate
    # alternatives made them match one at a time, leaving runs of spaces.
    return re.sub(r'\s+', ' ', string)

def removeWords(string, words):
    """Delete every literal occurrence of the given words from ``string``.

    Matching is case-sensitive and substring-based: an entry is also removed
    when it occurs inside a longer word. Surrounding whitespace is left
    untouched.

    :param string: str to clean
    :param words: iterable of str, each matched literally (regex characters are escaped)
    :return: str with all matches replaced by the empty string
    """
    escaped = [re.escape(word) for word in words]
    pattern = '|'.join(escaped)
    return re.sub(pattern, "", string)


def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung', cleaning_function=None):
    """
    Generate one main-text string per ticket from an XML file.

    Every child of the XML root is treated as a ticket. For each ticket the
    text of the child element tagged ``main_textfield`` is yielded; if that
    tag occurs more than once, the last occurrence wins. A ticket without
    such an element yields the placeholder "ERROR".

    :param path2xml: path (or file object) of the XML document to parse
    :param main_textfield: tag name of the element holding the main text
    :param cleaning_function: optional callable str -> str applied to the text
    :yields: str
    """
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))

    for ticket in tree.getroot():
        text = "ERROR"
        for field in ticket:
            if field.tag != main_textfield:
                continue
            # An empty element has ``text is None``; normalise it to "" so the
            # cleaning function and downstream consumers always get a str.
            text = field.text or ""
            if cleaning_function:
                text = cleaning_function(text)
        yield text

def generateMetadatafromTicketXML(path2xml, key_function_pairs_to_clean, leave_out=['Beschreibung']):
    """
    Generate one metadata dict per ticket from an XML file.

    Every child of the XML root is treated as a ticket. Each of its child
    elements becomes a ``tag -> text`` entry, except for tags listed in
    ``leave_out``. If a tag has an entry in ``key_function_pairs_to_clean``,
    that function is applied to the text before it is stored.

    :param path2xml: path (or file object) of the XML document to parse
    :param key_function_pairs_to_clean: mapping of tag name -> callable str -> value
    :param leave_out: container of tag names to skip
    :yields: dict mapping tag name to the (possibly cleaned) text; empty elements map to None
    """
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))

    for ticket in tree.getroot():
        metadata = {}
        for field in ticket:
            if field.tag in leave_out:
                continue

            value = field.text
            # An empty element has ``text is None``. The cleaning functions
            # expect a str, so they are skipped and None is stored.
            if value is not None and field.tag in key_function_pairs_to_clean:
                value = key_function_pairs_to_clean[field.tag](value)
            metadata[field.tag] = value

        yield metadata


# Sample ticket text for manual experiments; in this section it is only
# referenced by the commented-out removeWords call below.
string = "Frau Hinrichs überdenkt die tu Situation a@bc.de und 545453 macht ' dann neue Anträge. \n           Dieses Ticket wird geschlossen \n            \n test"

#print(removeWords(string,["die", "neue"]))

# in:str out:str
# compose applies its arguments right to left: the replace_emails partial runs
# first (presumably swapping e-mail addresses for 'EMAIL' -- confirm against the
# installed textacy version), then cleanText_POS filters the result.
cleanString = compose(
    cleanText_POS,
    functools.partial(textacy.preprocess.replace_emails, replace_with=u'EMAIL')
)

# Cleaning function per metadata field, keyed by XML tag name; this is the shape
# generateMetadatafromTicketXML expects for its second argument.
key_function_pairs_to_clean = {
    "Loesung":removeWhitespace,
    "Zusammenfassung":cleanText_POS
}
# NOTE: the string literal below is disabled code; removeSTOP, lemmatize and
# cleanEnt are not defined in the visible part of this file.
"""
# in:str-gen out:str-gen
cleanStream = compose(
    removeSTOP,
    lemmatize,
    cleanEnt
)
"""
# content: xml -> stringCleaning -> pipe -> docCleaning -> corpus
# metadata:xml -> -> stringCleaning -> corpus

# Corpus built on the shared module-level pipeline.
corpus = textacy.Corpus(PARSER)


# Add the main text of every ticket in ticketSamples.xml, after cleanTexts has
# dropped tokens matching "PUNCT", "SPACE" or "PERSON". The metadata generator
# is currently commented out, so no metadata is attached.
corpus.add_texts(
    cleanTexts(generateMainTextfromTicketXML("ticketSamples.xml"),PARSER,["PUNCT","SPACE","PERSON"])#,
    #generateMetadatafromTicketXML("ticketSamples.xml",key_function_pairs_to_clean=key_function_pairs_to_clean)
)

# Print the text of the first document in the corpus.
print(corpus[0].text)
fehlende ordner geadded 2017-09-11 13:03:20 +02:00			`# -- coding: utf-8 --`
			`import functools`
			`import re`

			`import spacy`
			`import textacy`
			`from spacy.tokens import Doc`
			`from spacy.tagger import Tagger`

			`import xml.etree.ElementTree as ET`

			`PARSER = spacy.load('de')`
			`stop_words = list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)`

			`def compose(*functions):`
			`def compose2(f, g):`
			`return lambda x: f(g(x))`
			`return functools.reduce(compose2, functions, lambda x: x)`


			`def cleanTexts(textstream, parser, attr):`

			`#input str-stream output str-stream`
			`pipe = parser.pipe(textstream)`

			`for doc in pipe:`

			`tokens = [tok.text for tok in doc`
			`if tok.pos_ not in attr`
			`and tok.tag_ not in attr`
			`and tok.ent_ not in attr`
			`and tok.text not in attr`
			`and tok.lower_ not in attr]`


			`yield " ".join(tokens)`


			`"""`
			`def cleanDoc_lemmatize(doc,parser=PARSER):`
			`return parser(" ".join([tok.lemma_ for tok in doc ]))`


			`def cleanDoc_STOPS(doc,parser=PARSER, stop_words=None, keep=None):`
			`if stop_words is None:`
			`stop_words = list(__import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS)`

			`if hasattr(keep, '__iter__'):`
			`for k in keep:`
			`try:`
			`stop_words.remove(k)`
			`except ValueError:`
			`pass`

			`return parser(" ".join([tok.text for tok in doc if tok.text not in stop_words]))`



			`def cleanDoc_ENT(doc,parser=PARSER, keeponly=False, attr=["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]):`
			`if keeponly:`
			`return parser(" ".join([tok.text for tok in doc if tok.ent_ in attr]))`
			`else:`
			`return parser(" ".join([tok.text for tok in doc if tok.ent_ not in attr]))`



			`def cleanDoc_POS(doc,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):`
			`if keeponly:`
			`return parser(" ".join([tok.text for tok in doc if tok.pos_ in attr]))`
			`else:`
			`return parser(" ".join([tok.text for tok in doc if tok.pos_ not in attr]))`
			`"""`


			`def cleanTexts_POS(spacypipe, keeponly=False, attr=["SPACE", "PUNCT"]):`
			`"""`
			`:param spacypipe: spacypipe`
			`:param keeponly: bool . If True, only attr will be kept. If false, all attr will be deleted`
			`:param attr: [str] pos_ or ent_type_`
			`:yields: stream of strings: full-length cleaned text`
			`"""`
			`if keeponly:`
			`for doc in spacypipe:`
			`yield " ".join([tok.text for tok in doc if tok.pos_ in attr])`

			`else:`
			`for doc in spacypipe:`
			`yield " ".join([tok.text for tok in doc if tok.pos_ not in attr])`

			`def cleanText_POS(text,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):`
			`"""`
			`:param txt: str`
			`:param keeponly: bool . If True, only attr will be kept. If false, all attr will be deleted`
			`:param attr: [str] pos_ or ent_type_`
			`:return: str`
			`"""`
			`doc = parser(text)`

			`if keeponly:`
			`return " ".join([tok.text for tok in doc if tok.pos_ in attr])`
			`else:`
			`return " ".join([tok.text for tok in doc if tok.pos_ not in attr])`


			`def removeWhitespace(string):`
			`return re.sub(r'(\r\n\|\r\|\n\|(\s)+)', ' ', string)`

			`def removeWords(string, words):`
			`big_regex = re.compile('\|'.join(map(re.escape, words)))`
			`return big_regex.sub("", string)`






			`def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung', cleaning_function=None):`
			`"""`
			`generates strings from XML`
			`:param path2xml:`
			`:param main_textfield:`
			`:param cleaning_function:`
			`:yields strings`
			`"""`
			`import xml.etree.ElementTree as ET`

			`tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))`
			`root = tree.getroot()`


			`for ticket in root:`
			`text = "ERROR"`
			`for field in ticket:`
			`if field.tag == main_textfield:`
			`if cleaning_function:`
			`text = cleaning_function(field.text)`
			`else:`
			`text = field.text`
			`yield text`

			`def generateMetadatafromTicketXML(path2xml, key_function_pairs_to_clean, leave_out=['Beschreibung']):`
			`import xml.etree.ElementTree as ET`

			`tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))`

			`root = tree.getroot()`

			`for ticket in root:`
			`metadata = {}`
			`for field in ticket:`
			`if field.tag not in leave_out:`

			`if field.tag in key_function_pairs_to_clean:`
			`metadata[field.tag] = key_function_pairs_to_clean[field.tag](field.text)`
			`else:`
			`metadata[field.tag] = field.text`

			`yield metadata`




			`string = "Frau Hinrichs überdenkt die tu Situation a@bc.de und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"`

			`#print(removeWords(string,["die", "neue"]))`

			`# in:str out:str`
			`cleanString = compose(`
			`cleanText_POS,`
			`functools.partial(textacy.preprocess.replace_emails, replace_with=u'EMAIL')`
			`)`

			`key_function_pairs_to_clean = {`
			`"Loesung":removeWhitespace,`
			`"Zusammenfassung":cleanText_POS`
			`}`
			`"""`
			`# in:str-gen out:str-gen`
			`cleanStream = compose(`
			`removeSTOP,`
			`lemmatize,`
			`cleanEnt`
			`)`
			`"""`
aufgeräumt 2017-10-16 14:01:38 +02:00			`# content: xml -> stringCleaning -> pipe -> docCleaning -> corpi`
			`# metadata:xml -> -> stringCleaning -> corpi`
fehlende ordner geadded 2017-09-11 13:03:20 +02:00
			`corpus = textacy.Corpus(PARSER)`




			`corpus.add_texts(`
			`cleanTexts(generateMainTextfromTicketXML("ticketSamples.xml"),PARSER,["PUNCT","SPACE","PERSON"])#,`
			`#generateMetadatafromTicketXML("ticketSamples.xml",key_function_pairs_to_clean=key_function_pairs_to_clean)`
			`)`

			`print(corpus[0].text)`