topicModelingTickets/test.py

# -*- coding: utf-8 -*-
import spacy
import textacy
from spacy.tokens import Doc

# -*- coding: utf-8 -*-
import re
import spacy
import functools

import textacy


class TextCleaner:

    def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
        """
        :param parser: spacy-parser
        :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
        :param customClass_symbols:[str]
        :param customClass_words:[str]
        :param customClassPOS:[str]
        :param keep4All: [str]
        """
        if thesaurus is None:
            DATAPATH_thesaurus = "openthesaurus.csv"

            ## !!!!!! list wichtig, da sonst nicht die gleichen Synonyme zurückgegeben werden, weil ein generator während der laufzeit pickt
            self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
        else:
            self.thesaurus = thesaurus

        self.parser = parser

        #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
        self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
        self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
        self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)

        # to keep
        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
        self.pos2keep = ["NOUN"]  # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen  http://aclweb.org/anthology/U15-1013

        """
        
        # to remove
        self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||",
                   ";", ":",
                   "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
        self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else [])


        self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
        self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])


        keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep


        # modify those to remove with those to keep
        for sym in keep:
            try:
                self.symbols.remove(sym)
            except ValueError:
                pass
        for sym in keep:
            try:
                self.stop_words.remove(sym)
            except ValueError:
                pass
        """

    def loadString(self,string):
        self.currentDoc = self.parser(string)


    def removeWhitespace(self, string):
        return " ".join([tok.text for tok in self.currentDoc if not tok.is_space])


    def removePunctuation(self, string, custom_symbols=None, keep=None):
        symbols = self.symbols + (custom_symbols if custom_symbols is not None else [])
        if hasattr(keep, '__iter__'):
            for k in keep:
                try:
                    symbols.remove(k)
                except ValueError:
                    pass

        return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols])


def cleanDoc(doc, toDelete=None, toKeep=None):
    """
    :param doc: spacyDoc
    :param toDelete: [str] pos_ , ent_type_  or  tag_
    :return: str tokenlist
    """
    #keep
    tokenlist = []
    for tok in doc:
        if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep:
            tokenlist.append(tok.text)

    #delete
    tokenlist = [tok.text for tok in doc if tok.pos_ in toDelete and not tok.ent_type_ in toDelete and not tok.tag_ in toDelete]

    result = " ".join(tokenlist)
    return result   #problem: kein doc und daher nicht komponierbar


def keepinDoc(doc, toKeep=None):
    """
    :param doc: spacyDoc
    :param toDelete: [str]
    :return: str tokenlist
    """
    return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep])


#todo  https://mathieularose.com/function-composition-in-python/
parser = spacy.load('de')
cleaner = TextCleaner(parser)
corpus_raw = textacy.Corpus(parser)
corpus_clean = textacy.Corpus(parser)

def foo(doc, toKeep=None):

    words = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
    spaces = [True] * len(words)

    return Doc(doc.vocab,words=words,spaces=spaces)

def foo2(doc, toDelete=None):#, toKeep=None):
    """
    :param doc: spacyDoc
    :param toDelete: [str] pos_ , ent_type_  or  tag_
    :return: str tokenlist
    """
    #keep
    #tokenlist = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]

    #delete

    words = [tok.text for tok in doc if tok.pos_ in toDelete and not tok.ent_type_ in toDelete and not tok.tag_ in toDelete]
    spaces = [True] * len(words)

    return Doc(doc.vocab, words=words, spaces=spaces)


"""
def compose(self,*functions):
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)

def composeo(*functions):
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
"""

def double(a):
    return a*2

def add(a, b):
    return a+b

def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)


#pipeline = compose(removeFromDoc, cleaner.removeWhitespace, cleaner.loadString)
"""
def pipe1(string):
    cleaner.loadString(string)
    string = cleaner.removeWhitespace(string)
    string = cleaner.removePunctuation(string)
    return string
"""

def cleaningPipe(spacy_pipe, composition):
    for doc in spacy_pipe:
        yield composition(doc)


pipeline = compose(
                   functools.partial(foo2, toDelete=["PUNCT", "SPACE"]),
                    functools.partial(foo, toKeep=["NOUN"]))


string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n           Dieses Ticket wird geschlossen \n            \n test"

doc = parser(string)

#print(removeFromDoc(doc,toDelete=["PUNCT"]))

print(pipeline(doc.text))


for txt in cleaningPipe(parser.pipe([string]),pipeline):
    print(txt)
"""
corpus_raw.add_text(string)
for doc in parser.pipe([string]):
    doc.text = removeFromDoc(doc, toDelete=["PUNCT"])
"""

#corpus_clean.add_texts(cleaningPipe(parser.pipe([string]),pipeline))
#print(corpus_raw[0].text)