# topicModelingTickets/test.py

# -*- coding: utf-8 -*-
import re

import spacy
import textacy
import xml.etree.ElementTree as ET


# File name of the OpenThesaurus CSV export. It is not read anywhere in this
# section (presumably intended for the synonym normalization that is still
# commented out in cleanText_words -- TODO confirm).
DATAPATH_thesaurus = "openthesaurus.csv"


# spaCy pipeline for the 'de' model, loaded once when the module is imported
# and used as the default `parser` argument of the cleaning functions.
PARSER = spacy.load('de')


def cleanText_symbols(string,  parser=PARSER, custom_symbols=None, keep=None):
    """
    Lowercase *string* and strip numbers, whitespace, punctuation and symbols.

    https://spacy.io/docs/usage/pos-tagging

    The text is tokenized with ``parser``. A token is dropped when its coarse
    POS tag is NUM, SPACE or PUNCT (unless that tag is listed in ``keep``),
    when it equals one of the built-in symbols defined in the function body or
    one of ``custom_symbols``, or when nothing is left of it after stripping.

    :param string: text to clean
    :param parser: callable that turns a string into an iterable of tokens
        exposing ``pos_`` and ``text`` (defaults to the module-level PARSER)
    :param custom_symbols: optional iterable of additional symbols to remove
    :param keep: optional iterable of POS tags (out of NUM, SPACE, PUNCT) that
        should NOT be filtered; any other tag in it is ignored
    :return: the remaining lowercased tokens joined by single spaces
    """
    # Set membership is O(1); a set also accepts any iterable of custom symbols.
    symbols = {"-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"}
    if custom_symbols is not None:
        symbols.update(custom_symbols)

    # Build the filter list instead of list.remove(), which raised ValueError
    # for any tag in `keep` that is not one of the three filtered tags.
    kept_pos = set(keep) if keep is not None else set()
    dropped_pos = [p for p in ("NUM", "SPACE", "PUNCT") if p not in kept_pos]

    tokens = []
    for tok in parser(string):
        if tok.pos_ in dropped_pos:
            continue
        text = tok.text.lower().strip()
        # Skip tokens that were pure whitespace (possible when SPACE is kept)
        # so the joined result never contains doubled spaces.
        if text and text not in symbols:
            tokens.append(text)

    # The original discarded this result; return the normalized string.
    return remove_whitespace(" ".join(tokens))

# Placeholder patterns used by cleanText_words, compiled once at import time.
# URLs must start with a scheme or "www." so that ordinary words are not hit.
_URL_FINDER = re.compile(r"\b(?:https?://|www\.)[^\s<>]+", re.IGNORECASE)
_EMAIL_FINDER = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
# The lookbehind keeps the pattern from matching an "@" inside a word.
_MENTION_FINDER = re.compile(r"(?<![\w@])@[a-z0-9_]{1,15}", re.IGNORECASE)


def cleanText_words(string, parser=PARSER,  custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):
    """
    Reduce *string* to its nouns and selected named entities.

    Steps: optional preprocessing, replacement of URLs / e-mail addresses /
    twitter mentions by the placeholders "URL" / "EMAIL" / "MENTION",
    replacement of a few HTML entities, tokenization with ``parser`` and
    filtering of the resulting tokens.

    :param string: text to clean
    :param parser: callable that turns a string into an iterable of tokens
        exposing ``pos_``, ``ent_type_``, ``text`` and ``lemma_``; its ``lang``
        attribute selects the stop word list (``spacy.<lang>.STOP_WORDS``)
    :param custom_stopwords: optional iterable of additional stop words
    :param custom_words: optional iterable of further words to drop
    :param customPreprocessing: callable applied to the text first, or None to
        skip; it is called with the text as its only argument
    :param lemmatize: if True, nouns are emitted as lemmas instead of surface forms
    :param normalize_synonyms: accepted for compatibility; currently has no effect
    :return: the kept lowercased tokens joined by single spaces
    """
    # Function-scope import keeps this block self-contained.
    import importlib

    # use preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    # Stop words of the parser's language plus everything the caller wants to
    # drop, merged into one set for O(1) membership tests.
    ignored = set(importlib.import_module("spacy." + parser.lang).STOP_WORDS)
    if custom_stopwords is not None:
        ignored.update(custom_stopwords)
    if custom_words is not None:
        ignored.update(custom_words)

    # Order matters: URLs and e-mail addresses are replaced before mentions,
    # otherwise "@domain" inside an address would be turned into "MENTION"
    # and the address could no longer be recognized.
    string = _URL_FINDER.sub("URL", string)
    string = _EMAIL_FINDER.sub("EMAIL", string)
    string = _MENTION_FINDER.sub("MENTION", string)

    # replace HTML symbols
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    added_entities = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]
    # Only nouns are kept for topic modeling, see http://aclweb.org/anthology/U15-1013
    added_POS = ["NOUN"]

    tokens = []
    for tok in parser(string):
        if tok.pos_ in added_POS:
            text = tok.lemma_ if lemmatize else tok.text
        elif tok.ent_type_ in added_entities:
            # `elif`: a noun that is also an entity is emitted once, not twice.
            text = tok.text
        else:
            continue

        text = text.lower().strip()
        # Drop single characters, stop words and custom words.
        if len(text) > 1 and text not in ignored:
            tokens.append(text)

    # TODO: expand abbreviations (e.g. TU -> Technische Universität).
    # TODO: map tokens to their first thesaurus synonym when normalize_synonyms is set.

    return " ".join(tokens)

def remove_whitespace(sentence):
    """
    Collapse every run of whitespace in *sentence* into a single space.

    Spaces, tabs and all newline variants count as whitespace. Leading and
    trailing whitespace is reduced to one space as well, not removed.

    :param sentence: text to normalize
    :return: the text with each whitespace run replaced by one space
    """
    # "\s+" instead of a per-character substitution: the latter kept runs of
    # blanks and newlines at their full length.
    return re.sub(r"\s+", " ", sentence)

def cleanText_normalize(string, parser=PARSER, customPreprocessing=cleanText_words, lemmatize=True):
    """
    Apply the preprocessing callable to *string* and return the result.

    :param string: text to clean
    :param parser: accepted for signature parity with the other cleaners;
        not used by this function yet
    :param customPreprocessing: callable applied to the text, or None to
        return the text unchanged; it is called with the text as its only argument
    :param lemmatize: accepted for signature parity; not used by this function yet
    :return: the preprocessed text
    """
    # use preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    # The original fell off the end here and therefore always returned None.
    return string


# Demo run: clean the same sample ticket text twice with cleanText_symbols,
# first keeping NUM tokens, then with the default filter.
string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n           Dieses Ticket wird geschlossen \n            \n test"
for _kept_pos in (["NUM"], None):
    print(cleanText_symbols(string=string, parser=PARSER, keep=_kept_pos))