# topicModelingTickets/test.py

# -*- coding: utf-8 -*-
import re

import spacy
import textacy
import xml.etree.ElementTree as ET


DATAPATH_thesaurus = "openthesaurus.csv"


PARSER = spacy.load('de')


def cleanText_symbols(string,  parser=PARSER, custom_symbols=None, keep=None):
    """
    Lowercase `string` and drop unwanted tokens; return the rest joined by single spaces.

    POS tag names: https://spacy.io/docs/usage/pos-tagging

    A token is dropped when its `pos_` tag is NUM, SPACE or PUNCT (unless that
    tag is listed in `keep`), or when its lowercased, stripped text equals one
    of the following symbols (plus anything in `custom_symbols`):

    ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"]

    :param string: text to clean
    :param parser: callable that turns the text into tokens exposing `pos_` and `text`
    :param custom_symbols: extra symbol strings to drop (optional)
    :param keep: POS tags out of NUM/SPACE/PUNCT that must NOT be dropped (optional);
                 entries that are not one of these three tags are ignored
    :return: the cleaned text
    """
    custom_symbols = [] if custom_symbols is None else list(custom_symbols)
    keep = [] if keep is None else keep

    # Symbols we don't care about; a set gives constant-time membership tests.
    symbols = set(["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols)

    # Build the list of POS tags to drop by filtering instead of list.remove(),
    # so an unknown or repeated entry in `keep` cannot raise ValueError.
    dropped_pos = [tag for tag in ("NUM", "SPACE", "PUNCT") if tag not in keep]

    tokens = []
    for tok in parser(string):
        if tok.pos_ in dropped_pos:
            continue
        text = tok.text.lower().strip()
        if text not in symbols:
            tokens.append(text)

    # The whitespace normalisation result used to be thrown away; return it.
    return remove_whitespace(" ".join(tokens))

def cleanText_words(string, parser=PARSER,  custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):
    """
    Reduce `string` to its nouns and selected named entities, lowercased and space-joined.

    Processing order:
      1. optional `customPreprocessing(string)`
      2. URLs, e-mail addresses and @mentions are replaced by the placeholders
         "URL", "EMAIL" and "MENTION"; a few HTML entities are decoded
      3. the text is parsed; tokens tagged NOUN and tokens whose entity type is
         WORK_OF_ART, ORG, PRODUCT or LOC are collected in lowercase
      4. stop words, `custom_words` and single-character tokens are removed

    :param string: text to clean
    :param parser: callable producing tokens with `pos_`, `ent_type_`, `text` and
                   `lemma_`; its `lang` attribute selects the `spacy.<lang>` stop-word module
    :param custom_stopwords: extra stop words (optional)
    :param custom_words: further words to remove (optional)
    :param customPreprocessing: callable applied to the text first, or None to skip
    :param lemmatize: if True, collect the lemma of each noun instead of its surface form
    :param normalize_synonyms: accepted but currently unused
    :return: the remaining tokens joined by single spaces
    """
    import importlib  # local import so this function does not rely on a file-level import

    # NOTE(review): the preprocessing callable only receives the text, so a
    # non-default `parser` passed to this function is not forwarded to it.
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    custom_stopwords = [] if custom_stopwords is None else custom_stopwords
    custom_words = [] if custom_words is None else custom_words

    # Language-specific stop words plus the caller's additions; one set so the
    # filter below is a constant-time lookup per token.
    stop_words = importlib.import_module("spacy." + parser.lang).STOP_WORDS
    ignored = set(stop_words) | set(custom_stopwords) | set(custom_words)

    # Replace URLs first. The pattern is deliberately NOT anchored to the whole
    # string: the anchored version never matched a URL inside a sentence and
    # turned any single plain word into "URL". A scheme or "www." is required.
    urlFinder = re.compile(r"(?:https?://|www\.)[^\s<>]+", re.IGNORECASE)
    string = urlFinder.sub("URL", string)

    # Replace e-mails BEFORE mentions; otherwise the "@domain" part of an
    # address is consumed as a mention and the address is never recognised.
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # Replace twitter-style mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # Replace HTML symbols
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    added_entities = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]
    # Nouns only for topic modeling, see http://aclweb.org/anthology/U15-1013
    added_POS = ["NOUN"]

    tokens = []
    for tok in parser(string):
        if tok.pos_ in added_POS:
            word = tok.lemma_ if lemmatize else tok.text
            tokens.append(word.lower().strip())

        # NOTE(review): a noun that is also one of the entity types is added
        # twice (once per rule) - confirm whether the double count is wanted.
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())

    # Drop stop words, custom words and single characters
    tokens = [tok for tok in tokens if len(tok) > 1 and tok not in ignored]

    # Idea: expand abbreviations (especially TU -> Technische Universität)
    # TODO: `normalize_synonyms` is not implemented yet (thesaurus lookup is disabled).

    return " ".join(tokens)

def remove_whitespace(sentence):
    """
    Collapse every run of whitespace in `sentence` into a single space.

    Newlines, carriage returns, tabs and repeated blanks all count as
    whitespace. Leading and trailing whitespace is reduced to one space,
    not stripped.

    :param sentence: text to normalise
    :return: the text with each whitespace run replaced by one " "
    """
    # `\s+` matches whole runs; replacing characters one-for-one would leave
    # long stretches of blanks in place.
    return re.sub(r"\s+", " ", sentence)

def cleanText_normalize(string, parser=PARSER, customPreprocessing=cleanText_words, lemmatize=True):
    """
    Run the optional preprocessing step on `string` and return the result.

    :param string: text to normalise
    :param parser: currently unused in this function
    :param customPreprocessing: callable applied to the text, or None to skip
    :param lemmatize: currently unused in this function
    :return: the preprocessed text (the input unchanged when no preprocessing is given)
    """
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    # Without this return the computed text was discarded and callers got None.
    return string


# Manual smoke test: clean the same sample ticket text twice and print both results.
string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n           Dieses Ticket wird geschlossen \n            \n test"

# First run keeps NUM tokens, second run uses the default filter.
print(cleanText_symbols(string=string, parser=PARSER, keep=["NUM"]))
print(cleanText_symbols(string=string, parser=PARSER, keep=None))
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
+								# -*- coding: utf-8 -*-
 								import re
 								import spacy
 								import textacy
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
+								import xml.etree.ElementTree as ET
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
 								DATAPATH_thesaurus = "openthesaurus.csv"
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								PARSER = spacy.load('de')
 								def cleanText_symbols(string,  parser=PARSER, custom_symbols=None, keep=None):
 								    """
 								    https://spacy.io/docs/usage/pos-tagging
 								    cleans text from PUNCT, NUM, whitespaces, newlines, and the following list of symbols:
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"]
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    """
 								    if custom_symbols is not None:
 								        custom_symbols = custom_symbols
 								    else:
 								        custom_symbols = []
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    if keep is not None:
 								        keep = keep
 								    else:
 								        keep = []
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # List of symbols we don't care about
 								    symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # parse with spaCy
 								    spacy_doc = parser(string)
 								    tokens = []
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    pos = ["NUM", "SPACE", "PUNCT"]
 								    for p in keep:
 								        pos.remove(p)
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # append Tokens to a list
 								    for tok in spacy_doc:
 								            if tok.pos_ not in pos:
 								                tokens.append(tok.text.lower().strip())
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # remove symbols
 								    tokens = [tok for tok in tokens if tok not in symbols]
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # remove whitespace
 								    remove_whitespace(" ".join(tokens))
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    return " ".join(tokens)
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								def cleanText_words(string, parser=PARSER,  custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # use preprocessing
 								    if customPreprocessing is not None:
 								        string = customPreprocessing(string)
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    if custom_stopwords is not None:
 								        custom_stopwords = custom_stopwords
 								    else:
 								        custom_stopwords = []
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    if custom_words is not None:
 								        custom_words = custom_words
 								    else:
 								        custom_words = []
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # custom stoplist
 								    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
 								    stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    stoplist =list(stop_words) + custom_stopwords
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # replace twitter
 								    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
 								    string = mentionFinder.sub("MENTION", string)
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # replace emails
 								    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
 								    string = emailFinder.sub("EMAIL", string)
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # replace urls
 								    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
 								    string = urlFinder.sub("URL", string)
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # replace HTML symbols
 								    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # parse with spaCy
 								    spacy_doc = parser(string)
 								    tokens = []
 								    added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
 								    added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen  http://aclweb.org/anthology/U15-1013
 								    # append Tokens to a list
 								    for tok in spacy_doc:
 								            if tok.pos_ in added_POS:
 								                if lemmatize:
 								                    tokens.append(tok.lemma_.lower().strip())
 								                else:
 								                    tokens.append(tok.text.lower().strip())
 								            # add entities
 								            if tok.ent_type_ in added_entities:
 								                tokens.append(tok.text.lower())
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # remove stopwords
 								    tokens = [tok for tok in tokens if tok not in stoplist]
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # remove custom_words
 								    tokens = [tok for tok in tokens if tok not in custom_words]
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # remove single characters
 								    tokens = [tok for tok in tokens if len(tok)>1]
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    # remove large strings of whitespace
 								    #remove_whitespace(" ".join(tokens))
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    #idee abkürzungen auflösen (v.a. TU -> Technische Universität)
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    #if normalize_synonyms:
 								    #    tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								    return " ".join(tokens)
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								def remove_whitespace(sentence):
 								    whitespaceFinder = re.compile(r'(\r\n|\r|\n|\s)', re.IGNORECASE)
 								    sentence = whitespaceFinder.sub(" ", sentence)
 								    return sentence
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								def cleanText_normalize(string, parser=PARSER, customPreprocessing=cleanText_words, lemmatize=True):
 								    # use preprocessing
 								    if customPreprocessing is not None:
 								        string = customPreprocessing(string)
-												openthesaurus debugging

											
										
										
											2017-08-30 12:56:59 +02:00
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n           Dieses Ticket wird geschlossen \n            \n test"
 								print(cleanText_symbols(string=string, parser=PARSER, keep=["NUM"]))
-												xml2Corpus

openthesaurus eingebunden

											
										
										
											2017-08-29 15:01:17 +02:00
-												textcleaning verfeinert

											
										
										
											2017-08-31 14:54:01 +02:00
+								string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n           Dieses Ticket wird geschlossen \n            \n test"
 								print(cleanText_symbols(string=string, parser=PARSER, keep=None))