topicModelingTickets/test.py

166 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import re
import spacy
import textacy
import xml.etree.ElementTree as ET
# Path of the thesaurus CSV file (presumably an OpenThesaurus export, judging by
# the file name). It is not referenced anywhere in the visible part of this file.
DATAPATH_thesaurus = "openthesaurus.csv"
# spaCy pipeline registered under the name 'de' (presumably the German model).
# It is loaded once at import time and serves as the default `parser` argument
# of the cleanText_* functions in this module.
PARSER = spacy.load('de')
def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):
    """
    Remove numbers, whitespace tokens, punctuation and a fixed list of symbols.

    See https://spacy.io/docs/usage/pos-tagging for the POS tags used here.

    The text is tokenized with ``parser``. Tokens whose coarse POS tag is
    NUM, SPACE or PUNCT are dropped (unless that tag is listed in ``keep``),
    the remaining tokens are lower-cased and stripped, and every token that
    equals one of the entries of the ``symbols`` collection defined in the
    body is removed.

    :param string: raw text to clean
    :param parser: callable returning an iterable of tokens that expose
        ``pos_`` and ``text`` (a spaCy pipeline by default)
    :param custom_symbols: optional iterable of additional symbols to remove
    :param keep: optional collection of POS tags (out of NUM, SPACE, PUNCT)
        that should NOT be filtered; unknown tags are ignored
    :return: the surviving tokens joined by single spaces
    """
    extra_symbols = [] if custom_symbols is None else list(custom_symbols)
    keep = [] if keep is None else keep

    # Symbols we don't care about. A set gives constant-time membership tests.
    # NOTE(review): the file viewer flagged "ambiguous Unicode characters" in
    # this file; the empty-looking entries below may originally have held
    # invisible or look-alike characters -- confirm against the raw file.
    symbols = set(["-----","---","...","","",".","-","<",">",",","?","!","..","nt","n't","|","||",";",":","","s","'s",".","(",")","[","]","#"])
    symbols.update(extra_symbols)

    # POS tags to filter out. Building the list with a filter (instead of
    # calling list.remove) means a tag in `keep` that is not filtered anyway
    # no longer raises ValueError.
    pos = [tag for tag in ("NUM", "SPACE", "PUNCT") if tag not in keep]

    # parse with spaCy and collect the normalized text of the wanted tokens
    spacy_doc = parser(string)
    tokens = [tok.text.lower().strip() for tok in spacy_doc if tok.pos_ not in pos]

    # remove symbols
    tokens = [tok for tok in tokens if tok not in symbols]

    # Replace whitespace characters left inside tokens. The original code
    # called remove_whitespace() but threw its result away.
    return remove_whitespace(" ".join(tokens))
def cleanText_words(string, parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):
    """
    Reduce a text to the words that are relevant for topic modeling.

    Steps, in order:

    1. run ``customPreprocessing`` on the text (skipped when it is None);
    2. replace e-mail addresses, URLs and @-mentions with the placeholders
       ``EMAIL``, ``URL`` and ``MENTION`` and decode a few HTML entities;
    3. parse with ``parser`` and keep nouns (optionally lemmatized) plus
       tokens whose entity type is WORK_OF_ART, ORG, PRODUCT or LOC;
    4. drop stop words, ``custom_words`` and single-character tokens.

    A token that is both a noun and one of the listed entity types is added
    twice (once per rule), exactly as the loop below is written.

    :param string: raw text
    :param parser: spaCy-like pipeline; must expose ``lang`` and return tokens
        with ``pos_``, ``ent_type_``, ``lemma_`` and ``text``
    :param custom_stopwords: optional iterable of extra stop words
    :param custom_words: optional iterable of further words to remove
    :param customPreprocessing: callable applied to the text first, or None
    :param lemmatize: if True, nouns are added as lower-cased lemmas
    :param normalize_synonyms: currently has no effect (synonym normalization
        is not implemented in this function)
    :return: the remaining tokens joined by single spaces
    """
    # use preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    # Stop list: the STOP_WORDS of the parser's language module plus the
    # caller-supplied stop words and custom words. Both are removed by plain
    # equality, so one set covers them with constant-time lookups.
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS
    excluded = set(stop_words)
    excluded.update(custom_stopwords or ())
    excluded.update(custom_words or ())

    # replace emails -- this must happen BEFORE the mention replacement,
    # otherwise "@example" inside an address is turned into MENTION and the
    # address is never recognised.
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace urls -- matched anywhere in the text. The previous pattern was
    # anchored with ^...$, so it only fired when the WHOLE input looked like a
    # URL, and it also turned any single plain word into "URL".
    # Only URLs starting with http://, https:// or www. are recognised.
    urlFinder = re.compile(r"\b(?:https?://|www\.)\S+", re.IGNORECASE)
    string = urlFinder.sub("URL", string)

    # replace twitter mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace HTML symbols
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # parse with spaCy
    spacy_doc = parser(string)

    added_entities = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]
    # for topic modeling only nouns are used: http://aclweb.org/anthology/U15-1013
    added_POS = ["NOUN"]

    # append Tokens to a list
    tokens = []
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            word = tok.lemma_ if lemmatize else tok.text
            tokens.append(word.lower().strip())

        # add entities
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())

    # remove stopwords, custom_words and single characters
    tokens = [tok for tok in tokens if tok not in excluded and len(tok) > 1]

    # Idea (not implemented): expand abbreviations / normalize synonyms via a
    # thesaurus, e.g. TU -> Technische Universität.
    return " ".join(tokens)
def remove_whitespace(sentence):
    """
    Replace whitespace in *sentence* with plain spaces.

    A ``\\r\\n`` pair becomes one space; every other whitespace character
    (newline, carriage return, tab, space, ...) becomes one space each.
    Runs of whitespace are therefore NOT collapsed.

    :param sentence: text to normalize
    :return: the text with whitespace replaced by spaces
    """
    return re.sub(r'(\r\n|\r|\n|\s)', " ", sentence, flags=re.IGNORECASE)
def cleanText_normalize(string, parser=PARSER, customPreprocessing=cleanText_words, lemmatize=True):
    """
    Apply the configured preprocessing to *string* and return the result.

    :param string: raw text
    :param parser: accepted for signature parity with the other cleanText_*
        functions; not used by the current body
    :param customPreprocessing: callable applied to the text, or None to
        leave the text untouched
    :param lemmatize: not used by the current body
    :return: the preprocessed text (the input itself when
        ``customPreprocessing`` is None)
    """
    # use preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)
    # The original body computed the value but never returned it, so every
    # caller received None.
    return string
# Manual smoke test: clean one German sample sentence twice -- first keeping
# numeric tokens, then with the default filtering -- and print both results.
string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
print(cleanText_symbols(string=string, parser=PARSER, keep=["NUM"]))
print(cleanText_symbols(string=string, parser=PARSER, keep=None))