topicModelingTickets/test.py

125 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import spacy
import textacy
from spacy.tokens import Doc
# -*- coding: utf-8 -*-
import re
import spacy
import functools
import textacy
class TextCleaner:
    """Clean raw text with a spaCy parser by dropping whitespace tokens, punctuation and unwanted symbols.

    The cleaning methods operate on the string they are given: the string is
    parsed (and remembered in ``currentDoc``) unless it is the text that is
    already loaded. Passing ``None`` re-uses the document loaded last.
    """

    def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None,
                 keep4All=None):
        """
        :param parser: spacy-parser; called with a string to build a document, its ``lang`` code selects the stop words
        :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]; read from "openthesaurus.csv" when None
        :param customClass_symbols: [str] additional symbols to remove
        :param customClass_words: [str] additional stop words
        :param keep4All: [str] entries added to the kept POS tags and entity labels and
                         taken out of the symbol and stop-word lists
        """
        if thesaurus is None:
            DATAPATH_thesaurus = "openthesaurus.csv"
            # NOTE: list() is important; otherwise the same synonyms are not returned
            # on repeated lookups, because a generator is consumed lazily at runtime.
            self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
        else:
            self.thesaurus = thesaurus

        self.parser = parser
        # Nothing is parsed yet; loadString() fills both fields.
        self.currentDoc = None
        self._currentString = None

        self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
        self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
        self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
        self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)

        # Accept None, a single string or any iterable for the optional lists.
        keep4All = self._asList(keep4All)

        # to remove
        # NOTE(review): the empty strings may stand for special characters that
        # were lost from this listing -- confirm against the original file.
        self.symbols = ["-----", "---", "...", "", "", ".", "-", "<", ">", ",", "?", "!", "..", "nt", "n't", "|", "||",
                        ";", ":",
                        "", "s", "'s", ".", "(", ")", "[", "]", "#"] + self._asList(customClass_symbols)
        # NOTE(review): assumes the installed spaCy exposes the language module as
        # ``spacy.<lang>`` with a STOP_WORDS collection -- confirm for the spaCy version in use.
        self.stop_words = list(
            __import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS
        ) + self._asList(customClass_words)

        # to keep (PERSON entities are currently not kept)
        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] + keep4All
        # Only nouns are kept for topic modeling, see http://aclweb.org/anthology/U15-1013
        self.pos2keep = ["NOUN"] + keep4All

        # Everything that must be kept is taken out of the removal lists. Filtering
        # (instead of list.remove) drops every occurrence, including duplicates.
        keep = set(keep4All + self.pos2keep + self.entities2keep)
        self.symbols = [sym for sym in self.symbols if sym not in keep]
        self.stop_words = [word for word in self.stop_words if word not in keep]

    @staticmethod
    def _asList(values):
        """Return ``values`` as a new list; None becomes [] and a single string becomes [string]."""
        if values is None:
            return []
        if isinstance(values, str):
            return [values]
        return list(values)

    def _docFor(self, string):
        """Return the parsed document for ``string``, parsing only when it is not the loaded text.

        :param string: text to clean, or None to re-use the document loaded last
        :raises ValueError: if ``string`` is None and nothing has been loaded yet
        """
        if string is None:
            if self.currentDoc is None:
                raise ValueError("no text loaded: call loadString() first or pass a string")
            return self.currentDoc
        if self.currentDoc is None or string != self._currentString:
            self.loadString(string)
        return self.currentDoc

    def loadString(self, string):
        """Parse ``string`` and remember the result as ``currentDoc``.

        :param string: text to parse
        :return: the unchanged input string, so the method can be the first step of a function pipeline
        """
        self.currentDoc = self.parser(string)
        self._currentString = string
        return string

    def removeWhitespace(self, string):
        """Return the text of ``string`` with whitespace tokens dropped and the remaining tokens joined by single spaces.

        :param string: text to clean, or None to use the document loaded last
        :return: cleaned string
        """
        doc = self._docFor(string)
        return " ".join(tok.text for tok in doc if not tok.is_space)

    def removePunctuation(self, string, custom_symbols=None, keep=None):
        """Return the text of ``string`` without punctuation tokens and without tokens listed as symbols.

        :param string: text to clean, or None to use the document loaded last
        :param custom_symbols: [str] extra symbols to remove for this call only
        :param keep: [str] symbols that must not be removed in this call
        :return: cleaned string, tokens joined by single spaces
        """
        kept = set(self._asList(keep))
        # A set gives constant-time lookups per token; ``self.symbols`` itself is left untouched.
        symbols = set(sym for sym in self.symbols + self._asList(custom_symbols) if sym not in kept)
        doc = self._docFor(string)
        return " ".join(tok.text for tok in doc if not tok.is_punct and tok.text not in symbols)
# TODO: function composition does not work as intended, see https://mathieularose.com/function-composition-in-python/
# Module-level setup: load the spaCy model registered as 'de', build a
# TextCleaner on top of it and create a textacy corpus that uses the same parser.
parser = spacy.load('de')
cleaner = TextCleaner(parser)
corpus = textacy.Corpus(parser)
def compose(self, *functions):
    """Chain ``functions`` right to left into one single-argument callable.

    ``compose(None, f, g)(x)`` evaluates ``f(g(x))``; without any functions the
    result is the identity function.

    :param self: unused; the first positional argument is ignored
    :param functions: callables that each take one argument
    :return: the composed callable
    """
    def _identity(value):
        return value

    def _chain(outer, inner):
        # Build one link of the chain: run ``inner`` first, then ``outer``.
        return lambda value: outer(inner(value))

    return functools.reduce(_chain, functions, _identity)
def composeo(*functions):
    """Compose ``functions`` right to left into a single one-argument callable.

    ``composeo(f, g, h)(x)`` evaluates ``f(g(h(x)))``. A single function is
    returned unchanged; with no functions the identity function is returned.

    :param functions: callables that each take one argument
    :return: the composed callable
    """
    if not functions:
        # functools.reduce raises TypeError on an empty sequence without an
        # initial value, so the empty composition is handled explicitly.
        return lambda x: x
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
#pipeline = compose(functools.partial(removeWhitespace,lemmatize=True))#, cleaner.normalizeSynonyms)
# composeo applies its arguments right to left: cleaner.loadString runs first,
# its result is passed to cleaner.removeWhitespace, and that result is passed
# to cleaner.removePunctuation.
pipeline = composeo(cleaner.removePunctuation, cleaner.removeWhitespace, cleaner.loadString)
def pipe1(string):
    """Run the cleaning steps of the module-level ``cleaner`` one after another.

    The text is loaded into the cleaner, then passed through removeWhitespace
    and the result of that through removePunctuation.

    :param string: raw text to clean
    :return: the value returned by ``cleaner.removePunctuation``
    """
    cleaner.loadString(string)
    without_whitespace = cleaner.removeWhitespace(string)
    return cleaner.removePunctuation(without_whitespace)
# Sample text containing extra whitespace, newlines, a number and a stray apostrophe.
string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
# Clean the sample with the hand-written step sequence and print the result.
print(pipe1(string))
# Clean the sample with the composed pipeline, add the result to the corpus
# and print the text of the first corpus entry.
corpus.add_text(pipeline(string))
print(corpus[0].text)