topicModelingTickets/old/test.py


# -*- coding: utf-8 -*-
import re
import functools

import spacy
import textacy
from spacy.tokens import Doc


class TextCleaner:

    def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
        """
        :param parser: spaCy parser
        :param thesaurus: [[syn1, syn2, ...], [syn1, syn2, ...], ...]
        :param customClass_symbols: [str]
        :param customClass_words: [str]
        :param keep4All: [str]
        """
        if thesaurus is None:
            DATAPATH_thesaurus = "openthesaurus.csv"
            # NOTE: the list() is important, otherwise the same synonyms are not
            # returned on later lookups, because a generator gets consumed at runtime
            self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
        else:
            self.thesaurus = thesaurus

        self.parser = parser

        #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
        self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
        self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
        self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
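
        # Illustrative matches (examples added for clarity, not taken from the original file):
        #   self.mentionFinder.findall("contact @it_support please")        -> ["@it_support"]
        #   self.emailFinder.findall("send to max.mustermann@example.com")  -> ["max.mustermann@example.com"]
        #   self.urlFinder is anchored with ^...$, so it only matches strings that
        #   consist of nothing but a URL, e.g. "www.example.com/page"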

        # to keep
        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
        self.pos2keep = ["NOUN"]  # , "NUM" ]  # ,"VERB","ADJ"]  # for topic modeling keep only nouns: http://aclweb.org/anthology/U15-1013
"""
# to remove
self.symbols = ["-----", "---", "...", "", "", ".", "-", "<", ">", ",", "?", "!", "..", "nt", "n't", "|", "||",
";", ":",
"", "s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else [])
self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])
keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep
# modify those to remove with those to keep
for sym in keep:
try:
self.symbols.remove(sym)
except ValueError:
pass
for sym in keep:
try:
self.stop_words.remove(sym)
except ValueError:
pass
"""

    def loadString(self, string):
        self.currentDoc = self.parser(string)

    def removeWhitespace(self, string):
        return " ".join([tok.text for tok in self.currentDoc if not tok.is_space])

    def removePunctuation(self, string, custom_symbols=None, keep=None):
        # NOTE: relies on self.symbols, which is only defined in the commented-out block above
        symbols = self.symbols + (custom_symbols if custom_symbols is not None else [])
        if hasattr(keep, '__iter__'):
            for k in keep:
                try:
                    symbols.remove(k)
                except ValueError:
                    pass
        return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols])
def cleanDoc(doc, toDelete=None, toKeep=None):
    """
    :param doc: spacy Doc
    :param toDelete: [str] pos_, ent_type_ or tag_ labels to remove
    :param toKeep: [str] pos_, ent_type_ or tag_ labels to keep
    :return: str tokenlist
    """
    toDelete = toDelete if toDelete is not None else []
    toKeep = toKeep if toKeep is not None else []

    # keep
    tokenlist = []
    for tok in doc:
        if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep:
            tokenlist.append(tok.text)

    # delete (NOTE: this overwrites the keep-result above)
    tokenlist = [tok.text for tok in doc
                 if tok.pos_ not in toDelete and tok.ent_type_ not in toDelete and tok.tag_ not in toDelete]

    result = " ".join(tokenlist)
    return result  # problem: returns a plain string rather than a Doc, therefore not composable


def keepinDoc(doc, toKeep=None):
    """
    :param doc: spacy Doc
    :param toKeep: [str] pos_, ent_type_ or tag_ labels to keep
    :return: str tokenlist
    """
    toKeep = toKeep if toKeep is not None else []
    return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep])
# https://mathieularose.com/function-composition-in-python/
parser = spacy.load('de')
cleaner = TextCleaner(parser)
corpus_raw = textacy.Corpus(parser)
corpus_clean = textacy.Corpus(parser)


def foo(doc, toKeep=None):
    # keep only tokens whose POS, entity type or fine-grained tag is listed in toKeep
    words = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
    spaces = [True] * len(words)
    return Doc(doc.vocab, words=words, spaces=spaces)


def foo2(doc, toDelete=None):  # , toKeep=None):
    """
    :param doc: spacy Doc
    :param toDelete: [str] pos_, ent_type_ or tag_ labels to remove
    :return: spacy Doc without the matching tokens
    """
    # keep
    # tokenlist = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]

    # delete: keep only tokens whose labels are NOT listed in toDelete
    words = [tok.text for tok in doc if tok.pos_ not in toDelete and tok.ent_type_ not in toDelete and tok.tag_ not in toDelete]
    spaces = [True] * len(words)
    return Doc(doc.vocab, words=words, spaces=spaces)
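
# Rough behaviour sketch (added for clarity; the exact tags depend on the loaded model):
#   foo(parser("Der Hund bellt"), toKeep=["NOUN"]).text        # -> roughly "Hund "
#   foo2(parser("Hallo , Welt !"), toDelete=["PUNCT"]).text    # -> roughly "Hallo Welt "
# Both return Doc objects, so unlike cleanDoc/keepinDoc they can be composed.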
"""
def compose(self,*functions):
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
def composeo(*functions):
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
"""


def double(a):
    return a * 2


def add(a, b):
    return a + b


def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)
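
# Illustrative note (added for clarity): compose chains right-to-left, i.e. compose(f, g)(x) == f(g(x)),
# so the right-most function runs first, e.g.:
#   compose(double, functools.partial(add, 1))(3)   # -> double(add(1, 3)) -> 8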
#pipeline = compose(removeFromDoc, cleaner.removeWhitespace, cleaner.loadString)
"""
def pipe1(string):
cleaner.loadString(string)
string = cleaner.removeWhitespace(string)
string = cleaner.removePunctuation(string)
return string
"""


def cleaningPipe(spacy_pipe, composition):
    # stream docs from a spaCy pipe and apply the composed cleaning function to each one
    for doc in spacy_pipe:
        yield composition(doc)


pipeline = compose(
    functools.partial(foo2, toDelete=["PUNCT", "SPACE"]),
    functools.partial(foo, toKeep=["NOUN"]))
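
# Note (added for clarity): compose works right-to-left, so foo runs first (keep only nouns)
# and foo2 is applied to its output. The Doc rebuilt by foo carries no POS annotations,
# so the PUNCT/SPACE filter in foo2 appears to be effectively a no-op at this point.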
string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
doc = parser(string)
#print(removeFromDoc(doc,toDelete=["PUNCT"]))
print(pipeline(doc.text))
for txt in cleaningPipe(parser.pipe([string]),pipeline):
print(txt)
"""
corpus_raw.add_text(string)
for doc in parser.pipe([string]):
doc.text = removeFromDoc(doc, toDelete=["PUNCT"])
"""
#corpus_clean.add_texts(cleaningPipe(parser.pipe([string]),pipeline))
#print(corpus_raw[0].text)