spacy-pipeline / python function-composing versucht
This commit is contained in:
parent
11e77fad06
commit
05b4f514d5
|
@ -406,6 +406,7 @@ def printRandomDoc(textacyCorpus):
|
||||||
print()
|
print()
|
||||||
|
|
||||||
####################'####################'####################'####################'####################'##############
|
####################'####################'####################'####################'####################'##############
|
||||||
|
# todo config-file
|
||||||
|
|
||||||
import de_core_news_md
|
import de_core_news_md
|
||||||
DATAPATH = "ticketSamples.xml"
|
DATAPATH = "ticketSamples.xml"
|
||||||
|
@ -421,8 +422,7 @@ custom_words = ["grüßen", "fragen"]
|
||||||
|
|
||||||
####################'####################'####################'####################'####################'##############
|
####################'####################'####################'####################'####################'##############
|
||||||
|
|
||||||
#todo joar diese pipe halt und vllt ne config-file
|
#todo https://spacy.io/docs/usage/customizing-pipeline
|
||||||
|
|
||||||
|
|
||||||
## files to textacy-corpus
|
## files to textacy-corpus
|
||||||
textacyCorpus = textacy.Corpus(PARSER)
|
textacyCorpus = textacy.Corpus(PARSER)
|
||||||
|
|
|
@ -0,0 +1,124 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import spacy
|
||||||
|
import textacy
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import spacy
|
||||||
|
import functools
|
||||||
|
|
||||||
|
import textacy
|
||||||
|
|
||||||
|
|
||||||
|
class TextCleaner:
    """Token-level text cleaning steps built on top of a spaCy parser.

    Every public step takes a string and returns a string, so the steps can
    be chained or composed. The parsed document of the most recently
    processed string is cached in ``self.currentDoc`` and reused as long as
    the next step receives exactly that text.
    """

    def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
        """
        :param parser: spacy-parser; called with a string, must expose ``lang``
        :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...];
                          read from "openthesaurus.csv" when None
        :param customClass_symbols: [str] additional symbols to remove
        :param customClass_words: [str] additional stop words
        :param keep4All: [str] entries that must never be removed; they are
                         also appended to the POS tags and entity labels to keep
        """
        # Function-scope import so the class does not rely on a module-level import.
        import importlib

        if thesaurus is None:
            DATAPATH_thesaurus = "openthesaurus.csv"
            # list() is important: a generator would be consumed while running,
            # so later lookups would not return the same synonyms.
            self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
        else:
            self.thesaurus = thesaurus

        self.parser = parser
        # No string has been parsed yet; filled by loadString().
        self.currentDoc = None

        self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
        self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
        self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
        self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)

        # to remove
        self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||",
                        ";", ":",
                        "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + self._asList(customClass_symbols)
        # The stop-word list lives in the language sub-package of spaCy, e.g. "spacy.de".
        languageModule = importlib.import_module("spacy." + self.parser.lang)
        self.stop_words = list(languageModule.STOP_WORDS) + self._asList(customClass_words)

        # to keep
        extraKeep = self._asList(keep4All)
        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] + extraKeep  # ,"PERSON"]
        # nouns only for topic modeling, see http://aclweb.org/anthology/U15-1013
        self.pos2keep = ["NOUN"] + extraKeep  # , "NUM" ]#,"VERB","ADJ"]

        keep = extraKeep + self.pos2keep + self.entities2keep

        # modify those to remove with those to keep
        # (drops every occurrence, not just the first one as list.remove would)
        self.symbols = self._without(self.symbols, keep)
        self.stop_words = self._without(self.stop_words, keep)

    @staticmethod
    def _asList(value):
        """Return ``value`` as a new list; None gives [], a bare str gives [str]."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    @staticmethod
    def _without(items, unwanted):
        """Return ``items`` in order, minus every entry contained in ``unwanted``."""
        blocked = set(unwanted)
        return [item for item in items if item not in blocked]

    def _docFor(self, string):
        """Return the parsed document for ``string``, re-parsing only when needed.

        The cached ``currentDoc`` is reused when ``string`` is None or equals
        the cached document's text; otherwise ``string`` is parsed and cached.
        """
        current = getattr(self, "currentDoc", None)
        if string is None:
            # Legacy call style: the caller already ran loadString().
            return self.currentDoc
        if current is None or current.text != string:
            self.loadString(string)
        return self.currentDoc

    def loadString(self, string):
        """Parse ``string``, cache the result in ``self.currentDoc`` and return ``string``.

        Returning the input unchanged lets this method be the first step of a
        composed pipeline.
        """
        self.currentDoc = self.parser(string)
        return string

    def removeWhitespace(self, string):
        """Return the tokens of ``string`` joined by single spaces, without whitespace tokens."""
        doc = self._docFor(string)
        return " ".join(tok.text for tok in doc if not tok.is_space)

    def removePunctuation(self, string, custom_symbols=None, keep=None):
        """Return ``string`` without punctuation tokens and without configured symbols.

        :param string: text to clean
        :param custom_symbols: [str] additional symbols to remove for this call
        :param keep: [str] symbols that are exempt from the symbol list for this call
        :return: remaining token texts joined by single spaces
        """
        symbols = self.symbols + self._asList(custom_symbols)
        if hasattr(keep, '__iter__'):
            symbols = self._without(symbols, self._asList(keep))
        blocked = set(symbols)

        doc = self._docFor(string)
        # NOTE(review): tokens flagged is_punct are dropped even when listed in
        # ``keep``; ``keep`` only affects the symbol list - confirm this is intended.
        return " ".join(tok.text for tok in doc if not tok.is_punct and tok.text not in blocked)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: the function composition below does not work as intended yet, see https://mathieularose.com/function-composition-in-python/
|
||||||
|
# One spaCy parser instance ('de' model) is loaded here and handed to both
# the cleaner and the textacy corpus.
parser = spacy.load('de')

# Cleaner with default thesaurus, symbols and stop words.
cleaner = TextCleaner(parser)

# Corpus that collects the cleaned texts.
corpus = textacy.Corpus(parser)
|
||||||
|
|
||||||
|
|
||||||
|
def compose(*functions):
    """Compose one-argument callables right-to-left into a single callable.

    ``compose(f, g, h)(x)`` evaluates ``f(g(h(x)))``. Called without
    arguments it returns the identity function.

    This is a module-level function, so it takes no ``self``: a leading
    ``self`` parameter would silently swallow the first function passed in.

    :param functions: callables, each taking and returning one value
    :return: the composed callable
    """
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
|
||||||
|
|
||||||
|
def composeo(*functions):
    """Compose one-argument callables right-to-left, without an identity wrapper.

    ``composeo(f, g, h)(x)`` evaluates ``f(g(h(x)))``, so the LAST function
    given runs first. With a single function that function itself is
    returned. With no functions the identity function is returned instead of
    letting ``functools.reduce`` raise ``TypeError`` on the empty sequence.

    :param functions: callables, each taking and returning one value
    :return: the composed callable
    """
    if not functions:
        return lambda x: x
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
|
||||||
|
|
||||||
|
#pipeline = compose(functools.partial(removeWhitespace,lemmatize=True))#, cleaner.normalizeSynonyms)
|
||||||
|
|
||||||
|
# composeo applies its arguments right-to-left, so the steps are listed
# in reverse order of execution.
pipeline = composeo(
    cleaner.removePunctuation,  # runs last
    cleaner.removeWhitespace,   # runs second
    cleaner.loadString,         # runs first
)
|
||||||
|
|
||||||
|
def pipe1(string):
    """Run the cleaning steps by hand on ``string`` and return the result.

    Calls the module-level ``cleaner`` in this order: ``loadString``,
    ``removeWhitespace``, ``removePunctuation``.

    :param string: raw text
    :return: value returned by ``cleaner.removePunctuation``
    """
    cleaner.loadString(string)
    withoutWhitespace = cleaner.removeWhitespace(string)
    return cleaner.removePunctuation(withoutWhitespace)
|
||||||
|
|
||||||
|
|
||||||
|
# Sample text used to try out both pipeline variants.
string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"

# Variant 1: steps called one after another by hand.
print(pipe1(string))

# Variant 2: composed pipeline; its output is stored as a document in the corpus.
corpus.add_text(pipeline(string))

# Print the text of the first document in the corpus.
print(corpus[0].text)
|
||||||
|
|
|
@ -8,14 +8,14 @@ import textacy
|
||||||
|
|
||||||
class TextCleaner:
|
class TextCleaner:
|
||||||
|
|
||||||
def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4Class=None):
|
def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
|
||||||
"""
|
"""
|
||||||
:param parser: spacy-parser
|
:param parser: spacy-parser
|
||||||
:param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
|
:param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
|
||||||
:param customClass_symbols:[str]
|
:param customClass_symbols:[str]
|
||||||
:param customClass_words:[str]
|
:param customClass_words:[str]
|
||||||
:param customClassPOS:[str]
|
:param customClassPOS:[str]
|
||||||
:param keep4Class: [str]
|
:param keep4All: [str]
|
||||||
"""
|
"""
|
||||||
if thesaurus is None:
|
if thesaurus is None:
|
||||||
DATAPATH_thesaurus = "openthesaurus.csv"
|
DATAPATH_thesaurus = "openthesaurus.csv"
|
||||||
|
@ -48,11 +48,11 @@ class TextCleaner:
|
||||||
self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"]
|
self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"]
|
||||||
self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
|
self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
|
||||||
|
|
||||||
self.entities2keep = self.entities2keep + (keep4Class if keep4Class is not None else [])
|
self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
|
||||||
self.pos2keep = self.pos2keep + (keep4Class if keep4Class is not None else [])
|
self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])
|
||||||
|
|
||||||
|
|
||||||
keep = (keep4Class if hasattr(keep4Class, '__iter__') else []) + self.pos2keep + self.entities2keep
|
keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep
|
||||||
|
|
||||||
|
|
||||||
# modify those to remove with those to keep
|
# modify those to remove with those to keep
|
||||||
|
@ -60,18 +60,25 @@ class TextCleaner:
|
||||||
try:
|
try:
|
||||||
self.symbols.remove(sym)
|
self.symbols.remove(sym)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
try:
|
pass
|
||||||
self.stop_words.remove(sym)
|
for sym in keep:
|
||||||
except ValueError:
|
try:
|
||||||
pass
|
self.stop_words.remove(sym)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
# idee self.currentDoc = spacy.Doc für jeden String aber nicht füpr jede methode
|
# idee self.currentDoc = spacy.Doc für jeden String aber nicht füpr jede methode
|
||||||
|
def loadString(self,string):
|
||||||
|
self.currentDoc = self.parser(string)
|
||||||
|
|
||||||
|
"""
|
||||||
def removeWhitespace(self, string):
|
def removeWhitespace(self, string):
|
||||||
string = self.whitespaceFinder.sub(" ", string)
|
string = self.whitespaceFinder.sub(" ", string)
|
||||||
return string
|
return string
|
||||||
|
"""
|
||||||
|
def removeWhitespace(self, string):
|
||||||
|
return string
|
||||||
|
|
||||||
def removePunctuation(self, string, custom_symbols=None, keep=None):
|
def removePunctuation(self, string, custom_symbols=None, keep=None):
|
||||||
|
|
||||||
|
@ -225,11 +232,7 @@ class TextCleaner:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cleaner = TextCleaner(parser=spacy.load('de'))
|
"""
|
||||||
|
|
||||||
string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
|
|
||||||
|
|
||||||
|
|
||||||
#################################################################################################################
|
#################################################################################################################
|
||||||
|
|
||||||
#todo funzt irgendwie nich wie's soll: https://mathieularose.com/function-composition-in-python/
|
#todo funzt irgendwie nich wie's soll: https://mathieularose.com/function-composition-in-python/
|
||||||
|
@ -239,7 +242,27 @@ def compose(self,*functions):
|
||||||
pipeline = compose(functools.partial(cleaner.keepPOSandENT,lemmatize=True))#, cleaner.normalizeSynonyms)
|
pipeline = compose(functools.partial(cleaner.keepPOSandENT,lemmatize=True))#, cleaner.normalizeSynonyms)
|
||||||
|
|
||||||
#################################################################################################################
|
#################################################################################################################
|
||||||
print(cleaner.removePunctuation(string))
|
"""
|
||||||
print(cleaner.keepPOSandENT(string))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue