composing sort of works, but more problems
This commit is contained in:
parent 05b4f514d5
commit 73a13551c6
@@ -422,7 +422,6 @@ custom_words = ["grüßen", "fragen"]
####################'####################'####################'####################'####################'##############

-#todo https://spacy.io/docs/usage/customizing-pipeline

## files to textacy-corpus
textacyCorpus = textacy.Corpus(PARSER)

test.py
@@ -32,15 +32,17 @@ class TextCleaner:
        self.parser = parser

-       #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
+       self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
        self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
        self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
        self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
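Note that urlFinder is anchored with ^ and $, so it matches only when the whole string is a URL; it cannot find URLs embedded in running text. A quick illustration:

    import re

    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    print(bool(urlFinder.match("www.spacy.io")))          # True: the entire string is a URL
    print(urlFinder.findall("siehe www.spacy.io bitte"))  # []: the anchors rule out embedded matches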

+       # to keep
+       self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
+       self.pos2keep = ["NOUN"]  # , "NUM" ]#,"VERB","ADJ"] # for topic modeling keep only nouns http://aclweb.org/anthology/U15-1013

+       """
        # to remove
        self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||",
                        ";", ":",

@@ -48,11 +50,6 @@ class TextCleaner:
        self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS) + (customClass_words if customClass_words is not None else [])
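The dynamic __import__ above just resolves the stop-word list for the parser's language at runtime; with the German parser used in this file it amounts to a plain import. A minimal equivalent sketch, assuming the spaCy 1.x module layout where the list lives at spacy.de.STOP_WORDS (as the __import__ string implies for lang == "de"):

    from spacy.de import STOP_WORDS  # spaCy 1.x layout

    stop_words = list(STOP_WORDS) + (customClass_words if customClass_words is not None else [])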

-       # to keep
-       self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
-       self.pos2keep = ["NOUN"]  # , "NUM" ]#,"VERB","ADJ"] # for topic modeling keep only nouns http://aclweb.org/anthology/U15-1013

        self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
        self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])

@@ -71,7 +68,7 @@ class TextCleaner:
                self.stop_words.remove(sym)
            except ValueError:
                pass
+       """

    def loadString(self, string):
        self.currentDoc = self.parser(string)

@@ -93,32 +90,124 @@ class TextCleaner:
        return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols])


+#todo somehow doesn't work the way it should: https://mathieularose.com/function-composition-in-python/
+def cleanDoc(doc, toDelete=None, toKeep=None):
+    """
+    :param doc: spacyDoc
+    :param toDelete: [str] pos_ , ent_type_ or tag_
+    :return: str tokenlist
+    """
+    # keep
+    tokenlist = []
+    for tok in doc:
+        if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep:
+            tokenlist.append(tok.text)
+
+    # delete (note: this overwrites the keep list built above)
+    tokenlist = [tok.text for tok in doc if tok.pos_ not in toDelete and tok.ent_type_ not in toDelete and tok.tag_ not in toDelete]
+
+    result = " ".join(tokenlist)
+    return result  # problem: returns a str, not a Doc, and is therefore not composable
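compose(f, g)(x) is f(g(x)), so each stage's return value must be a valid input for the next; cleanDoc consumes a Doc but returns a str, which breaks the chain after one step. The Doc-rebuilding variants below (foo, foo2) fix exactly this. A sketch of the mismatch:

    # Doc -> str does not chain:  cleanDoc(cleanDoc(doc, ...), ...)  # inner call already returned a str
    # Doc -> Doc chains cleanly:  foo2(foo(doc, toKeep=["NOUN"]), toDelete=["PUNCT"])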

+def keepinDoc(doc, toKeep=None):
+    """
+    :param doc: spacyDoc
+    :param toKeep: [str]
+    :return: str tokenlist
+    """
+    return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep])


-#todo https://mathieularose.com/function-composition-in-python/
parser = spacy.load('de')
cleaner = TextCleaner(parser)
-corpus = textacy.Corpus(parser)
+corpus_raw = textacy.Corpus(parser)
+corpus_clean = textacy.Corpus(parser)

+def foo(doc, toKeep=None):
+    words = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
+    spaces = [True] * len(words)
+    return Doc(doc.vocab, words=words, spaces=spaces)
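Rebuilding a Doc from the kept words is what makes this stage composable: it consumes and produces spaCy Doc objects (this assumes from spacy.tokens import Doc at the top of test.py, which the diff does not show). A quick check with an illustrative sentence:

    doc_in = parser("Der Hund jagt die Katze .")
    doc_out = foo(doc_in, toKeep=["NOUN"])
    assert type(doc_out) is type(doc_in)  # still a Doc, so further stages can consume it
    print(doc_out.text)                   # e.g. "Hund Katze" (tagging is model-dependent)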

+def foo2(doc, toDelete=None):  # , toKeep=None):
+    """
+    :param doc: spacyDoc
+    :param toDelete: [str] pos_ , ent_type_ or tag_
+    :return: Doc
+    """
+    # keep
+    #tokenlist = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
+
+    # delete
+    words = [tok.text for tok in doc if tok.pos_ not in toDelete and tok.ent_type_ not in toDelete and tok.tag_ not in toDelete]
+    spaces = [True] * len(words)
+    return Doc(doc.vocab, words=words, spaces=spaces)
"""
|
||||||
def compose(self,*functions):
|
def compose(self,*functions):
|
||||||
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
|
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
|
||||||
|
|
||||||
def composeo(*functions):
|
def composeo(*functions):
|
||||||
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
|
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
|
||||||
|
"""
|
||||||
|
|
||||||

-#pipeline = compose(functools.partial(removeWhitespace,lemmatize=True))#, cleaner.normalizeSynonyms)
-pipeline = composeo(cleaner.removePunctuation, cleaner.removeWhitespace, cleaner.loadString)

+def double(a):
+    return a * 2
+
+def add(a, b):
+    return a + b
+
+def compose(*functions):
+    def compose2(f, g):
+        return lambda x: f(g(x))
+    return functools.reduce(compose2, functions, lambda x: x)
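compose applies right to left: compose(f, g)(x) == f(g(x)), and the lambda x: x seed makes the empty composition the identity. The scratch helpers double and add above are presumably there to check exactly that; a sketch:

    inc = functools.partial(add, 1)
    assert compose(double, inc)(3) == double(inc(3)) == 8  # rightmost function runs first
    assert compose(inc, double)(3) == inc(double(3)) == 7
    assert compose()(42) == 42                             # empty composition is the identity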

+#pipeline = compose(removeFromDoc, cleaner.removeWhitespace, cleaner.loadString)
+"""
def pipe1(string):
    cleaner.loadString(string)
    string = cleaner.removeWhitespace(string)
    string = cleaner.removePunctuation(string)
    return string
+"""

+def cleaningPipe(spacy_pipe, composition):
+    for doc in spacy_pipe:
+        yield composition(doc)


+pipeline = compose(
+    functools.partial(foo2, toDelete=["PUNCT", "SPACE"]),
+    functools.partial(foo, toKeep=["NOUN"]))

string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"

-print(pipe1(string))
-corpus.add_text(pipeline(string))
-print(corpus[0].text)
+doc = parser(string)

+#print(removeFromDoc(doc,toDelete=["PUNCT"]))
+print(pipeline(doc))  # pipeline stages consume a Doc, not a str

+for txt in cleaningPipe(parser.pipe([string]), pipeline):
+    print(txt)

+"""
+corpus_raw.add_text(string)
+for doc in parser.pipe([string]):
+    doc.text = removeFromDoc(doc, toDelete=["PUNCT"])
+"""

+#corpus_clean.add_texts(cleaningPipe(parser.pipe([string]),pipeline))
+#print(corpus_raw[0].text)
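The parked add_texts line presumably fails for the same type-mismatch reason as before: cleaningPipe yields Doc objects, while textacy's add_texts treats its inputs as raw text to be parsed again (and Doc.text is read-only, which would also explain why the corpus_raw loop is fenced off in the string literal). If that is the blocker, one hedged workaround is to hand textacy the cleaned text rather than the Docs:

    #corpus_clean.add_texts(d.text for d in cleaningPipe(parser.pipe([string]), pipeline))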