# 200 lines
# 5.4 KiB
# Python
# -*- coding: utf-8 -*-
|
|
import functools
|
|
import re
|
|
|
|
import spacy
|
|
import textacy
|
|
from spacy.tokens import Doc
|
|
from spacy.tagger import Tagger
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
import importlib

# Heavyweight: loads the German spaCy model once at import time.
PARSER = spacy.load('de')

# Stop words of the loaded language (e.g. the STOP_WORDS set of spacy.de).
# importlib.import_module is the readable replacement for the old
# __import__("spacy." + lang, globals(), locals(), ['object']) trick.
stop_words = list(importlib.import_module("spacy." + PARSER.lang).STOP_WORDS)
|
|
|
def compose(*functions):
    """Compose unary callables right-to-left.

    ``compose(f, g)(x)`` evaluates ``f(g(x))``; with no arguments the
    result is the identity function.
    """
    def composed(value):
        # Apply the rightmost function first, then work leftwards.
        for fn in reversed(functions):
            value = fn(value)
        return value
    return composed
|
|
|
|
|
|
def cleanTexts(textstream, parser, attr):
    """
    Clean a stream of raw strings with a spaCy parser.

    A token is dropped when its coarse POS tag, fine-grained tag, entity
    label, text, or lowercased text appears in *attr*.

    :param textstream: iterable of str
    :param parser: loaded spaCy language object (must provide .pipe)
    :param attr: collection of attribute values to filter out
    :yields: str -- surviving token texts joined by single spaces
    """
    drop = set(attr)  # O(1) membership tests instead of repeated list scans
    for doc in parser.pipe(textstream):
        tokens = [tok.text for tok in doc
                  if tok.pos_ not in drop
                  and tok.tag_ not in drop
                  # bugfix: spaCy tokens expose the entity label as
                  # .ent_type_, not .ent_ (the latter raises AttributeError)
                  and tok.ent_type_ not in drop
                  and tok.text not in drop
                  and tok.lower_ not in drop]
        yield " ".join(tokens)
|
|
|
|
|
|
"""
|
|
def cleanDoc_lemmatize(doc,parser=PARSER):
|
|
return parser(" ".join([tok.lemma_ for tok in doc ]))
|
|
|
|
|
|
def cleanDoc_STOPS(doc,parser=PARSER, stop_words=None, keep=None):
|
|
if stop_words is None:
|
|
stop_words = list(__import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS)
|
|
|
|
if hasattr(keep, '__iter__'):
|
|
for k in keep:
|
|
try:
|
|
stop_words.remove(k)
|
|
except ValueError:
|
|
pass
|
|
|
|
return parser(" ".join([tok.text for tok in doc if tok.text not in stop_words]))
|
|
|
|
|
|
|
|
def cleanDoc_ENT(doc,parser=PARSER, keeponly=False, attr=["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]):
|
|
if keeponly:
|
|
return parser(" ".join([tok.text for tok in doc if tok.ent_ in attr]))
|
|
else:
|
|
return parser(" ".join([tok.text for tok in doc if tok.ent_ not in attr]))
|
|
|
|
|
|
|
|
def cleanDoc_POS(doc,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
|
|
if keeponly:
|
|
return parser(" ".join([tok.text for tok in doc if tok.pos_ in attr]))
|
|
else:
|
|
return parser(" ".join([tok.text for tok in doc if tok.pos_ not in attr]))
|
|
"""
|
|
|
|
|
|
def cleanTexts_POS(spacypipe, keeponly=False, attr=("SPACE", "PUNCT")):
    """
    Filter each doc of a spaCy pipe by coarse POS tag.

    :param spacypipe: iterable of spaCy Doc objects
    :param keeponly: bool. If True, only tokens whose pos_ is in attr are
                     kept; if False, those tokens are deleted.
    :param attr: collection of pos_ labels (immutable tuple default --
                 replaces the former mutable-list default argument)
    :yields: stream of strings: full-length cleaned text
    """
    tags = set(attr)  # O(1) membership per token
    for doc in spacypipe:
        if keeponly:
            yield " ".join(tok.text for tok in doc if tok.pos_ in tags)
        else:
            yield " ".join(tok.text for tok in doc if tok.pos_ not in tags)
|
|
|
|
def cleanText_POS(text, parser=None, keeponly=False, attr=("SPACE", "PUNCT")):
    """
    Clean a single string by coarse POS tag.

    :param text: str
    :param parser: spaCy language object; defaults to the module-level
                   PARSER (resolved lazily at call time instead of being
                   frozen into the signature at def time)
    :param keeponly: bool. If True, only attr will be kept. If False, all
                     attr will be deleted.
    :param attr: collection of pos_ labels (immutable tuple default --
                 replaces the former mutable-list default argument)
    :return: str
    """
    if parser is None:
        parser = PARSER
    tags = set(attr)
    doc = parser(text)
    if keeponly:
        return " ".join(tok.text for tok in doc if tok.pos_ in tags)
    return " ".join(tok.text for tok in doc if tok.pos_ not in tags)
|
|
|
|
|
|
def removeWhitespace(string):
    """Collapse newlines and runs of whitespace into single spaces."""
    pattern = re.compile(r'(\r\n|\r|\n|(\s)+)')
    return pattern.sub(' ', string)
|
|
|
|
def removeWords(string, words):
    """Delete every occurrence of each word in *words* from *string*.

    Note: matching is plain substring matching (no word boundaries).
    """
    alternation = "|".join(re.escape(word) for word in words)
    return re.sub(alternation, "", string)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung', cleaning_function=None):
    """
    Generate one main-text string per <ticket> child of the XML root.

    :param path2xml: path (or file-like object) handed to ElementTree.parse
    :param main_textfield: tag name of the child element holding the text
    :param cleaning_function: optional str -> str function applied to the text
    :yields: str -- the (optionally cleaned) text, or the sentinel "ERROR"
             when the ticket has no *main_textfield* child
    """
    # ET is already imported at module level; the former local re-import
    # was redundant and shadowing.
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        text = "ERROR"  # sentinel kept for backward compatibility
        for field in ticket:
            if field.tag == main_textfield:
                text = cleaning_function(field.text) if cleaning_function else field.text
        yield text
|
|
|
|
def generateMetadatafromTicketXML(path2xml, key_function_pairs_to_clean, leave_out=('Beschreibung',)):
    """
    Generate one metadata dict per <ticket> child of the XML root.

    :param path2xml: path (or file-like object) handed to ElementTree.parse
    :param key_function_pairs_to_clean: mapping {tag: str -> str cleaner};
           fields with a matching tag are cleaned, all others copied verbatim
    :param leave_out: tags to skip entirely (immutable tuple default --
           replaces the former mutable-list default argument)
    :yields: dict -- {tag: (cleaned) text} per ticket
    """
    # ET is already imported at module level; no local re-import needed.
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag in leave_out:
                continue
            if field.tag in key_function_pairs_to_clean:
                metadata[field.tag] = key_function_pairs_to_clean[field.tag](field.text)
            else:
                metadata[field.tag] = field.text
        yield metadata
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Demo / smoke-test section -- runs at import time.
# ---------------------------------------------------------------------------

# Sample German ticket text used for manual experiments below.
string = "Frau Hinrichs überdenkt die tu Situation a@bc.de und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"

#print(removeWords(string,["die", "neue"]))

# in:str out:str -- composed right-to-left: first replace e-mail addresses
# with the literal token EMAIL, then strip SPACE/PUNCT tokens via the
# module-level PARSER.
cleanString = compose(
    cleanText_POS,
    functools.partial(textacy.preprocess.replace_emails, replace_with=u'EMAIL')
)

# Per-field string cleaners for the ticket-metadata generator.
key_function_pairs_to_clean = {
    "Loesung":removeWhitespace,
    "Zusammenfassung":cleanText_POS
}
"""
# in:str-gen out:str-gen
cleanStream = compose(
    removeSTOP,
    lemmatize,
    cleanEnt
)
"""
# Pipeline sketch:
# content:  xml -> stringCleaning -> pipe -> docCleaning -> corpi
# metadata: xml ->                   stringCleaning       -> corpi

corpus = textacy.Corpus(PARSER)

# Build the corpus from the ticket XML; PUNCT/SPACE tokens and PERSON
# entities are filtered out. Metadata ingestion is still disabled (see the
# commented-out second argument).
corpus.add_texts(
    cleanTexts(generateMainTextfromTicketXML("ticketSamples.xml"),PARSER,["PUNCT","SPACE","PERSON"])#,
    #generateMetadatafromTicketXML("ticketSamples.xml",key_function_pairs_to_clean=key_function_pairs_to_clean)
)

print(corpus[0].text)
|
|
|