# topicModelingTickets/old/testo.py
# -*- coding: utf-8 -*-
import functools
import importlib
import re
import xml.etree.ElementTree as ET

import spacy
import textacy

PARSER = spacy.load('de')

# Stop words shipped with the loaded spaCy language package (spacy.de.STOP_WORDS).
stop_words = list(importlib.import_module("spacy." + PARSER.lang).STOP_WORDS)

def compose(*functions):
    """Compose functions right-to-left: compose(f, g)(x) == f(g(x))."""
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)
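
# Usage sketch: the rightmost function is applied first.
#   inc = lambda x: x + 1
#   dbl = lambda x: x * 2
#   compose(inc, dbl)(3)  # -> 7, i.e. inc(dbl(3))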

def cleanTexts(textstream, parser, attr):
    """Input: stream of strings; output: stream of cleaned strings.

    Drops every token whose coarse POS tag, fine-grained tag, entity type,
    text or lowercased text appears in attr.
    """
    pipe = parser.pipe(textstream)
    for doc in pipe:
        tokens = [tok.text for tok in doc
                  if tok.pos_ not in attr
                  and tok.tag_ not in attr
                  and tok.ent_type_ not in attr  # tokens expose ent_type_, not ent_
                  and tok.text not in attr
                  and tok.lower_ not in attr]
        yield " ".join(tokens)
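
# Usage sketch (assumes the 'de' model is installed; tag names depend on that model):
#   for cleaned in cleanTexts(["Hallo , Welt !"], PARSER, ["PUNCT", "SPACE"]):
#       print(cleaned)  # -> "Hallo Welt"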
"""
def cleanDoc_lemmatize(doc,parser=PARSER):
return parser(" ".join([tok.lemma_ for tok in doc ]))
def cleanDoc_STOPS(doc,parser=PARSER, stop_words=None, keep=None):
if stop_words is None:
stop_words = list(__import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS)
if hasattr(keep, '__iter__'):
for k in keep:
try:
stop_words.remove(k)
except ValueError:
pass
return parser(" ".join([tok.text for tok in doc if tok.text not in stop_words]))
def cleanDoc_ENT(doc,parser=PARSER, keeponly=False, attr=["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]):
if keeponly:
return parser(" ".join([tok.text for tok in doc if tok.ent_ in attr]))
else:
return parser(" ".join([tok.text for tok in doc if tok.ent_ not in attr]))
def cleanDoc_POS(doc,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
if keeponly:
return parser(" ".join([tok.text for tok in doc if tok.pos_ in attr]))
else:
return parser(" ".join([tok.text for tok in doc if tok.pos_ not in attr]))
"""

def cleanTexts_POS(spacypipe, keeponly=False, attr=["SPACE", "PUNCT"]):
    """
    :param spacypipe: spaCy pipe (stream of Doc objects)
    :param keeponly: bool. If True, only tokens whose tag is in attr are kept;
                     if False, those tokens are deleted.
    :param attr: [str] coarse POS tags (tok.pos_)
    :yields: stream of strings: full-length cleaned text
    """
    if keeponly:
        for doc in spacypipe:
            yield " ".join([tok.text for tok in doc if tok.pos_ in attr])
    else:
        for doc in spacypipe:
            yield " ".join([tok.text for tok in doc if tok.pos_ not in attr])
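
# Usage sketch: feed it an existing spaCy pipe.
#   pipe = PARSER.pipe(["Hallo , Welt !"])
#   for cleaned in cleanTexts_POS(pipe, attr=["PUNCT", "SPACE"]):
#       print(cleaned)  # -> "Hallo Welt"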

def cleanText_POS(text, parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
    """
    :param text: str
    :param keeponly: bool. If True, only tokens whose tag is in attr are kept;
                     if False, those tokens are deleted.
    :param attr: [str] coarse POS tags (tok.pos_)
    :return: str
    """
    doc = parser(text)
    if keeponly:
        return " ".join([tok.text for tok in doc if tok.pos_ in attr])
    else:
        return " ".join([tok.text for tok in doc if tok.pos_ not in attr])
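
# Usage sketch: with keeponly=True the filter is inverted.
#   cleanText_POS("Hallo , Welt !", keeponly=True, attr=["PUNCT"])  # -> ", !"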

def removeWhitespace(string):
    # Collapse newlines and other whitespace runs into single spaces.
    return re.sub(r'(\r\n|\r|\n|(\s)+)', ' ', string)

def removeWords(string, words):
    # \b anchors ensure whole words are removed, not substrings of longer words.
    big_regex = re.compile(r'\b(?:' + '|'.join(map(re.escape, words)) + r')\b')
    return big_regex.sub("", string)
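
# Usage sketch: matches whole words only, and leaves the surrounding spaces behind.
#   removeWords("die neue Regel", ["die", "neue"])  # -> "  Regel"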

def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung', cleaning_function=None):
    """
    Generates strings from XML, one per ticket.
    :param path2xml: path to the ticket XML file
    :param main_textfield: tag of the field holding the main text ('Beschreibung' = description)
    :param cleaning_function: optional str -> str function applied to each text
    :yields: strings
    """
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()
    for ticket in root:
        text = "ERROR"  # sentinel in case the field is missing
        for field in ticket:
            if field.tag == main_textfield:
                if cleaning_function:
                    text = cleaning_function(field.text)
                else:
                    text = field.text
        yield text
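
# Usage sketch (assumes "ticketSamples.xml" with 'Beschreibung' fields exists on disk):
#   for text in generateMainTextfromTicketXML("ticketSamples.xml"):
#       print(text)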

def generateMetadatafromTicketXML(path2xml, key_function_pairs_to_clean, leave_out=['Beschreibung']):
    """Yields one metadata dict per ticket, skipping the fields listed in leave_out
    and applying the matching cleaning function to every field named in
    key_function_pairs_to_clean."""
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()
    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag not in leave_out:
                if field.tag in key_function_pairs_to_clean:
                    metadata[field.tag] = key_function_pairs_to_clean[field.tag](field.text)
                else:
                    metadata[field.tag] = field.text
        yield metadata
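
# Usage sketch: every field except 'Beschreibung' lands in the dict, cleaned where requested.
#   for meta in generateMetadatafromTicketXML("ticketSamples.xml",
#                                             key_function_pairs_to_clean={"Loesung": removeWhitespace}):
#       print(meta)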

string = "Frau Hinrichs überdenkt die tu Situation a@bc.de und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
#print(removeWords(string, ["die", "neue"]))

# in: str, out: str -- replace e-mail addresses first, then strip unwanted POS tags
cleanString = compose(
    cleanText_POS,
    functools.partial(textacy.preprocess.replace_emails, replace_with=u'EMAIL')
)
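
# Usage sketch: compose applies right-to-left, so "a@bc.de" becomes "EMAIL"
# before cleanText_POS drops the PUNCT/SPACE tokens.
#   cleanString(string)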

key_function_pairs_to_clean = {
    "Loesung": removeWhitespace,       # Loesung = solution
    "Zusammenfassung": cleanText_POS   # Zusammenfassung = summary
}
"""
# in:str-gen out:str-gen
cleanStream = compose(
removeSTOP,
lemmatize,
cleanEnt
)
"""

# content:  xml -> string cleaning -> pipe -> doc cleaning -> corpus
# metadata: xml ->                    string cleaning      -> corpus
corpus = textacy.Corpus(PARSER)
corpus.add_texts(
    cleanTexts(generateMainTextfromTicketXML("ticketSamples.xml"), PARSER, ["PUNCT", "SPACE", "PERSON"])#,
    #generateMetadatafromTicketXML("ticketSamples.xml", key_function_pairs_to_clean=key_function_pairs_to_clean)
)
print(corpus[0].text)