Preprocessing is mostly done for now.

The config handling will be refined further.
jannis.grundmann 2017-09-11 12:12:28 +02:00
parent f09a261816
commit 4dbb07ae3f
6 changed files with 658 additions and 372 deletions

config.ini (new file, 26 lines)

@@ -0,0 +1,26 @@
[default]
thesauruspath = openthesaurus.csv
path2xml = ticketSamples.xml
language = de
[preprocessing]
ents = WORK_OF_ART,ORG,PRODUCT,LOC
custom_words = grüßen,fragen
#lemmatize = True
default_return_first_Syn = False
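The values above are read with Python's standard configparser, mirroring what the reworked preprocessing script further down does; a minimal sketch (the getboolean call for the flag is just one option, not necessarily what the script itself uses):

import configparser

config = configparser.ConfigParser()
with open("config.ini") as f:
    config.read_file(f)

language     = config.get("default", "language")                       # "de"
path2xml     = config.get("default", "path2xml")                       # "ticketSamples.xml"
thesaurus    = config.get("default", "thesauruspath")                  # "openthesaurus.csv"
ents         = config.get("preprocessing", "ents").split(",")          # ["WORK_OF_ART", "ORG", ...]
custom_words = config.get("preprocessing", "custom_words").split(",")  # ["grüßen", "fragen"]
first_syn    = config.getboolean("preprocessing", "default_return_first_Syn")  # False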

old/preprocessing.py (new file, 466 lines)

@@ -0,0 +1,466 @@
# -*- coding: utf-8 -*-
import csv
import random
import re
import sys
import spacy
import textacy

"""
import keras
import numpy as np
from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
from keras.models import Sequential
import keras.backend as K
"""

csv.field_size_limit(sys.maxsize)
"""
def getFirstSynonym(word, thesaurus_gen):
word = word.lower()
# TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
# durch den thesaurrus iterieren
for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen
# durch den synonymblock iterieren
for syn in syn_block:
syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. sätze zu identifieziren)
# falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist)
if word in syn:
# Hauptform suchen
if "auptform" in syn:
# nicht ausgeben, falls es in Klammern steht
for w in syn:
if not re.match(r'\([^)]+\)', w) and w is not None:
return w
# falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
if len(syn) == 1:
w = syn[0]
if not re.match(r'\([^)]+\)', w) and w is not None:
return w
return word # zur Not die eingabe ausgeben
"""
"""
def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):
# use preprocessing
if customPreprocessing is not None:
string = customPreprocessing(string)
if custom_stopwords is not None:
custom_stopwords = custom_stopwords
else:
custom_stopwords = []
if custom_words is not None:
custom_words = custom_words
else:
custom_words = []
if custom_symbols is not None:
custom_symbols = custom_symbols
else:
custom_symbols = []
# custom stoplist
# https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS
stoplist =list(stop_words) + custom_stopwords
# List of symbols we don't care about either
symbols = ["-----","---","...","","",".","-","<",">",",","?","!","..","nt","n't","|","||",";",":","","s","'s",".","(",")","[","]","#"] + custom_symbols
# get rid of newlines
string = string.strip().replace("\n", " ").replace("\r", " ")
# replace twitter
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
string = mentionFinder.sub("MENTION", string)
# replace emails
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
string = emailFinder.sub("EMAIL", string)
# replace urls
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
string = urlFinder.sub("URL", string)
# replace HTML symbols
string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
# parse with spaCy
spacy_doc = PARSER(string)
tokens = []
added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ in added_POS:
if lemmatize:
tokens.append(tok.lemma_.lower().strip())
else:
tokens.append(tok.text.lower().strip())
# add entities
if tok.ent_type_ in added_entities:
tokens.append(tok.text.lower())
# remove stopwords
tokens = [tok for tok in tokens if tok not in stoplist]
# remove symbols
tokens = [tok for tok in tokens if tok not in symbols]
# remove custom_words
tokens = [tok for tok in tokens if tok not in custom_words]
# remove single characters
tokens = [tok for tok in tokens if len(tok)>1]
# remove large strings of whitespace
remove_large_strings_of_whitespace(" ".join(tokens))
#idee abkürzungen auflösen (v.a. TU -> Technische Universität)
if normalize_synonyms:
tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
return " ".join(tokens)
def remove_large_strings_of_whitespace(sentence):
whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE)
sentence = whitespaceFinder.sub(" ", sentence)
tokenlist = sentence.split(" ")
while "" in tokenlist:
tokenlist.remove("")
while " " in tokenlist:
tokenlist.remove(" ")
return " ".join(tokenlist)
"""
"""
def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False):
import xml.etree.ElementTree as ET
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for ticket in root:
metadata = {}
text = "ERROR"
for field in ticket:
if field.tag == textfield:
if clean:
text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize)
else:
text = field.text
else:
#idee hier auch cleanen?
metadata[field.tag] = field.text
yield text, metadata
"""
LANGUAGE = 'de'
# PARSER = de_core_news_md.load()
PARSER = spacy.load(LANGUAGE)

from old.textCleaning import TextCleaner

cleaner = TextCleaner(parser=PARSER)


def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text)))  # ,normalize_synonyms=normalize_Synonyms, lemmatize=lemmatize)
                else:
                    text = field.text
        yield text


def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):  # ,keys_to_clean=["Loesung","Zusammenfassung"]):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag != textfield:
                if field.tag == "Zusammenfassung":
                    metadata[field.tag] = cleaner.removePunctuation(field.text)
                elif field.tag == "Loesung":
                    metadata[field.tag] = cleaner.removeWhitespace(field.text)
                else:
                    metadata[field.tag] = field.text
        yield metadata
"""
def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):
if custom_symbols is not None:
custom_symbols = custom_symbols
else:
custom_symbols = []
if keep is not None:
keep = keep
else:
keep = []
# List of symbols we don't care about
symbols = ["-----","---","...","","",".","-","<",">",",","?","!","..","nt","n't","|","||",";",":","","s","'s",".","(",")","[","]","#"] + custom_symbols
# parse with spaCy
spacy_doc = parser(string)
tokens = []
pos = ["NUM", "SPACE", "PUNCT"]
for p in keep:
pos.remove(p)
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ not in pos and tok.text not in symbols:
tokens.append(tok.text)
return " ".join(tokens)
def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):
# use preprocessing
if customPreprocessing is not None:
string = customPreprocessing(string)
if custom_stopwords is not None:
custom_stopwords = custom_stopwords
else:
custom_stopwords = []
if custom_words is not None:
custom_words = custom_words
else:
custom_words = []
# custom stoplist
# https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS
stoplist =list(stop_words) + custom_stopwords
# replace twitter
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
string = mentionFinder.sub("MENTION", string)
# replace emails
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
string = emailFinder.sub("EMAIL", string)
# replace urls
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
string = urlFinder.sub("URL", string)
# replace HTML symbols
string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
# parse with spaCy
spacy_doc = parser(string)
tokens = []
added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ in added_POS:
if lemmatize:
tokens.append(tok.lemma_.lower().strip())
else:
tokens.append(tok.text.lower().strip())
# add entities
if tok.ent_type_ in added_entities:
tokens.append(tok.text.lower())
# remove stopwords
tokens = [tok for tok in tokens if tok not in stoplist]
# remove custom_words
tokens = [tok for tok in tokens if tok not in custom_words]
# remove single characters
tokens = [tok for tok in tokens if len(tok)>1]
# remove large strings of whitespace
#remove_whitespace(" ".join(tokens))
#idee abkürzungen auflösen (v.a. TU -> Technische Universität): abkürzungsverezeichnis
if normalize_synonyms:
tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
return " ".join(set(tokens))
def cleanText_removeWhitespace(sentence):
whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
sentence = whitespaceFinder.sub(" ", sentence)
return sentence
#todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms
def getFirstSynonym(word, thesaurus_gen):
word = word.lower()
# durch den thesaurrus iterieren
for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen
for syn in syn_block:
syn = syn.lower()
if re.match(r'\A[\w-]+\Z', syn): # falls syn einzelwort ist
if word == syn:
return getHauptform(syn_block, word)
else: # falls es ein satz ist
if word in syn:
return getHauptform(syn_block, word)
return word # zur Not, das ursrpüngliche Wort zurückgeben
def getHauptform(syn_block, word, default_return_first_Syn=False):
for syn in syn_block:
syn = syn.lower()
if "hauptform" in syn and len(syn.split(" ")) <= 2:
# nicht ausgeben, falls es in Klammern steht
for w in syn.split(" "):
if not re.match(r'\([^)]+\)', w):
return w
if default_return_first_Syn:
# falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
for w in syn_block:
if not re.match(r'\([^)]+\)', w):
return w
return word # zur Not, das ursrpüngliche Wort zurückgeben
"""
def printRandomDoc(textacyCorpus):
    print()
    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()


####################'####################'####################'####################'####################'##############
# todo config-file

DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"

normalize_Synonyms = True
clean = True
lemmatize = True

custom_words = ["grüßen", "fragen"]

####################'####################'####################'####################'####################'##############

## files to textacy-corpus
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpus...")
textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))

# for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize):
#     textacyCorpus.add_text(txt, dic)

for doc in textacyCorpus:
    print(doc.metadata)
    print(doc.text)

# print(textacyCorpus[2].text)
# printRandomDoc(textacyCorpus)
# print(textacyCorpus[len(textacyCorpus)-1].text)

print()
print()

@@ -118,7 +118,7 @@ def keepinDoc(doc, toKeep=None):
     return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep])

-#todo https://mathieularose.com/function-composition-in-python/
+# https://mathieularose.com/function-composition-in-python/

 parser = spacy.load('de')
 cleaner = TextCleaner(parser)
 corpus_raw = textacy.Corpus(parser)

@@ -106,10 +106,6 @@ class TextCleaner:
         return " ".join(tokens)

-    def resolveAbbreviations(self, string):
-        return string  # todo
-
     def keepPOSandENT(self, string, customPOS=None, customEnt=None, remove=None):
         pos2keep = self.pos2keep + (customPOS if customPOS is not None else [])

@@ -142,6 +138,10 @@ class TextCleaner:

+    def resolveAbbreviations(self, string):
+        return string  # todo
+
     def removeWords(self, string, custom_words=None, keep=None, lemmatize=False):
         wordlist = self.stop_words + (custom_words if custom_words is not None else [])

@@ -176,11 +176,6 @@ class TextCleaner:
         return " ".join(set(tokens))

     def normalizeSynonyms(self, string, default_return_first_Syn=False):
         # parse with spaCy
         spacy_doc = self.parser(string)

@@ -190,8 +185,6 @@ class TextCleaner:
         return " ".join(set(tokens))

     def getFirstSynonym(self, word, thesaurus, default_return_first_Syn=False):
         if not isinstance(word, str):
             return word

@@ -1,5 +1,5 @@
-TH;Technische_Universität (Hauptform);Technische Hochschule;TU
 Passwort (Hauptform);Kodewort;Schlüsselwort;Zugangscode;Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Losungswort;Parole
+TH;Technische_Universität (Hauptform);Technische Hochschule;TU
 Fission;Kernfission;Kernspaltung;Atomspaltung
 Wiederaufnahme;Fortführung
 davonfahren;abdüsen (ugs.);aufbrechen;abfliegen;abfahren;(von etwas) fortfahren;abreisen;wegfahren;wegfliegen
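
Each row of openthesaurus.csv is one semicolon-separated block of synonyms, and the preferred term carries the "(Hauptform)" marker. A condensed, self-contained sketch of the lookup that getFirstSynonym/getHauptform perform on such rows (simplified; the inline THESAURUS rows are only an illustration):

import re

THESAURUS = [
    ["Passwort (Hauptform)", "Kodewort", "Schlüsselwort", "Kennwort (Hauptform)"],
    ["TH", "Technische_Universität (Hauptform)", "Technische Hochschule", "TU"],
]

def first_synonym(word, thesaurus):
    word = word.lower()
    for syn_block in thesaurus:                       # one row = one block of synonyms
        if any(word == syn.lower() or word in syn.lower().split() for syn in syn_block):
            # prefer the entry marked "(Hauptform)", with the marker stripped
            for syn in syn_block:
                syn = syn.lower()
                if "hauptform" in syn and len(syn.split(" ")) <= 2:
                    for w in syn.split(" "):
                        if not re.match(r'\([^)]+\)', w):
                            return w
            return word                               # block found, but no usable Hauptform
    return word                                       # word not found in the thesaurus

print(first_synonym("Kodewort", THESAURUS))   # -> "passwort"
print(first_synonym("TU", THESAURUS))         # -> "technische_universität"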


Rewritten preprocessing script (the previous version is preserved above as old/preprocessing.py; only the new version is shown here):

@@ -1,389 +1,190 @@
# -*- coding: utf-8 -*-
import csv
import functools
import re
import spacy
import sys
import textacy
import xml.etree.ElementTree as ET
import io

csv.field_size_limit(sys.maxsize)


# Load the configuration file
import configparser as ConfigParser

config = ConfigParser.ConfigParser()
with open("config.ini") as f:
    config.read_file(f)


PARSER = spacy.load(config.get("default", "language"))
corpus = textacy.Corpus(PARSER)

thesauruspath = config.get("default", "thesauruspath")
THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))


def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)
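
compose chains the single-purpose cleaning closures defined further down into one callable; the right-most argument runs first. A small illustration with plain string functions (hypothetical helpers, only to show the ordering):

import functools

def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)

shout  = lambda s: s.upper()        # hypothetical helper
excite = lambda s: s + "!"          # hypothetical helper
strip_ = lambda s: s.strip()        # hypothetical helper

pipeline = compose(shout, excite, strip_)   # strip_ runs first, shout runs last
print(pipeline("  ticket text  "))          # -> "TICKET TEXT!"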
################ generate Content and Metadata ########################

def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
    """
    generates strings from XML

    :param path2xml:
    :param main_textfield:
    :param cleaning_function:
    :yields strings
    """
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()
    for ticket in root:
        for field in ticket:
            if field.tag == main_textfield:
                yield field.text


def generateMetadatafromTicketXML(path2xml, leave_out=['Beschreibung']):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()
    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag not in leave_out:
                metadata[field.tag] = field.text
        yield metadata


def printRandomDoc(textacyCorpus):
    import random
    print()
    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()
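
For reference, a tiny hypothetical ticket XML run through the two generators above (assumes the generators are in scope; the real ticketSamples.xml may contain more fields, only Beschreibung, Zusammenfassung and Loesung are assumed here):

sample = '''<?xml version="1.0" encoding="utf-8"?>
<tickets>
  <ticket>
    <Beschreibung>Mein Passwort funktioniert nicht.</Beschreibung>
    <Zusammenfassung>Passwort-Problem</Zusammenfassung>
    <Loesung>Kennwort zurückgesetzt.</Loesung>
  </ticket>
</tickets>'''

with open("sample_tickets.xml", "w", encoding="utf-8") as f:
    f.write(sample)

print(list(generateMainTextfromTicketXML("sample_tickets.xml")))
# -> ['Mein Passwort funktioniert nicht.']
print(list(generateMetadatafromTicketXML("sample_tickets.xml")))
# -> [{'Zusammenfassung': 'Passwort-Problem', 'Loesung': 'Kennwort zurückgesetzt.'}]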
""" ################ Preprocess#########################
def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):
if custom_symbols is not None: def processDictstream(dictstream, funcdict, parser=PARSER):
custom_symbols = custom_symbols for dic in dictstream:
result = {}
for key, value in dic.items():
if key in funcdict:
result[key] = funcdict[key](parser(value))
else: else:
custom_symbols = [] result[key] = key
yield result
if keep is not None: def processTextstream(textstream, func, parser=PARSER):
keep = keep # input str-stream output str-stream
else: pipe = parser.pipe(textstream)
keep = []
# List of symbols we don't care about for doc in pipe:
symbols = ["-----","---","...","","",".","-","<",">",",","?","!","..","nt","n't","|","||",";",":","","s","'s",".","(",")","[","]","#"] + custom_symbols yield func(doc)
# parse with spaCy
spacy_doc = parser(string)
tokens = []
pos = ["NUM", "SPACE", "PUNCT"]
for p in keep:
pos.remove(p)
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ not in pos and tok.text not in symbols:
tokens.append(tok.text)
return " ".join(tokens)
def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):
# use preprocessing
if customPreprocessing is not None:
string = customPreprocessing(string)
if custom_stopwords is not None:
custom_stopwords = custom_stopwords
else:
custom_stopwords = []
if custom_words is not None:
custom_words = custom_words
else:
custom_words = []
# custom stoplist
# https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS
stoplist =list(stop_words) + custom_stopwords
# replace twitter
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
string = mentionFinder.sub("MENTION", string)
# replace emails
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
string = emailFinder.sub("EMAIL", string)
# replace urls
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
string = urlFinder.sub("URL", string)
# replace HTML symbols
string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
# parse with spaCy
spacy_doc = parser(string)
tokens = []
added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] def keepOnlyPOS(pos_list, parser=PARSER):
added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 return lambda doc : parser(" ".join([tok.text for tok in doc if tok.pos_ in pos_list]))
# append Tokens to a list def removeAllPOS(pos_list, parser=PARSER):
for tok in spacy_doc: return lambda doc: parser(" ".join([tok.text for tok in doc if tok.pos_ not in pos_list]))
if tok.pos_ in added_POS:
if lemmatize:
tokens.append(tok.lemma_.lower().strip())
else:
tokens.append(tok.text.lower().strip())
# add entities def keepOnlyENT(ent_list,parser=PARSER):
if tok.ent_type_ in added_entities: return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ in ent_list]))
tokens.append(tok.text.lower())
def removeAllENT(ent_list, parser=PARSER):
return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in ent_list]))
# remove stopwords
tokens = [tok for tok in tokens if tok not in stoplist]
# remove custom_words doc2Set = lambda doc: str(set([tok.text for tok in doc]))
tokens = [tok for tok in tokens if tok not in custom_words] doc2String = lambda doc : doc.text
# remove single characters
tokens = [tok for tok in tokens if len(tok)>1]
# remove large strings of whitespace
#remove_whitespace(" ".join(tokens))
#idee abkürzungen auflösen (v.a. TU -> Technische Universität): abkürzungsverezeichnis mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
if normalize_synonyms: def replaceURLs(replace_with="URL",parser=PARSER):
tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] #return lambda doc: parser(textacy.preprocess.replace_urls(doc.text,replace_with=replace_with))
return lambda doc: parser(urlFinder.sub(replace_with,doc.text))
return " ".join(set(tokens)) def replaceEmails(replace_with="EMAIL",parser=PARSER):
#return lambda doc: parser(textacy.preprocess.replace_emails(doc.text,replace_with=replace_with))
return lambda doc : parser(emailFinder.sub(replace_with, doc.text))
def cleanText_removeWhitespace(sentence): def replaceTwitterMentions(replace_with="TWITTER_MENTION",parser=PARSER):
whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) return lambda doc : parser(mentionFinder.sub(replace_with, doc.text))
sentence = whitespaceFinder.sub(" ", sentence)
return sentence
#todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms def replaceNumbers(replace_with="NUMBER",parser=PARSER):
return lambda doc: parser(textacy.preprocess.replace_numbers(doc.text, replace_with=replace_with))
def replacePhonenumbers(replace_with="PHONE",parser=PARSER):
return lambda doc: parser(textacy.preprocess.replace_phone_numbers(doc.text, replace_with=replace_with))
def getFirstSynonym(word, thesaurus_gen):
def resolveAbbreviations(parser=PARSER):
pass #todo
def removeWords(words, keep=None,parser=PARSER):
if hasattr(keep, '__iter__'):
for k in keep:
try:
words.remove(k)
except ValueError:
pass
return lambda doc : parser(" ".join([tok.text for tok in doc if tok.lower_ not in words]))
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
#return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))
def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus:  # syn_block is a list of synonyms
        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
                if word == syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
            else:  # if it is a sentence
                if word in syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))

    return str(word)  # as a fallback, return the original word


def getHauptform(syn_block, word, default_return_first_Syn=False):
    for syn in syn_block:
        syn = syn.lower()

        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return it if it is in parentheses  # todo: does that ever happen? strip the parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no Hauptform is present, return the first synonym that is not a sentence and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w

    return word  # as a fallback, return the original word
"""
def printRandomDoc(textacyCorpus):
print()
print("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
print()
####################'####################'####################'####################'####################'##############
# todo config-file
import de_core_news_md
DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"
normalize_Synonyms = True stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",")
clean = True
lemmatize = True path2xml = config.get("default","path2xml")
content_generator = generateMainTextfromTicketXML(path2xml)
metadata_generator = generateMetadatafromTicketXML(path2xml)
ents = config.get("preprocessing","ents").split(",")
clean_in_content=compose(
doc2String,
#normalizeSynonyms(default_return_first_Syn=config.get("preprocessing","default_return_first_Syn")),
replaceEmails(),
replaceURLs(),
replaceTwitterMentions(),
removeWords(stop_words),
#removeAllPOS(["SPACE","PUNCT"]),
#removeAllENT(ents),
keepOnlyPOS(['NOUN'])
)
clean_in_meta = {
"Loesung":removeAllPOS(["SPACE"]),
"Zusammenfassung":removeAllPOS(["SPACE","PUNCT"])
}
contentStream = processTextstream(content_generator, func=clean_in_content)
metaStream = processDictstream(metadata_generator, funcdict=clean_in_meta)
corpus.add_texts(contentStream,metaStream)
print(corpus[0].text)
printRandomDoc(corpus)
custom_words = ["grüßen", "fragen"]
####################'####################'####################'####################'####################'##############
## files to textacy-corpus
textacyCorpus = textacy.Corpus(PARSER)
print("add texts to textacy-corpus...")
textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))
#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize):
# textacyCorpus.add_text(txt,dic)
for doc in textacyCorpus:
print(doc.metadata)
print(doc.text)
#print(textacyCorpus[2].text)
#printRandomDoc(textacyCorpus)
#print(textacyCorpus[len(textacyCorpus)-1].text)
print()
print()
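
The builders in the rewritten script all return Doc -> Doc closures, which is what makes the compose(...) chain in clean_in_content possible. A minimal sketch of the same pattern on a single string (assumes the script above has run so PARSER, compose and the builders exist, the spaCy 'de' model is installed, and the two stopwords are illustrative):

clean = compose(
    doc2String,                        # last step: back to a plain string
    removeWords({"bitte", "danke"}),   # then drop two illustrative stopwords
    replaceEmails(),                   # first step: mask e-mail addresses
)

print(clean(PARSER("Bitte an support@example.com schreiben, danke!")))
# e-mail masked as EMAIL, "bitte"/"danke" removed, remaining tokens joined with spaces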