From 4dbb07ae3fcfd76128a954df8787731da934f566 Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Mon, 11 Sep 2017 12:12:28 +0200 Subject: [PATCH] preprocessing erstmal soweit fertig. das mit der config wird noch verfeinert --- config.ini | 26 ++ old/preprocessing.py | 466 ++++++++++++++++++++++ test.py => old/test.py | 2 +- textCleaning.py => old/textCleaning.py | 15 +- openthesaurus.csv | 2 +- preprocessing.py | 519 ++++++++----------------- 6 files changed, 658 insertions(+), 372 deletions(-) create mode 100644 config.ini create mode 100644 old/preprocessing.py rename test.py => old/test.py (99%) rename textCleaning.py => old/textCleaning.py (99%) diff --git a/config.ini b/config.ini new file mode 100644 index 0000000..5612339 --- /dev/null +++ b/config.ini @@ -0,0 +1,26 @@ +[default] + +thesauruspath = openthesaurus.csv +path2xml = ticketSamples.xml + +language = de + +[preprocessing] + +ents = WORK_OF_ART,ORG,PRODUCT,LOC + +custom_words = grüßen,fragen + +#lemmatize = True + +default_return_first_Syn = False + + + + + + + + + + diff --git a/old/preprocessing.py b/old/preprocessing.py new file mode 100644 index 0000000..6bd8c3e --- /dev/null +++ b/old/preprocessing.py @@ -0,0 +1,466 @@ +# -*- coding: utf-8 -*- +import csv +import random +import sys + +import spacy +import textacy + +""" +import keras +import numpy as np +from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout +from keras.models import Sequential +import keras.backend as K +""" +csv.field_size_limit(sys.maxsize) + +""" +def getFirstSynonym(word, thesaurus_gen): + + word = word.lower() + # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python + + + # durch den thesaurrus iterieren + for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen + + # durch den synonymblock iterieren + for syn in syn_block: + syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. sätze zu identifieziren) + + # falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist) + if word in syn: + + # Hauptform suchen + if "auptform" in syn: + # nicht ausgeben, falls es in Klammern steht + for w in syn: + if not re.match(r'\([^)]+\)', w) and w is not None: + return w + + # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht + if len(syn) == 1: + w = syn[0] + if not re.match(r'\([^)]+\)', w) and w is not None: + return w + + return word # zur Not die eingabe ausgeben + + +""" +""" +def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False): + + # use preprocessing + if customPreprocessing is not None: + string = customPreprocessing(string) + + + + if custom_stopwords is not None: + custom_stopwords = custom_stopwords + else: + custom_stopwords = [] + + if custom_words is not None: + custom_words = custom_words + else: + custom_words = [] + + if custom_symbols is not None: + custom_symbols = custom_symbols + else: + custom_symbols = [] + + + # custom stoplist + # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import + stop_words = __import__("spacy." 
+ PARSER.lang, globals(), locals(), ['object']).STOP_WORDS + + stoplist =list(stop_words) + custom_stopwords + # List of symbols we don't care about either + symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols + + + + # get rid of newlines + string = string.strip().replace("\n", " ").replace("\r", " ") + + # replace twitter + mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) + string = mentionFinder.sub("MENTION", string) + + # replace emails + emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) + string = emailFinder.sub("EMAIL", string) + + # replace urls + urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + string = urlFinder.sub("URL", string) + + # replace HTML symbols + string = string.replace("&", "and").replace(">", ">").replace("<", "<") + + + + + # parse with spaCy + spacy_doc = PARSER(string) + tokens = [] + + added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] + added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ in added_POS: + if lemmatize: + tokens.append(tok.lemma_.lower().strip()) + else: + tokens.append(tok.text.lower().strip()) + + # add entities + if tok.ent_type_ in added_entities: + tokens.append(tok.text.lower()) + + + + # remove stopwords + tokens = [tok for tok in tokens if tok not in stoplist] + + # remove symbols + tokens = [tok for tok in tokens if tok not in symbols] + + # remove custom_words + tokens = [tok for tok in tokens if tok not in custom_words] + + # remove single characters + tokens = [tok for tok in tokens if len(tok)>1] + + # remove large strings of whitespace + remove_large_strings_of_whitespace(" ".join(tokens)) + + + #idee abkürzungen auflösen (v.a. TU -> Technische Universität) + + if normalize_synonyms: + tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] + + return " ".join(tokens) + + +def remove_large_strings_of_whitespace(sentence): + + whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE) + sentence = whitespaceFinder.sub(" ", sentence) + + tokenlist = sentence.split(" ") + + while "" in tokenlist: + tokenlist.remove("") + while " " in tokenlist: + tokenlist.remove(" ") + + return " ".join(tokenlist) +""" +""" +def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False): + import xml.etree.ElementTree as ET + + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + root = tree.getroot() + + for ticket in root: + metadata = {} + text = "ERROR" + for field in ticket: + if field.tag == textfield: + if clean: + text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) + else: + text = field.text + else: + #idee hier auch cleanen? 
+ metadata[field.tag] = field.text + yield text, metadata +""" + + +LANGUAGE = 'de' +#PARSER = de_core_news_md.load() +PARSER = spacy.load(LANGUAGE) + +from old.textCleaning import TextCleaner + +cleaner = TextCleaner(parser=PARSER) + + +def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False): + import xml.etree.ElementTree as ET + + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + root = tree.getroot() + + + for ticket in root: + text = "ERROR" + for field in ticket: + if field.tag == textfield: + if clean: + text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text))) #,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) + else: + text = field.text + yield text + +def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]): + import xml.etree.ElementTree as ET + + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + + root = tree.getroot() + + for ticket in root: + metadata = {} + for field in ticket: + if field.tag != textfield: + if field.tag == "Zusammenfassung": + metadata[field.tag] = cleaner.removePunctuation(field.text) + elif field.tag == "Loesung": + metadata[field.tag] = cleaner.removeWhitespace(field.text) + else: + metadata[field.tag] = field.text + + yield metadata + + + + +""" +def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): + + if custom_symbols is not None: + custom_symbols = custom_symbols + else: + custom_symbols = [] + + if keep is not None: + keep = keep + else: + keep = [] + + # List of symbols we don't care about + symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols + + # parse with spaCy + spacy_doc = parser(string) + tokens = [] + + pos = ["NUM", "SPACE", "PUNCT"] + for p in keep: + pos.remove(p) + + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ not in pos and tok.text not in symbols: + tokens.append(tok.text) + + return " ".join(tokens) + +def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False): + + # use preprocessing + if customPreprocessing is not None: + string = customPreprocessing(string) + + if custom_stopwords is not None: + custom_stopwords = custom_stopwords + else: + custom_stopwords = [] + + if custom_words is not None: + custom_words = custom_words + else: + custom_words = [] + + + # custom stoplist + # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import + stop_words = __import__("spacy." 
+ parser.lang, globals(), locals(), ['object']).STOP_WORDS + + stoplist =list(stop_words) + custom_stopwords + + # replace twitter + mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) + string = mentionFinder.sub("MENTION", string) + + # replace emails + emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) + string = emailFinder.sub("EMAIL", string) + + # replace urls + urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + string = urlFinder.sub("URL", string) + + # replace HTML symbols + string = string.replace("&", "and").replace(">", ">").replace("<", "<") + + + + # parse with spaCy + spacy_doc = parser(string) + tokens = [] + + added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] + added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ in added_POS: + if lemmatize: + tokens.append(tok.lemma_.lower().strip()) + else: + tokens.append(tok.text.lower().strip()) + + # add entities + if tok.ent_type_ in added_entities: + tokens.append(tok.text.lower()) + + + + # remove stopwords + tokens = [tok for tok in tokens if tok not in stoplist] + + # remove custom_words + tokens = [tok for tok in tokens if tok not in custom_words] + + # remove single characters + tokens = [tok for tok in tokens if len(tok)>1] + + # remove large strings of whitespace + #remove_whitespace(" ".join(tokens)) + + + #idee abkürzungen auflösen (v.a. TU -> Technische Universität): abkürzungsverezeichnis + + if normalize_synonyms: + tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] + + return " ".join(set(tokens)) + +def cleanText_removeWhitespace(sentence): + whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) + sentence = whitespaceFinder.sub(" ", sentence) + return sentence + +#todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms + + +def getFirstSynonym(word, thesaurus_gen): + + word = word.lower() + + + # durch den thesaurrus iterieren + for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen + + for syn in syn_block: + syn = syn.lower() + if re.match(r'\A[\w-]+\Z', syn): # falls syn einzelwort ist + if word == syn: + return getHauptform(syn_block, word) + else: # falls es ein satz ist + if word in syn: + return getHauptform(syn_block, word) + return word # zur Not, das ursrpüngliche Wort zurückgeben + +def getHauptform(syn_block, word, default_return_first_Syn=False): + + for syn in syn_block: + syn = syn.lower() + + if "hauptform" in syn and len(syn.split(" ")) <= 2: + # nicht ausgeben, falls es in Klammern steht + for w in syn.split(" "): + if not re.match(r'\([^)]+\)', w): + return w + + if default_return_first_Syn: + # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht + for w in syn_block: + if not re.match(r'\([^)]+\)', w): + return w + return word # zur Not, das ursrpüngliche Wort zurückgeben +""" + +def printRandomDoc(textacyCorpus): + print() + + print("len(textacyCorpus) = %i" % len(textacyCorpus)) + randIndex = int((len(textacyCorpus) - 1) * random.random()) + print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) + + print() + 
+####################'####################'####################'####################'####################'############## +# todo config-file + +DATAPATH = "ticketSamples.xml" +DATAPATH_thesaurus = "openthesaurus.csv" + + + +normalize_Synonyms = True +clean = True +lemmatize = True + +custom_words = ["grüßen", "fragen"] + +####################'####################'####################'####################'####################'############## + + +## files to textacy-corpus +textacyCorpus = textacy.Corpus(PARSER) + +print("add texts to textacy-corpus...") +textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH)) + + +#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize): +# textacyCorpus.add_text(txt,dic) + + + +for doc in textacyCorpus: + print(doc.metadata) + print(doc.text) + +#print(textacyCorpus[2].text) +#printRandomDoc(textacyCorpus) +#print(textacyCorpus[len(textacyCorpus)-1].text) + + +print() +print() + + + + + + + + + + + + + + + + + + + + + diff --git a/test.py b/old/test.py similarity index 99% rename from test.py rename to old/test.py index 9560698..fc2ee00 100644 --- a/test.py +++ b/old/test.py @@ -118,7 +118,7 @@ def keepinDoc(doc, toKeep=None): return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]) -#todo https://mathieularose.com/function-composition-in-python/ +# https://mathieularose.com/function-composition-in-python/ parser = spacy.load('de') cleaner = TextCleaner(parser) corpus_raw = textacy.Corpus(parser) diff --git a/textCleaning.py b/old/textCleaning.py similarity index 99% rename from textCleaning.py rename to old/textCleaning.py index dae6afb..da2fcd3 100644 --- a/textCleaning.py +++ b/old/textCleaning.py @@ -106,10 +106,6 @@ class TextCleaner: return " ".join(tokens) - def resolveAbbreviations(self,string): - return string #todo - - def keepPOSandENT(self, string, customPOS=None, customEnt=None, remove=None): pos2keep = self.pos2keep + (customPOS if customPOS is not None else []) @@ -142,6 +138,10 @@ class TextCleaner: + + + def resolveAbbreviations(self,string): + return string #todo def removeWords(self,string, custom_words=None, keep=None, lemmatize=False): wordlist = self.stop_words + (custom_words if custom_words is not None else []) @@ -176,11 +176,6 @@ class TextCleaner: return " ".join(set(tokens)) - - - - - def normalizeSynonyms(self, string, default_return_first_Syn=False): # parse with spaCy spacy_doc = self.parser(string) @@ -190,8 +185,6 @@ class TextCleaner: return " ".join(set(tokens)) - - def getFirstSynonym(self,word, thesaurus, default_return_first_Syn=False): if not isinstance(word, str): return word diff --git a/openthesaurus.csv b/openthesaurus.csv index caad708..e0c28df 100644 --- a/openthesaurus.csv +++ b/openthesaurus.csv @@ -1,5 +1,5 @@ -TH;Technische_Universität (Hauptform);Technische Hochschule;TU Passwort (Hauptform);Kodewort;Schlüsselwort;Zugangscode;Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Losungswort;Parole +TH;Technische_Universität (Hauptform);Technische Hochschule;TU Fission;Kernfission;Kernspaltung;Atomspaltung Wiederaufnahme;Fortführung davonfahren;abdüsen (ugs.);aufbrechen;abfliegen;abfahren;(von etwas) fortfahren;abreisen;wegfahren;wegfliegen diff --git a/preprocessing.py b/preprocessing.py index 89b6317..70cb127 100644 --- a/preprocessing.py +++ 
b/preprocessing.py @@ -1,389 +1,190 @@ # -*- coding: utf-8 -*- import csv -import random +import functools import re - import spacy -import textacy import sys - +import textacy import xml.etree.ElementTree as ET -""" -import keras -import numpy as np -from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout -from keras.models import Sequential -import keras.backend as K -""" +import io csv.field_size_limit(sys.maxsize) -""" -def getFirstSynonym(word, thesaurus_gen): - - word = word.lower() - # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python - # durch den thesaurrus iterieren - for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen - - # durch den synonymblock iterieren - for syn in syn_block: - syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. sätze zu identifieziren) - - # falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist) - if word in syn: - - # Hauptform suchen - if "auptform" in syn: - # nicht ausgeben, falls es in Klammern steht - for w in syn: - if not re.match(r'\([^)]+\)', w) and w is not None: - return w - - # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht - if len(syn) == 1: - w = syn[0] - if not re.match(r'\([^)]+\)', w) and w is not None: - return w - - return word # zur Not die eingabe ausgeben +# Load the configuration file +import configparser as ConfigParser +config = ConfigParser.ConfigParser() +with open("config.ini") as f: + config.read_file(f) -""" -""" -def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False): +PARSER = spacy.load(config.get("default","language")) +corpus = textacy.Corpus(PARSER) - # use preprocessing - if customPreprocessing is not None: - string = customPreprocessing(string) +thesauruspath = config.get("default","thesauruspath") +THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";")) - if custom_stopwords is not None: - custom_stopwords = custom_stopwords - else: - custom_stopwords = [] - - if custom_words is not None: - custom_words = custom_words - else: - custom_words = [] - - if custom_symbols is not None: - custom_symbols = custom_symbols - else: - custom_symbols = [] +def compose(*functions): + def compose2(f, g): + return lambda x: f(g(x)) + return functools.reduce(compose2, functions, lambda x: x) - # custom stoplist - # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import - stop_words = __import__("spacy." 
+ PARSER.lang, globals(), locals(), ['object']).STOP_WORDS +################ generate Content and Metadata ######################## - stoplist =list(stop_words) + custom_stopwords - # List of symbols we don't care about either - symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols +def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'): + """ + generates strings from XML + :param path2xml: + :param main_textfield: + :param cleaning_function: + :yields strings + """ + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + root = tree.getroot() + for ticket in root: + for field in ticket: + if field.tag == main_textfield: + yield field.text - - # get rid of newlines - string = string.strip().replace("\n", " ").replace("\r", " ") - - # replace twitter - mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) - string = mentionFinder.sub("MENTION", string) - - # replace emails - emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) - string = emailFinder.sub("EMAIL", string) - - # replace urls - urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) - string = urlFinder.sub("URL", string) - - # replace HTML symbols - string = string.replace("&", "and").replace(">", ">").replace("<", "<") - - - - - # parse with spaCy - spacy_doc = PARSER(string) - tokens = [] - - added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] - added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 - - # append Tokens to a list - for tok in spacy_doc: - if tok.pos_ in added_POS: - if lemmatize: - tokens.append(tok.lemma_.lower().strip()) - else: - tokens.append(tok.text.lower().strip()) - - # add entities - if tok.ent_type_ in added_entities: - tokens.append(tok.text.lower()) - - - - # remove stopwords - tokens = [tok for tok in tokens if tok not in stoplist] - - # remove symbols - tokens = [tok for tok in tokens if tok not in symbols] - - # remove custom_words - tokens = [tok for tok in tokens if tok not in custom_words] - - # remove single characters - tokens = [tok for tok in tokens if len(tok)>1] - - # remove large strings of whitespace - remove_large_strings_of_whitespace(" ".join(tokens)) - - - #idee abkürzungen auflösen (v.a. TU -> Technische Universität) - - if normalize_synonyms: - tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] - - return " ".join(tokens) - - -def remove_large_strings_of_whitespace(sentence): - - whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE) - sentence = whitespaceFinder.sub(" ", sentence) - - tokenlist = sentence.split(" ") - - while "" in tokenlist: - tokenlist.remove("") - while " " in tokenlist: - tokenlist.remove(" ") - - return " ".join(tokenlist) -""" -""" -def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False): - import xml.etree.ElementTree as ET - +def generateMetadatafromTicketXML(path2xml, leave_out=['Beschreibung']): tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) root = tree.getroot() for ticket in root: metadata = {} - text = "ERROR" for field in ticket: - if field.tag == textfield: - if clean: - text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) - else: - text = field.text - else: - #idee hier auch cleanen? 
- metadata[field.tag] = field.text - yield text, metadata -""" + if field.tag not in leave_out: - -LANGUAGE = 'de' -#PARSER = de_core_news_md.load() -PARSER = spacy.load(LANGUAGE) - -from textCleaning import TextCleaner - -cleaner = TextCleaner(parser=PARSER) - - -def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False): - import xml.etree.ElementTree as ET - - tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) - root = tree.getroot() - - - for ticket in root: - text = "ERROR" - for field in ticket: - if field.tag == textfield: - if clean: - text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text))) #,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) - else: - text = field.text - yield text - -def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]): - import xml.etree.ElementTree as ET - - tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) - - root = tree.getroot() - - for ticket in root: - metadata = {} - for field in ticket: - if field.tag != textfield: - if field.tag == "Zusammenfassung": - metadata[field.tag] = cleaner.removePunctuation(field.text) - elif field.tag == "Loesung": - metadata[field.tag] = cleaner.removeWhitespace(field.text) - else: metadata[field.tag] = field.text yield metadata +def printRandomDoc(textacyCorpus): + import random + print() + + print("len(textacyCorpus) = %i" % len(textacyCorpus)) + randIndex = int((len(textacyCorpus) - 1) * random.random()) + print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) + + print() + +################ Preprocess######################### + +def processDictstream(dictstream, funcdict, parser=PARSER): + for dic in dictstream: + result = {} + for key, value in dic.items(): + if key in funcdict: + result[key] = funcdict[key](parser(value)) + else: + result[key] = key + yield result + +def processTextstream(textstream, func, parser=PARSER): + # input str-stream output str-stream + pipe = parser.pipe(textstream) + + for doc in pipe: + yield func(doc) -""" -def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): - if custom_symbols is not None: - custom_symbols = custom_symbols - else: - custom_symbols = [] +def keepOnlyPOS(pos_list, parser=PARSER): + return lambda doc : parser(" ".join([tok.text for tok in doc if tok.pos_ in pos_list])) - if keep is not None: - keep = keep - else: - keep = [] +def removeAllPOS(pos_list, parser=PARSER): + return lambda doc: parser(" ".join([tok.text for tok in doc if tok.pos_ not in pos_list])) - # List of symbols we don't care about - symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols +def keepOnlyENT(ent_list,parser=PARSER): + return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ in ent_list])) - # parse with spaCy - spacy_doc = parser(string) - tokens = [] - - pos = ["NUM", "SPACE", "PUNCT"] - for p in keep: - pos.remove(p) - - - # append Tokens to a list - for tok in spacy_doc: - if tok.pos_ not in pos and tok.text not in symbols: - tokens.append(tok.text) - - return " ".join(tokens) - -def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False): - - # use preprocessing - if customPreprocessing 
is not None: - string = customPreprocessing(string) - - if custom_stopwords is not None: - custom_stopwords = custom_stopwords - else: - custom_stopwords = [] - - if custom_words is not None: - custom_words = custom_words - else: - custom_words = [] - - - # custom stoplist - # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import - stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS - - stoplist =list(stop_words) + custom_stopwords - - # replace twitter - mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) - string = mentionFinder.sub("MENTION", string) - - # replace emails - emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) - string = emailFinder.sub("EMAIL", string) - - # replace urls - urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) - string = urlFinder.sub("URL", string) - - # replace HTML symbols - string = string.replace("&", "and").replace(">", ">").replace("<", "<") +def removeAllENT(ent_list, parser=PARSER): + return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in ent_list])) - # parse with spaCy - spacy_doc = parser(string) - tokens = [] - added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] - added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 +doc2Set = lambda doc: str(set([tok.text for tok in doc])) +doc2String = lambda doc : doc.text - # append Tokens to a list - for tok in spacy_doc: - if tok.pos_ in added_POS: - if lemmatize: - tokens.append(tok.lemma_.lower().strip()) - else: - tokens.append(tok.text.lower().strip()) - # add entities - if tok.ent_type_ in added_entities: - tokens.append(tok.text.lower()) +mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) +emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) +urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + +def replaceURLs(replace_with="URL",parser=PARSER): + #return lambda doc: parser(textacy.preprocess.replace_urls(doc.text,replace_with=replace_with)) + return lambda doc: parser(urlFinder.sub(replace_with,doc.text)) + +def replaceEmails(replace_with="EMAIL",parser=PARSER): + #return lambda doc: parser(textacy.preprocess.replace_emails(doc.text,replace_with=replace_with)) + return lambda doc : parser(emailFinder.sub(replace_with, doc.text)) + +def replaceTwitterMentions(replace_with="TWITTER_MENTION",parser=PARSER): + return lambda doc : parser(mentionFinder.sub(replace_with, doc.text)) + +def replaceNumbers(replace_with="NUMBER",parser=PARSER): + return lambda doc: parser(textacy.preprocess.replace_numbers(doc.text, replace_with=replace_with)) + +def replacePhonenumbers(replace_with="PHONE",parser=PARSER): + return lambda doc: parser(textacy.preprocess.replace_phone_numbers(doc.text, replace_with=replace_with)) - # remove stopwords - tokens = [tok for tok in tokens if tok not in stoplist] - - # remove custom_words - tokens = [tok for tok in tokens if tok not in custom_words] - - # remove single characters - tokens = [tok for tok in tokens if len(tok)>1] - - # remove large strings of whitespace - #remove_whitespace(" ".join(tokens)) - #idee abkürzungen auflösen (v.a. 
TU -> Technische Universität): abkürzungsverezeichnis - if normalize_synonyms: - tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] - - return " ".join(set(tokens)) - -def cleanText_removeWhitespace(sentence): - whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) - sentence = whitespaceFinder.sub(" ", sentence) - return sentence - -#todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms +def resolveAbbreviations(parser=PARSER): + pass #todo -def getFirstSynonym(word, thesaurus_gen): +def removeWords(words, keep=None,parser=PARSER): + if hasattr(keep, '__iter__'): + for k in keep: + try: + words.remove(k) + except ValueError: + pass + return lambda doc : parser(" ".join([tok.text for tok in doc if tok.lower_ not in words])) + + + + +def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER): + #return lambda doc : parser(" ".join([tok.lower_ for tok in doc])) + return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc])) + +def getFirstSynonym(word, thesaurus, default_return_first_Syn=False): + if not isinstance(word, str): + return str(word) word = word.lower() - # durch den thesaurrus iterieren - for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen + for syn_block in thesaurus: # syn_block ist eine liste mit Synonymen for syn in syn_block: syn = syn.lower() if re.match(r'\A[\w-]+\Z', syn): # falls syn einzelwort ist if word == syn: - return getHauptform(syn_block, word) + return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)) else: # falls es ein satz ist if word in syn: - return getHauptform(syn_block, word) - return word # zur Not, das ursrpüngliche Wort zurückgeben + return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)) + return str(word) # zur Not, das ursrpüngliche Wort zurückgeben def getHauptform(syn_block, word, default_return_first_Syn=False): - for syn in syn_block: syn = syn.lower() if "hauptform" in syn and len(syn.split(" ")) <= 2: - # nicht ausgeben, falls es in Klammern steht + # nicht ausgeben, falls es in Klammern steht#todo gibts macnmal?? klammern aus for w in syn.split(" "): if not re.match(r'\([^)]+\)', w): return w @@ -394,58 +195,58 @@ def getHauptform(syn_block, word, default_return_first_Syn=False): if not re.match(r'\([^)]+\)', w): return w return word # zur Not, das ursrpüngliche Wort zurückgeben -""" - -def printRandomDoc(textacyCorpus): - print() - - print("len(textacyCorpus) = %i" % len(textacyCorpus)) - randIndex = int((len(textacyCorpus) - 1) * random.random()) - print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) - - print() - -####################'####################'####################'####################'####################'############## -# todo config-file - -import de_core_news_md -DATAPATH = "ticketSamples.xml" -DATAPATH_thesaurus = "openthesaurus.csv" -normalize_Synonyms = True -clean = True -lemmatize = True +stop_words=list(__import__("spacy." 
+ PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",") + +path2xml = config.get("default","path2xml") + + + +content_generator = generateMainTextfromTicketXML(path2xml) +metadata_generator = generateMetadatafromTicketXML(path2xml) + + + +ents = config.get("preprocessing","ents").split(",") + + + +clean_in_content=compose( + + doc2String, + #normalizeSynonyms(default_return_first_Syn=config.get("preprocessing","default_return_first_Syn")), + replaceEmails(), + replaceURLs(), + replaceTwitterMentions(), + removeWords(stop_words), + #removeAllPOS(["SPACE","PUNCT"]), + #removeAllENT(ents), + keepOnlyPOS(['NOUN']) +) + +clean_in_meta = { + "Loesung":removeAllPOS(["SPACE"]), + "Zusammenfassung":removeAllPOS(["SPACE","PUNCT"]) +} + + +contentStream = processTextstream(content_generator, func=clean_in_content) +metaStream = processDictstream(metadata_generator, funcdict=clean_in_meta) + + +corpus.add_texts(contentStream,metaStream) +print(corpus[0].text) +printRandomDoc(corpus) + + -custom_words = ["grüßen", "fragen"] - -####################'####################'####################'####################'####################'############## - - -## files to textacy-corpus -textacyCorpus = textacy.Corpus(PARSER) - -print("add texts to textacy-corpus...") -textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH)) - - -#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize): -# textacyCorpus.add_text(txt,dic) -for doc in textacyCorpus: - print(doc.metadata) - print(doc.text) - -#print(textacyCorpus[2].text) -#printRandomDoc(textacyCorpus) -#print(textacyCorpus[len(textacyCorpus)-1].text) -print() -print()
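
A note on the new config-driven startup, as a minimal sketch rather than part of the patch: preprocessing.py now reads config.ini through configparser and derives the spaCy parser, the textacy corpus and the thesaurus from it. The snippet mirrors that startup path; textacy.fileio.read_csv is the call used in the patch (from the textacy releases current at the time), and the getboolean() line is a suggested way to read the boolean option, since config.get() always returns strings and the string "False" would otherwise be truthy.

import configparser

import spacy
import textacy

# Read config.ini the same way the patch does.
config = configparser.ConfigParser()
with open("config.ini") as f:
    config.read_file(f)

PARSER = spacy.load(config.get("default", "language"))   # "de" in config.ini
corpus = textacy.Corpus(PARSER)

# Thesaurus rows are semicolon-separated synonym blocks (openthesaurus.csv).
THESAURUS = list(textacy.fileio.read_csv(config.get("default", "thesauruspath"),
                                         delimiter=";"))

# Options from the [preprocessing] section.
ents = config.get("preprocessing", "ents").split(",")                  # WORK_OF_ART,ORG,PRODUCT,LOC
custom_words = config.get("preprocessing", "custom_words").split(",")  # grüßen, fragen

# Suggested: read flags with getboolean(), because get() returns the string "False".
return_first_syn = config.getboolean("preprocessing", "default_return_first_Syn")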
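
The cleaning pipeline is assembled with the compose() helper. Because compose2(f, g) returns f(g(x)), the right-most argument is applied first: in clean_in_content, keepOnlyPOS(['NOUN']) runs before removeWords() and the replace*() steps, and doc2String runs last, which is why processTextstream() hands plain strings to corpus.add_texts(). A toy sketch of the evaluation order (the two stand-in functions are made up for illustration):

import functools

def compose(*functions):
    # Same helper as in preprocessing.py: compose(f, g, h)(x) == f(g(h(x))).
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)

# Stand-ins that make the ordering visible; not the real cleaning steps.
strip_digits = lambda s: "".join(c for c in s if not c.isdigit())
upper = lambda s: s.upper()

pipeline = compose(upper, strip_digits)  # strip_digits runs first, upper runs last
print(pipeline("ticket 4711"))           # -> "TICKET "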
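
How the synonym normalisation behaves, worked through on two rows visible in the openthesaurus.csv diff: getFirstSynonym() scans each semicolon-separated synonym block for the word and, on a hit, getHauptform() prefers the entry flagged "(Hauptform)"; without such a flag it returns the word unchanged unless default_return_first_Syn is set, in which case the first synonym of the block is used. The standalone sketch below reimplements that lookup compactly; the inlined rows are copied from the diff.

import re

def first_synonym(word, thesaurus, return_first_syn=False):
    # Mirrors getFirstSynonym(): find the first block that contains the word.
    word = word.lower()
    for block in thesaurus:                # one row of openthesaurus.csv
        for syn in block:
            syn = syn.lower()
            # exact match for single-word entries, substring match for phrases
            hit = (word == syn) if re.match(r'\A[\w-]+\Z', syn) else (word in syn)
            if hit:
                return hauptform(block, word, return_first_syn)
    return word

def hauptform(block, word, return_first_syn=False):
    # Mirrors getHauptform(): prefer the entry flagged "(Hauptform)".
    for syn in block:
        syn = syn.lower()
        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w
    if return_first_syn:
        for w in block:
            if not re.match(r'\([^)]+\)', w):
                return w.lower()
    return word

thesaurus = [
    ["TH", "Technische_Universität (Hauptform)", "Technische Hochschule", "TU"],
    ["Fission", "Kernfission", "Kernspaltung", "Atomspaltung"],
]
print(first_synonym("TU", thesaurus))                                   # -> technische_universität
print(first_synonym("Kernspaltung", thesaurus))                         # -> kernspaltung (row has no Hauptform)
print(first_synonym("Kernspaltung", thesaurus, return_first_syn=True))  # -> fission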
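
For reference, a minimal sketch of the ticket XML shape that generateMainTextfromTicketXML() and generateMetadatafromTicketXML() iterate over: a root element containing ticket elements whose children are the fields the code refers to ('Beschreibung', 'Zusammenfassung', 'Loesung'). Only those field tags come from the patch; the root/ticket element names and the sample values are invented for illustration.

import xml.etree.ElementTree as ET

sample = """
<Tickets>
  <Ticket>
    <Zusammenfassung>Kennwort vergessen</Zusammenfassung>
    <Beschreibung>Hallo, ich komme nicht mehr in mein Konto ...</Beschreibung>
    <Loesung>Passwort zurückgesetzt.</Loesung>
  </Ticket>
</Tickets>
"""

root = ET.fromstring(sample)
for ticket in root:
    # main text field -> content stream, everything else -> metadata stream
    text = next(field.text for field in ticket if field.tag == "Beschreibung")
    metadata = {field.tag: field.text for field in ticket if field.tag != "Beschreibung"}
    print(text)
    print(metadata)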