# -*- coding: utf-8 -*-
import csv
import random
import re
import spacy
import textacy
import sys
import xml.etree.ElementTree as ET

"""
import keras
import numpy as np
from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
from keras.models import Sequential
import keras.backend as K
"""

csv.field_size_limit(sys.maxsize)


"""
def getFirstSynonym(word, thesaurus_gen):

    word = word.lower()
    # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms

        # iterate over the synonym block
        for syn in syn_block:
            # turn multi-word synonyms into a list (to detect phrases)
            syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn

            # if the word is contained in the synonym (i.e. == one word in the list)
            if word in syn:

                # look for the Hauptform (main form)
                if "auptform" in syn:
                    # do not return it if it is in parentheses
                    for w in syn:
                        if not re.match(r'\([^)]+\)', w) and w is not None:
                            return w

                # if no Hauptform is present, return the first synonym that is not a phrase and not in parentheses
                if len(syn) == 1:
                    w = syn[0]
                    if not re.match(r'\([^)]+\)', w) and w is not None:
                        return w

    return word  # as a fallback, return the input word
"""

"""
def cleanText(string, custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):

    # apply custom preprocessing, if any
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    if custom_stopwords is None:
        custom_stopwords = []
    if custom_words is None:
        custom_words = []
    if custom_symbols is None:
        custom_symbols = []

    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS
    stoplist = list(stop_words) + custom_stopwords

    # list of symbols we don't care about either
    symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||", ";", ":", "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + custom_symbols

    # get rid of newlines
    string = string.strip().replace("\n", " ").replace("\r", " ")

    # replace twitter mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace emails
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace urls
    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    string = urlFinder.sub("URL", string)

    # replace HTML entities
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # parse with spaCy
    spacy_doc = PARSER(string)
    tokens = []

    added_entities = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
    added_POS = ["NOUN"]  # , "NUM"]  # ,"VERB","ADJ"]
    # idea: also include NUM in the corpus, but use only nouns for topic modeling http://aclweb.org/anthology/U15-1013

    # append tokens to a list
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            if lemmatize:
                tokens.append(tok.lemma_.lower().strip())
            else:
                tokens.append(tok.text.lower().strip())

        # add entities
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())

    # remove stopwords
    tokens = [tok for tok in tokens if tok not in stoplist]

    # remove symbols
    tokens = [tok for tok in tokens if tok not in symbols]

    # remove custom_words
    tokens = [tok for tok in tokens if tok not in custom_words]

    # remove single characters
    tokens = [tok for tok in tokens if len(tok) > 1]

    # remove large strings of whitespace
    remove_large_strings_of_whitespace(" ".join(tokens))

    # idea: expand abbreviations (esp. TU -> Technische Universität)

    if normalize_synonyms:
        tokens = [str(getFirstSynonym(tok, THESAURUS_list)) for tok in tokens]

    return " ".join(tokens)


def remove_large_strings_of_whitespace(sentence):

    whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE)
    sentence = whitespaceFinder.sub(" ", sentence)

    tokenlist = sentence.split(" ")

    while "" in tokenlist:
        tokenlist.remove("")
    while " " in tokenlist:
        tokenlist.remove(" ")

    return " ".join(tokenlist)
"""


"""
def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleanText_words(field.text, PARSER, normalize_synonyms=normalize_Synonyms, lemmatize=lemmatize)
                else:
                    text = field.text
            else:
                # idea: clean here as well?
                metadata[field.tag] = field.text
        yield text, metadata
"""
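
# The XML generators below assume a ticket file shaped roughly like the sketch here. This is an
# illustration only: 'Beschreibung', 'Zusammenfassung' and 'Loesung' are the tags the code actually
# references, everything else (root tag, extra fields) is an assumption.
#
#   <tickets>
#       <ticket>
#           <Beschreibung>free-text problem description ...</Beschreibung>
#           <Zusammenfassung>short summary ...</Zusammenfassung>
#           <Loesung>resolution text ...</Loesung>
#           <!-- any further child tags end up in the metadata dict -->
#       </ticket>
#       ...
#   </tickets>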

LANGUAGE = 'de'
PARSER = spacy.load(LANGUAGE)


def generateTextfromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    """
    for subject in root.iter(textfield):
        if clean:
            yield cleanText(subject.text)
        else:
            yield subject.text
    """

    for ticket in root:
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleanText_words(field.text, normalize_synonyms=normalize_Synonyms, lemmatize=lemmatize)
                else:
                    text = field.text
        yield text


def generateMetadatafromXML(path2xml, textfield='Beschreibung'):  # ,keys_to_clean=["Loesung","Zusammenfassung"]):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    """
    metadata = dict.fromkeys(keys)

    for ticket in root.findall('ticket'):
        for key in metadata:
            metadata[key] = ticket.find(key).text
        yield metadata
    """

    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag != textfield:
                if field.tag == "Zusammenfassung":
                    # idea: only strip whitespace from the solution, but also strip symbols from the summary
                    metadata[field.tag] = cleanText_symbols(field.text)
                elif field.tag == "Loesung":
                    metadata[field.tag] = remove_whitespace(field.text)
                else:
                    metadata[field.tag] = field.text

        yield metadata


def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):
    """
    https://spacy.io/docs/usage/pos-tagging

    Removes PUNCT, NUM, whitespace, newlines and the following list of symbols from the text:
    ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"]
    """
    if custom_symbols is None:
        custom_symbols = []
    if keep is None:
        keep = []

    # list of symbols we don't care about
    symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||", ";", ":", "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + custom_symbols

    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    pos = ["NUM", "SPACE", "PUNCT"]
    for p in keep:
        pos.remove(p)

    # append tokens to a list
    for tok in spacy_doc:
        if tok.pos_ not in pos and tok.text not in symbols:
            tokens.append(tok.text)

    return " ".join(tokens)
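
# Minimal usage sketch for cleanText_symbols. It is never called in this script, and the sample
# sentence is made up; it only illustrates that numbers, punctuation, stray symbols and extra
# whitespace are dropped before the remaining tokens are re-joined.
def _demo_cleanText_symbols():
    raw = "Hallo,   das Passwort (siehe Ticket #1234) geht nicht!"
    cleaned = cleanText_symbols(raw)
    print(cleaned)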

def cleanText_words(string, parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):

    # apply custom preprocessing, if any
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    if custom_stopwords is None:
        custom_stopwords = []
    if custom_words is None:
        custom_words = []

    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS
    stoplist = list(stop_words) + custom_stopwords

    # replace twitter mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace emails
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace urls
    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    string = urlFinder.sub("URL", string)

    # replace HTML entities
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    added_entities = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
    added_POS = ["NOUN"]  # , "NUM"]  # ,"VERB","ADJ"]
    # only nouns for topic modeling http://aclweb.org/anthology/U15-1013

    # append tokens to a list
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            if lemmatize:
                tokens.append(tok.lemma_.lower().strip())
            else:
                tokens.append(tok.text.lower().strip())

        # add entities
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())

    # remove stopwords
    tokens = [tok for tok in tokens if tok not in stoplist]

    # remove custom_words
    tokens = [tok for tok in tokens if tok not in custom_words]

    # remove single characters
    tokens = [tok for tok in tokens if len(tok) > 1]

    # remove large strings of whitespace
    # remove_whitespace(" ".join(tokens))

    # idea: expand abbreviations (esp. TU -> Technische Universität)

    if normalize_synonyms:
        tokens = [str(getFirstSynonym(tok, THESAURUS_list)) for tok in tokens]

    return " ".join(set(tokens))


def remove_whitespace(sentence):
    whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
    sentence = whitespaceFinder.sub(" ", sentence)
    return sentence


def getFirstSynonym(word, thesaurus_gen):

    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms

        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # syn is a single word
                if word == syn:
                    return getHauptform(syn_block, word)
            else:  # syn is a phrase
                if word in syn:
                    return getHauptform(syn_block, word)

    return word  # as a fallback, return the original word


def getHauptform(syn_block, word, default_return_first_Syn=False):

    for syn in syn_block:
        syn = syn.lower()

        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return it if it is in parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no Hauptform is present, return the first synonym that is not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w

    return word  # as a fallback, return the original word


def printRandomDoc(textacyCorpus):
    print()
    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()


####################'####################'####################'####################'####################'##############

import de_core_news_md

DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"

normalize_Synonyms = True
clean = True
lemmatize = True

custom_words = ["grüßen", "fragen"]

####################'####################'####################'####################'####################'##############

# PARSER = de_core_news_md.load()
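
# Quick sanity-check sketch for the synonym normalization (never called in this script). The rows
# are made up, but they follow the structure the code expects from the thesaurus loaded below:
# one list of synonyms per row, with the preferred term marked as "... (Hauptform)".
def _demo_getFirstSynonym():
    fake_thesaurus = [
        ["Passwort (Hauptform)", "Kennwort", "Zugangscode"],
        ["E-Mail", "Mail", "elektronische Post"],
    ]
    print(getFirstSynonym("kennwort", fake_thesaurus))  # -> "passwort" (resolved via getHauptform)
    print(getFirstSynonym("drucker", fake_thesaurus))   # -> "drucker" (not found, returned unchanged)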

# !! the list() is important: otherwise the generator gets consumed during the run and the same synonyms are not returned
THESAURUS_list = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))


## files to textacy-corpus
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpus...")
textacyCorpus.add_texts(
    texts=generateTextfromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize),
    metadatas=generateMetadatafromXML(DATAPATH)
)

# for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize):
#     textacyCorpus.add_text(txt, dic)


for doc in textacyCorpus:
    print(doc.metadata)
    print(doc.text)

# print(textacyCorpus[2].text)
# printRandomDoc(textacyCorpus)
# print(textacyCorpus[len(textacyCorpus)-1].text)

print()
print()
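
# Hedged sketch of a possible next step: dump the cleaned texts plus metadata to a CSV so the
# topic-modeling experiments mentioned in the comments above can be run on them later. It uses only
# the stdlib csv module and assumes Python 3; the filename is an arbitrary choice, metadata is
# written as its string representation, and the call is left commented out so the behaviour of this
# script does not change.
def saveCorpusToCSV(corpus, path2csv="cleaned_corpus.csv"):
    with open(path2csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=";")
        writer.writerow(["text", "metadata"])
        for doc in corpus:
            writer.writerow([doc.text, doc.metadata])

# saveCorpusToCSV(textacyCorpus)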