# topicModelingTickets/preprocessing.py
# -*- coding: utf-8 -*-
import csv
import random
import re
import spacy
import textacy
import sys
import xml.etree.ElementTree as ET
"""
import keras
import numpy as np
from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
from keras.models import Sequential
import keras.backend as K
"""
csv.field_size_limit(sys.maxsize)
"""
def getFirstSynonym(word, thesaurus_gen):
word = word.lower()
# TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
# durch den thesaurrus iterieren
for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen
# durch den synonymblock iterieren
for syn in syn_block:
syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. sätze zu identifieziren)
# falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist)
if word in syn:
# Hauptform suchen
if "auptform" in syn:
# nicht ausgeben, falls es in Klammern steht
for w in syn:
if not re.match(r'\([^)]+\)', w) and w is not None:
return w
# falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
if len(syn) == 1:
w = syn[0]
if not re.match(r'\([^)]+\)', w) and w is not None:
return w
return word # zur Not die eingabe ausgeben
"""
"""
def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):
# use preprocessing
if customPreprocessing is not None:
string = customPreprocessing(string)
if custom_stopwords is not None:
custom_stopwords = custom_stopwords
else:
custom_stopwords = []
if custom_words is not None:
custom_words = custom_words
else:
custom_words = []
if custom_symbols is not None:
custom_symbols = custom_symbols
else:
custom_symbols = []
# custom stoplist
# https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS
stoplist =list(stop_words) + custom_stopwords
# List of symbols we don't care about either
symbols = ["-----","---","...","","",".","-","<",">",",","?","!","..","nt","n't","|","||",";",":","","s","'s",".","(",")","[","]","#"] + custom_symbols
# get rid of newlines
string = string.strip().replace("\n", " ").replace("\r", " ")
# replace twitter
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
string = mentionFinder.sub("MENTION", string)
# replace emails
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
string = emailFinder.sub("EMAIL", string)
# replace urls
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
string = urlFinder.sub("URL", string)
# replace HTML symbols
string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
# parse with spaCy
spacy_doc = PARSER(string)
tokens = []
added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ in added_POS:
if lemmatize:
tokens.append(tok.lemma_.lower().strip())
else:
tokens.append(tok.text.lower().strip())
# add entities
if tok.ent_type_ in added_entities:
tokens.append(tok.text.lower())
# remove stopwords
tokens = [tok for tok in tokens if tok not in stoplist]
# remove symbols
tokens = [tok for tok in tokens if tok not in symbols]
# remove custom_words
tokens = [tok for tok in tokens if tok not in custom_words]
# remove single characters
tokens = [tok for tok in tokens if len(tok)>1]
# remove large strings of whitespace
remove_large_strings_of_whitespace(" ".join(tokens))
#idee abkürzungen auflösen (v.a. TU -> Technische Universität)
if normalize_synonyms:
tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
return " ".join(tokens)
def remove_large_strings_of_whitespace(sentence):
whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE)
sentence = whitespaceFinder.sub(" ", sentence)
tokenlist = sentence.split(" ")
while "" in tokenlist:
tokenlist.remove("")
while " " in tokenlist:
tokenlist.remove(" ")
return " ".join(tokenlist)
"""
"""
def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False):
import xml.etree.ElementTree as ET
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for ticket in root:
metadata = {}
text = "ERROR"
for field in ticket:
if field.tag == textfield:
if clean:
text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize)
else:
text = field.text
else:
#idee hier auch cleanen?
metadata[field.tag] = field.text
yield text, metadata
"""

LANGUAGE = 'de'
PARSER = spacy.load(LANGUAGE)


def generateTextfromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    """
    for subject in root.iter(textfield):
        if clean:
            yield cleanText(subject.text)
        else:
            yield subject.text
    """

    for ticket in root:
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleanText_words(field.text, normalize_synonyms=normalize_Synonyms, lemmatize=lemmatize)
                else:
                    text = field.text
        yield text


def generateMetadatafromXML(path2xml, textfield='Beschreibung'):  # ,keys_to_clean=["Loesung","Zusammenfassung"]
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    """
    metadata = dict.fromkeys(keys)

    for ticket in root.findall('ticket'):
        for key in metadata:
            metadata[key] = ticket.find(key).text
        yield metadata
    """

    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag != textfield:
                if field.tag == "Zusammenfassung":
                    # idea: for "Loesung" only strip whitespace, but also strip symbols from "Zusammenfassung"
                    metadata[field.tag] = cleanText_symbols(field.text)
                elif field.tag == "Loesung":
                    metadata[field.tag] = remove_whitespace(field.text)
                else:
                    metadata[field.tag] = field.text
        yield metadata
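
# Illustrative sketch of the ticket XML layout the two generators above assume
# (the real ticketSamples.xml may contain more fields; only 'Beschreibung',
# 'Zusammenfassung' and 'Loesung' are referenced in this file, and the root
# element name is an assumption):
#
#   <tickets>
#     <ticket>
#       <Beschreibung>Der Drucker im Pool druckt nicht ...</Beschreibung>
#       <Zusammenfassung>Druckerproblem</Zusammenfassung>
#       <Loesung>Treiber neu installiert</Loesung>
#     </ticket>
#   </tickets>
#
# generateTextfromXML yields the (optionally cleaned) 'Beschreibung' text per ticket;
# generateMetadatafromXML yields a dict of all remaining fields per ticket.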


def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):
    """
    https://spacy.io/docs/usage/pos-tagging

    Removes PUNCT, NUM, whitespace and newline tokens from the text, as well as the following list of symbols:
    ["-----","---","...","","",".","-","<",">",",","?","!","..","nt","n't","|","||",";",":","","s","'s",".","(",")","[","]","#"]
    """
    custom_symbols = custom_symbols if custom_symbols is not None else []
    keep = keep if keep is not None else []

    # list of symbols we don't care about
    symbols = ["-----","---","...","","",".","-","<",">",",","?","!","..","nt","n't","|","||",";",":","","s","'s",".","(",")","[","]","#"] + custom_symbols

    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    # POS tags to drop; anything listed in `keep` is spared
    pos = ["NUM", "SPACE", "PUNCT"]
    for p in keep:
        pos.remove(p)

    # keep every token whose POS tag and surface form are not filtered out
    for tok in spacy_doc:
        if tok.pos_ not in pos and tok.text not in symbols:
            tokens.append(tok.text)

    return " ".join(tokens)


def cleanText_words(string, parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):

    # run the preprocessing step first (symbol removal by default)
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    custom_stopwords = custom_stopwords if custom_stopwords is not None else []
    custom_words = custom_words if custom_words is not None else []

    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS
    stoplist = list(stop_words) + custom_stopwords

    # replace Twitter mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace e-mail addresses
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace URLs
    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    string = urlFinder.sub("URL", string)

    # replace HTML entities
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    added_entities = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
    added_POS = ["NOUN"]  # , "NUM"]  # ,"VERB","ADJ"]  # only nouns for topic modeling http://aclweb.org/anthology/U15-1013

    # collect tokens with the wanted POS tags
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            if lemmatize:
                tokens.append(tok.lemma_.lower().strip())
            else:
                tokens.append(tok.text.lower().strip())

        # add named entities of the wanted types
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())

    # remove stopwords
    tokens = [tok for tok in tokens if tok not in stoplist]

    # remove custom_words
    tokens = [tok for tok in tokens if tok not in custom_words]

    # remove single characters
    tokens = [tok for tok in tokens if len(tok) > 1]

    # remove large strings of whitespace
    # remove_whitespace(" ".join(tokens))

    # idea: expand abbreviations (especially TU -> Technische Universität)

    if normalize_synonyms:
        tokens = [str(getFirstSynonym(tok, THESAURUS_list)) for tok in tokens]

    return " ".join(set(tokens))


def remove_whitespace(sentence):
    whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
    sentence = whitespaceFinder.sub(" ", sentence)
    return sentence
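
# Example: remove_whitespace("a  b\r\nc") collapses every whitespace run (including
# newlines) to a single space and returns "a b c".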


def getFirstSynonym(word, thesaurus_gen):

    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms

        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
                if word == syn:
                    return getHauptform(syn_block, word)
            else:  # if it is a phrase
                if word in syn:
                    return getHauptform(syn_block, word)
    return word  # as a last resort, return the original word


def getHauptform(syn_block, word, default_return_first_Syn=False):

    for syn in syn_block:
        syn = syn.lower()
        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # don't return the part that is in parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no Hauptform is present, return the first synonym that is not a phrase and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w

    return word  # as a last resort, return the original word
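
# Sketch of the thesaurus lookup (illustrative; assumes THESAURUS_list rows look like
# the ';'-separated OpenThesaurus export, where the preferred term carries a
# "(Hauptform)" marker -- the sample row below is made up):
#
#   syn_block = ["rechner", "computer (Hauptform)", "elektronische datenverarbeitungsanlage"]
#   getFirstSynonym("rechner", [syn_block])
#   # -> "computer" (the word next to the "(Hauptform)" marker is returned)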


def printRandomDoc(textacyCorpus):
    print()
    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()


####################'####################'####################'####################'####################'##############

import de_core_news_md

DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"

normalize_Synonyms = True
clean = True
lemmatize = True

custom_words = ["grüßen", "fragen"]

####################'####################'####################'####################'####################'##############

# PARSER = de_core_news_md.load()
THESAURUS_list = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))  # !!! list() is important: otherwise the generator gets consumed at runtime and the same synonyms are no longer returned

## files to textacy-corpus
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpus...")
textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromXML(DATAPATH))

# for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize):
#     textacyCorpus.add_text(txt, dic)

for doc in textacyCorpus:
    print(doc.metadata)
    print(doc.text)

# print(textacyCorpus[2].text)
# printRandomDoc(textacyCorpus)
# print(textacyCorpus[len(textacyCorpus)-1].text)

print()
print()