# topicModelingTickets/preprocessing.py

# -*- coding: utf-8 -*-
import csv
import random
import re

import spacy
import textacy
import sys


"""
import keras
import numpy as np
from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
from keras.models import Sequential
import keras.backend as K
"""
csv.field_size_limit(sys.maxsize)


def printRandomDoc(textacyCorpus):
    """Print one randomly chosen document of a corpus to stdout.

    The corpus length is printed first, followed by the index, text and
    metadata of a document picked uniformly at random.  For an empty corpus
    only the length is printed, because there is no document to pick.

    Args:
        textacyCorpus: Sized, integer-indexable collection whose items expose
            ``text`` and ``metadata`` attributes.
    """
    corpus_size = len(textacyCorpus)

    print()
    print("len(textacyCorpus) = %i" % corpus_size)

    if corpus_size:
        # randrange() covers every valid index.  Scaling random.random() by
        # (size - 1) and truncating could never reach the last document.
        randIndex = random.randrange(corpus_size)
        doc = textacyCorpus[randIndex]
        print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, doc.text, doc.metadata))

    print()


def getFirstSynonym(word, thesaurus_gen):
    """Return the canonical synonym of ``word`` from a thesaurus.

    The thesaurus is scanned block by block.  In the first block that
    contains ``word`` (as one of the space-separated tokens of any entry)
    the result is chosen in this order:

    1. the first non-parenthesised token of an entry that carries a
       "Hauptform" (main form) marker,
    2. otherwise the first single-word entry that is not in parentheses,
    3. otherwise the lower-cased input word itself.

    All comparisons and return values are lower-cased.

    Args:
        word: The word to look up.
        thesaurus_gen: Iterable of synonym blocks, each block being an
            iterable of synonym strings.  It is iterated directly, so a
            one-shot generator is consumed up to the matching block.

    Returns:
        The selected synonym as a lower-cased string, or ``None`` when no
        block contains ``word``.
    """
    in_parentheses = re.compile(r'\([^)]+\)')

    word = word.lower()
    # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms
        # turn every synonym into a token list (an entry may be a phrase)
        entries = [syn.lower().split(" ") for syn in syn_block]

        # only the block that actually contains the word is of interest
        if not any(word in tokens for tokens in entries):
            continue

        # Prefer the main form.  The marker is matched as a substring of a
        # token ("(hauptform)"); comparing it against whole tokens, as a plain
        # list membership test does, can never succeed.
        for tokens in entries:
            if any("auptform" in token for token in tokens):
                for token in tokens:
                    # skip anything written in parentheses, e.g. the marker
                    if token and not in_parentheses.match(token):
                        return token

        # no main form: first synonym that is a single word and not
        # written in parentheses
        for tokens in entries:
            if len(tokens) == 1 and tokens[0] and not in_parentheses.match(tokens[0]):
                return tokens[0]

        return word  # last resort: hand back the input

    # the word does not occur anywhere in the thesaurus
    return None


# Placeholder patterns, compiled once at import time instead of on every call.
_EMAIL_FINDER = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
# Matches URLs anywhere in the text.  A scheme or a leading "www." is required
# so that ordinary words are not mistaken for URLs.
_URL_FINDER = re.compile(r"(?:https?://|www\.)[^\s<>\"']+", re.IGNORECASE)
_MENTION_FINDER = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)


def cleanText(string, custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False):
    """Reduce a raw text to a space-separated string of content tokens.

    Steps, in order: optional custom preprocessing, newline removal,
    replacement of e-mail addresses, URLs and @-mentions by the placeholders
    ``EMAIL``, ``URL`` and ``MENTION``, decoding of a few HTML entities,
    parsing with the module-level ``PARSER``, selection of tokens by
    part-of-speech / entity type, and removal of stop words, symbols, custom
    words and single-character tokens.

    Relies on the module-level ``PARSER``; its ``lang`` attribute is used to
    locate the matching stop-word list.

    Args:
        string: Text to clean.  ``None`` or an empty string yields ``""``.
        custom_stopwords: Optional iterable of additional stop words.
        custom_symbols: Optional iterable of additional symbols to drop.
        custom_words: Optional iterable of additional words to drop.
        customPreprocessing: Optional callable applied to ``string`` before
            anything else; its return value replaces ``string``.
        lemmatize: If true, part-of-speech selected tokens are added as
            lemmas instead of their surface form.

    Returns:
        The remaining lower-cased tokens joined by single spaces.
    """
    # use preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    # Elements without content deliver None; nothing to parse in that case.
    if not string:
        return ""

    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS

    # List of symbols we don't care about either
    symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"]

    # One set for every kind of unwanted token gives constant-time lookups;
    # all of them are applied as the same "token not in ..." filter.
    excluded = set(stop_words)
    excluded.update(symbols)
    for extra in (custom_stopwords, custom_symbols, custom_words):
        if extra is not None:
            excluded.update(extra)

    # get rid of newlines
    string = string.strip().replace("\n", " ").replace("\r", " ")

    # Replace e-mail addresses first: the mention pattern would otherwise
    # consume the "@domain" part and leave the address unrecognisable.
    string = _EMAIL_FINDER.sub("EMAIL", string)

    # replace urls
    string = _URL_FINDER.sub("URL", string)

    # replace twitter-style mentions
    string = _MENTION_FINDER.sub("MENTION", string)

    # replace HTML symbols
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # parse with spaCy
    spacy_doc = PARSER(string)

    added_entities = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
    # IDEA: keep NUM in the corpus, but use only nouns for topic modeling
    # http://aclweb.org/anthology/U15-1013
    added_POS = ["NOUN", "NUM"]  # ,"VERB","ADJ"]

    # append tokens to a list
    tokens = []
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            surface = tok.lemma_ if lemmatize else tok.text
            tokens.append(surface.lower().strip())

        # add entities
        # NOTE(review): this is a separate `if`, so a token selected by its
        # part of speech that is also a listed entity is appended twice --
        # confirm whether that weighting is intended.
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())

    # remove stop words, symbols, custom words, single characters and
    # whitespace-only tokens in a single pass
    tokens = [tok for tok in tokens if len(tok) > 1 and tok.strip() and tok not in excluded]

    # TODO: integrate the thesaurus here?

    return " ".join(tokens)


def generateTextfromXML(path2xml, clean=True, field='Beschreibung'):
    """Yield the text of every ``field`` element found in an XML file.

    The whole tree is searched, so matching elements at any depth are
    returned in document order.  Elements without text content yield an
    empty string rather than ``None``; they are not skipped, so the number
    of yielded items always equals the number of matching elements.

    Args:
        path2xml: Path of the XML file to parse (read as UTF-8).
        clean: If true, each text is passed through ``cleanText`` first.
        field: Tag name of the elements whose text is wanted.

    Yields:
        One string per matching element.
    """
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))

    for element in tree.getroot().iter(field):
        # Element.text is None for an element without content; downstream
        # string handling needs a real string.
        text = element.text or ""
        yield cleanText(text) if clean else text

def generateMetadatafromXML(path2xml, keys=("Loesung", "Kategorie", "Zusammenfassung")):
    """Yield one metadata dict per ``ticket`` element of an XML file.

    Only ``ticket`` elements that are direct children of the root are
    visited.  Every yielded dict is a new object, so collecting the results
    (for example with ``list``) keeps each ticket's own values.

    Args:
        path2xml: Path of the XML file to parse (read as UTF-8).
        keys: Tag names of the ticket children to extract.

    Yields:
        A dict mapping each key to the text of the first child element with
        that tag, or to ``None`` when the child is missing or has no text.
    """
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))

    # materialise once so that a one-shot iterable works for every ticket
    keys = tuple(keys)

    for ticket in tree.getroot().findall('ticket'):
        # Build a new dict for each ticket: re-using one dict would make all
        # previously yielded records change along with it.
        metadata = {}
        for key in keys:
            node = ticket.find(key)
            # TODO: integrate the thesaurus here?
            metadata[key] = node.text if node is not None else None

        yield metadata


####################'####################'####################'####################'####################'##############


# Input files, given as relative paths: the ticket export (XML) and the
# ';'-separated thesaurus file.
DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"

# Language identifier handed to spacy.load() below.
LANGUAGE = 'de'


####################'####################'####################'####################'####################'##############

# spaCy pipeline for LANGUAGE.  cleanText() reads this module-level name, so it
# must be bound before the corpus is filled further down.
PARSER = spacy.load(LANGUAGE)
# Rows of the thesaurus file.  Not referenced by any other statement in this
# section.  NOTE(review): per the inline note this is a generator and would
# then be usable only once -- confirm before passing it to repeated lookups.
THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")  # generator [[a,b,c,..],[a,b,c,..],...]


## load the ticket file into a textacy corpus
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpus...")
# Texts and metadata come from two independent passes over the same XML file.
# NOTE(review): presumably add_texts() pairs them by position, which requires
# both generators to yield the same number of items -- verify.
textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH), metadatas=generateMetadatafromXML(DATAPATH))


#printRandomDoc(textacyCorpus)

# Print the text of the last document as a quick check.
# NOTE(review): the index is -1 for an empty corpus -- confirm how
# textacy.Corpus handles that.
print(textacyCorpus[len(textacyCorpus)-1].text)