# -*- coding: utf-8 -*-

import csv
import os
import re
import sys
import time
from datetime import datetime

# assumption: the stdlib configparser, aliased to keep the ConfigParser.ConfigParser() call below
import configparser as ConfigParser

import textacy

# project helpers (load_corpus, save_corpus, corpus2Text, corpus2Meta, printRandomDoc,
# load_obj and logprint are expected to come from here)
from miscellaneous import *

csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"


# load config
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


# special characters that words get split on (question: also include ,. ?)
REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]'

# word-frequency dictionary for the spellchecker; filled in main() via load_obj()
WORDS = {}
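
# Illustrative example of what the splitter regex does (input string is just a sketch):
#
#   re.compile(REGEX_SPECIALCHAR).split("foo(bar)_baz")   # -> ['foo', 'bar', '', 'baz']
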
########################## Spellchecking ##########################################
# Norvig-style spelling corrector: http://norvig.com/spell-correct.html
# word-frequency data: http://wortschatz.uni-leipzig.de/en/download


def words(text):
    return re.findall(r'\w+', text.lower())


def P(word):
    "Probability of `word`."
    # N is computed at call time so that WORDS can be loaded after import
    N = sum(WORDS.values())
    return WORDS[word] / N


def correction(word):
    "Most probable spelling correction for `word`."
    return max(candidates(word), key=P)


def candidates(word):
    "Generate possible spelling corrections for `word`."
    return known([word]) or known(edits1(word)) or known(edits2(word)) or [word]


def known(words):
    "The subset of `words` that appear in the dictionary WORDS."
    return set(w for w in words if w in WORDS)


def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))


def autocorrectWord(word):
    "Return the most probable correction, or the word itself if correction fails."
    try:
        return correction(word)
    except Exception:
        return word
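
# Minimal usage sketch (assumes the pickled frequency dict has already been loaded into WORDS,
# e.g. via WORDS = load_obj(path2wordsdict)); the actual corrections depend on that dictionary:
#
#   autocorrectWord("adrresse")   # -> "adresse" if that form is frequent in WORDS
#   autocorrectWord("xyzzy")      # -> "xyzzy" (unchanged when no known candidate exists)
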
############# stringcleaning #############


def clean(stringstream, autocorrect=False):

    for string in stringstream:
        # fix unicode
        string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')

        # separate words on special characters (question: also on ,.?! ?)
        string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))

        # normalize whitespace
        string = textacy.preprocess.normalize_whitespace(string)

        # remove linebreaks
        string = re.sub(r'[\n]', " ", string)

        # replace umlauts and sharp s ("rock dots")
        string = re.sub(r'[ß]', "ss", string)
        string = re.sub(r'[ö]', "oe", string)
        string = re.sub(r'[ü]', "ue", string)
        string = re.sub(r'[ä]', "ae", string)

        # question: autocorrect here?
        # idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
        if autocorrect:
            string = " ".join([autocorrectWord(word) for word in string.split()])

        yield string
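
# Minimal sketch of the generator in isolation (hypothetical inputs):
#
#   for s in clean(["Grüße,\nbitte   prüfen"]):
#       print(s)   # lowercased, umlauts rewritten, linebreaks and extra whitespace removed
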
def processDictstream(dictstream, funcdict, parser):
    """
    Apply per-key token filters to the metadata dicts of a corpus.

    :param dictstream: generator of dicts
    :param funcdict: mapping of key -> list of token filter functions, e.g.
        clean_in_meta = {
            "Solution": funclist,
            ...
        }
    :param parser: spacy parser
    :return: generator of dicts
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():

            if key in funcdict:
                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]

                tokens = filterTokens(tokens, funclist)

                result[key] = " ".join([tok.lower_ for tok in tokens])

            else:
                result[key] = value
        yield result


def filterTokens(tokens, funclist):
    # in: token list, list of filter functions
    # out: token list with all filters applied
    for f in funclist:
        tokens = list(filter(f, tokens))

    return tokens


def removePOS(pos_list):
    # returns a token filter that keeps only tokens whose POS tag is not in pos_list
    return lambda tok: tok.pos_ not in pos_list
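
# Sketch of how these compose (hypothetical spaCy doc):
#
#   toks = filterTokens(list(parser("Hallo ,  Welt !")), [removePOS(["SPACE", "PUNCT"])])
#   " ".join(tok.lower_ for tok in toks)   # -> "hallo welt"
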
##################################################################################################


path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")

corpus_de_path = FILEPATH + config.get("de_corpus", "path")

corpus_en_path = FILEPATH + config.get("en_corpus", "path")
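
# The config.ini is expected to provide these entries (illustrative values, not from the source):
#
#   [spellchecking]
#   pickle_file = words_dict.pkl
#
#   [de_corpus]
#   path = corpi/
#
#   [en_corpus]
#   path = corpi/
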
def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):

    logprint("Clean {0}_corpus at {1}".format(lang, datetime.now()))

    rawCorpus_name = lang + "_raw_ticket"
    cleanCorpus_name = lang + "_clean_ticket"

    # load the raw corpus and create a new one
    raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path)

    clean_corpus = textacy.Corpus(parser)

    # process texts and metadata and add them to the textacy corpus
    clean_corpus.add_texts(
        clean(corpus2Text(raw_corpus)),
        processDictstream(corpus2Meta(raw_corpus), clean_in_meta, parser=parser)
    )

    # drop empty docs from the corpus
    clean_corpus.remove(lambda doc: len(doc) == 0)

    # print a few random docs for inspection
    for i in range(printrandom):
        printRandomDoc(clean_corpus)

    # save corpus
    save_corpus(corpus=clean_corpus, corpus_path=corpus_path, corpus_name=cleanCorpus_name)

    return clean_corpus


def main():
    start = time.time()

    # populate the module-level word-frequency dict used by the spellchecker
    global WORDS
    WORDS = load_obj(path2wordsdict)

    clean_in_content = []  # question: is this needed?

    clean_in_meta = {
        "Solution": [removePOS(["SPACE"])],
        "Subject": [removePOS(["SPACE", "PUNCT"])],
        "categoryName": [removePOS(["SPACE", "PUNCT"])]
    }

    corpus = cleanCorpus(corpus_de_path, clean_in_content, clean_in_meta, "de", printrandom=5)

    end = time.time()
    logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()