topicModelingTickets/miscellaneous.py

# -*- coding: utf-8 -*-
from datetime import datetime
import configparser as ConfigParser
import csv
import functools
import logging
import random
import re
import sys
from pathlib import Path
import pickle
import spacy
import textacy
from scipy import *
import os
import glob, os
from textacy.fileio import open_sesame
import json
from spacy.tokens.doc import Doc as SpacyDoc
import operator

# Raise the csv module's per-field size limit to the largest value sys reports.
csv.field_size_limit(sys.maxsize)
# Directory that contains this module, always with a trailing slash.
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"


# load config: config.ini is read from the directory of this module
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


# config logging: file name and level both come from the [logging] section
filename = FILEPATH + config.get("logging","filename")
level = config.get("logging","level")
# Translate the three known level names to their logging constants; any other
# value is handed to basicConfig exactly as it was read from the config.
level = {
    "INFO": logging.INFO,
    "DEBUG": logging.DEBUG,
    "WARNING": logging.WARNING,
}.get(level, level)
logging.basicConfig(filename=filename, level=level)


def logprint(string, level="INFO"):
    """Print a timestamped message and forward it to the root logger.

    The message is prefixed with ``datetime.now()`` and a tab, printed to
    stdout, and then logged with the module-level ``logging`` function that
    matches ``level``.

    :param string: object to report; converted with ``str()``
    :param level: str, one of "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL".
                  Any other value is printed only and not logged.
    """
    string = "{}\t".format(datetime.now()) + str(string)
    print(string)

    # Dispatch table instead of an if/elif chain; ERROR and CRITICAL used to be
    # dropped from the log file without any notice.
    emitters = {
        "DEBUG": logging.debug,
        "INFO": logging.info,
        "WARNING": logging.warning,
        "ERROR": logging.error,
        "CRITICAL": logging.critical,
    }
    emit = emitters.get(level)
    if emit is not None:
        emit(string)


def compose(*functions):
    """Build the right-to-left composition of ``functions``.

    ``compose(f, g, h)(x)`` evaluates ``f(g(h(x)))``; with no functions the
    result is the identity.

    :param functions: one-argument callables
    :return: one-argument callable
    """
    # Feed the value through the functions starting with the last one given.
    return lambda x: functools.reduce(
        lambda value, step: step(value), reversed(functions), x
    )


def get_calling_function():
    """Return the function object of the caller, in many common cases.

    The caller's frame and code object are taken from ``sys._getframe(1)``.
    Several places where the function object may be reachable are tried in
    order; the first candidate whose ``__code__`` equals the caller's code
    object is returned.

    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name

    :return: the calling function / bound method
    :raises AttributeError: if no candidate matches
    """
    fr = sys._getframe(1)  # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],  # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            # This lookup location does not exist for the current caller.
            continue
        # A candidate may be any object (e.g. a local called ``f`` holding a
        # file); objects without ``__code__`` are skipped so the remaining
        # locations are still tried.
        if getattr(func, "__code__", None) == co:
            return func
    raise AttributeError("func not found")


def save_obj(obj, path):
    """Pickle ``obj`` into the file at ``path`` using the highest protocol.

    :param obj: any picklable object
    :param path: str, target file (created or overwritten)
    """
    with open(path, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    """Read back an object that was pickled into the file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.

    :param path: str, pickle file to read
    :return: the unpickled object
    """
    with open(path, mode='rb') as source:
        restored = pickle.load(source)
    return restored


def replaceRockDots_lambda():
    """Return a function that transliterates German umlauts and sharp s.

    The returned callable takes a str and replaces ä/ö/ü with ae/oe/ue,
    Ä/Ö/Ü with Ae/Oe/Ue and ß with ss.

    :return: callable(str) -> str
    """
    # Applied one after another, first entry first.
    steps = (
        (r'[Ä]', "Ae"),
        (r'[ä]', "ae"),
        (r'[Ü]', "Ue"),
        (r'[ü]', "ue"),
        (r'[Ö]', "Oe"),
        (r'[ö]', "oe"),
        (r'[ß]', "ss"),
    )
    return lambda string: functools.reduce(
        lambda text, step: re.sub(step[0], step[1], text), steps, string
    )

def replaceRockDots(string):
    """Transliterate German umlauts and sharp s into ASCII digraphs.

    ä/ö/ü become ae/oe/ue, Ä/Ö/Ü become Ae/Oe/Ue and ß becomes ss; all other
    characters are kept.

    :param string: str
    :return: str
    """
    # Applied one after another, first entry first.
    steps = (
        (r'[Ä]', "Ae"),
        (r'[ä]', "ae"),
        (r'[Ü]', "Ue"),
        (r'[ü]', "ue"),
        (r'[Ö]', "Oe"),
        (r'[ö]', "oe"),
        (r'[ß]', "ss"),
    )
    for pattern, digraph in steps:
        string = re.sub(pattern, digraph, string)
    return string


def list_from_files(*paths):
    """
    Collect the lines of one or more text files into a single list.

    Each file is expected to hold one entry per line, like
        n1
        n2
        n3

    :param paths: str, one or more file paths (passed as separate arguments)
    :return: list(str), the lines of all files in the given order, each passed
             through textacy.preprocess.normalize_whitespace
    """
    collected = []
    for path in paths:
        # merge the lines of every file into one flat list
        collected.extend(textacy.fileio.read_file_lines(path))

    return [textacy.preprocess.normalize_whitespace(entry) for entry in collected]

def breakpoint():
    """Do nothing and return None.

    NOTE(review): presumably a convenient line to attach a debugger breakpoint
    to. Inside this module the name hides the builtin ``breakpoint()`` that
    exists since Python 3.7.
    """
    return None

def sort_dictionary(dict):
    """Return the items of a dictionary ordered by value, smallest first.

    :param dict: mapping whose values can be compared with each other
    :return: list of (key, value) tuples
    """
    pairs = list(dict.items())
    pairs.sort(key=lambda pair: pair[1])
    return pairs


def normalize_str(string):
    """
    Run a string through replaceRockDots and then through
    textacy.preprocess.normalize_whitespace.

    :param string: str
    :return: str
    """
    transliterated = replaceRockDots(string)
    return textacy.preprocess.normalize_whitespace(transliterated)


def deprecated(func):
    """Decorator that marks a function as deprecated.

    Every call to the decorated function emits a ``DeprecationWarning`` that
    points at the caller (``stacklevel=2``) and then runs the function
    unchanged.

    :param func: callable to wrap
    :return: wrapper with the metadata of ``func`` (via ``functools.wraps``)
    """
    # Imported locally so the decorator does not rely on a module-level
    # ``import warnings``.
    import warnings

    @functools.wraps(func)
    def new_func(*args, **kwargs):
        # Show the warning on every call, but only inside this context: the
        # filter configuration of the process is restored on exit instead of
        # being overwritten with 'default'.
        with warnings.catch_warnings():
            warnings.simplefilter('always', DeprecationWarning)
            warnings.warn("Call to deprecated function {}.".format(func.__name__), category=DeprecationWarning, stacklevel=2)
        return func(*args, **kwargs)

    return new_func

def flatten(liste):
    """Flatten an iterable of iterables by exactly one level.

    :param liste: iterable of iterables
    :return: list with the items of all inner iterables, in order
    """
    merged = []
    for inner in liste:
        merged.extend(inner)
    return merged


def printRandomDoc(textacyCorpus):
    """
    Print and log one randomly chosen document of a textacy-Corpus.

    Reports the index, the text and the 'categoryName' metadata entry of the
    chosen document via logprint, framed by two blank lines on stdout. An empty
    corpus is reported as "NO DOCS IN CORPUS".

    :param textacyCorpus: textacy-Corpus (needs len() and integer indexing)
    """
    print()
    if len(textacyCorpus) == 0:
        logprint("NO DOCS IN CORPUS")
    else:
        # random.random() lies in [0, 1), so the factor has to be len (not
        # len - 1); otherwise the last document could never be selected.
        randIndex = int(len(textacyCorpus) * random.random())
        doc = textacyCorpus[randIndex]
        logprint("Index: {0} \n Text: {1} \n categoryName: {2}\n".format(randIndex, doc.text,
                                                                         doc.metadata['categoryName']))

    print()

def get_list_from_config(section,option):
    """Read a comma-separated option from the module-level config as a list.

    :param section: str, section name in config.ini
    :param option: str, option name inside that section
    :return: list(str), the comma-separated parts, each passed through
             textacy.preprocess.normalize_whitespace
    """
    raw_value = config.get(section, option)
    return [textacy.preprocess.normalize_whitespace(part) for part in raw_value.split(",")]

def corpus2Text(corpus):
    """Lazily yield the ``text`` attribute of every document in ``corpus``.

    :param corpus: iterable of documents
    """
    yield from (document.text for document in corpus)

def corpus2Meta(corpus):
    """Lazily yield the ``metadata`` attribute of every document in ``corpus``.

    :param corpus: iterable of documents
    """
    yield from (document.metadata for document in corpus)

def savelabledCorpiLines_cat(corpus, filepath):
    """Write the labelled lines of ``corpus`` to ``filepath``.

    The lines come from gen_labledLines with its default label.

    :param corpus: iterable of documents
    :param filepath: str, file to write
    """
    labelled_lines = gen_labledLines(corpus)
    textacy.fileio.write_file_lines(labelled_lines, filepath=filepath)


def gen_labledLines(corpus, label ="categoryName"):
    """Yield one ``[<label value>] <text>`` line per document.

    :param corpus: iterable of documents with ``metadata`` and ``text``
    :param label: str, metadata key whose value goes into the brackets
    """
    for document in corpus:
        # generate "[topic] tok1 tok2 tok3" out of the corpus
        pieces = ("[", document.metadata[label], "] ", document.text)
        yield "".join(pieces)


def save_corpus(corpus, corpus_path, corpus_name):
    """
    Save a textacy-Corpus together with its spacy parser.

    The parser is written to ``<corpus_path><lang>_parser`` and the documents
    to ``<corpus_path><corpus_name>_content.json``.

    :param corpus: textacy-Corpus
    :param corpus_path: str, path prefix the file names are appended to
    :param corpus_name: str (should contain the language, like "_de_")
    """

    # parser first
    language_model = corpus.spacy_lang
    parser_target = corpus_path + str(language_model.lang) + '_parser'
    language_model.save_to_directory(parser_target)

    # then plain content + metadata, as produced by gen_dicts
    content_target = corpus_path + corpus_name + "_content.json"
    textacy.fileio.write_json_lines(gen_dicts(corpus), content_target)


def gen_dicts(corpus):
    """Yield one flat dict per document: index, content and all metadata.

    :param corpus: iterable of documents with ``corpus_index``, ``text`` and
                   ``metadata``
    """
    for document in corpus:
        record = dict(index=document.corpus_index, content=document.text)
        # metadata is merged last, so on a key clash its value replaces
        # "index" / "content"
        record.update(document.metadata)
        yield record


def multisub(subs, subject):
    """Simultaneously perform all substitutions on the subject string.

    All patterns are matched literally and in one pass, so the output of one
    substitution is never rewritten by another.

    https://stackoverflow.com/questions/764360/a-list-of-string-replacements-in-python

    :param subs: iterable of (pattern, replacement) string pairs
    :param subject: str to work on
    :return: str; ``subject`` unchanged when ``subs`` is empty
    """
    # subs is walked twice below, so a one-shot iterator has to be stored first
    subs = list(subs)
    if not subs:
        # an empty alternation would match everywhere without any group
        return subject
    # one capture group per pattern; the group that matched selects the replacement
    pattern = '|'.join('(%s)' % re.escape(p) for p, s in subs)
    substs = [s for p, s in subs]
    replace = lambda m: substs[m.lastindex - 1]
    return re.sub(pattern, replace, subject)

def load_corpus(corpus_path, corpus_name, lang="de"):
    """
    Load a textacy-Corpus together with its spacy parser from files.

    Reads the parser vocabulary from ``<corpus_path><lang>_parser/vocab/`` and
    the documents from ``<corpus_path><corpus_name>_content.json``.

    :param corpus_path: str, path prefix the file names are appended to
    :param corpus_name: str (should contain the language, like "_de_")
    :param lang: str (language code), used when the name has no language marker
    :return: textacy.Corpus, spacy.language
    """

    # check for language: a marker in the corpus name wins over ``lang``
    for marker, code in (("de_", "de"), ("en_", "en")):
        if marker in corpus_name:
            lang = code
            break

    # load parser and restore its stored vocabulary
    parser = spacy.load(lang)
    parser_dir = corpus_path + str(lang) + '_parser'

    with open(parser_dir + '/vocab/strings.json') as strings_file:
        parser.vocab.strings.load(strings_file)

    parser.vocab.load_lexemes(Path(parser_dir + '/vocab/lexemes.bin'))

    # load corpus
    corpus = textacy.Corpus(parser)

    content_file = corpus_path + corpus_name + "_content.json"
    for plain in textacy.fileio.read_json_lines(content_file):
        # everything except the text and the stored index is document metadata
        meta = {key: value for key, value in plain.items() if key not in ("content", "index")}
        corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))

    return corpus, corpus.spacy_lang
aufgeräumt 2017-10-16 14:01:38 +02:00			`# -- coding: utf-8 --`
last commit. zu verworren geworden neue version ist IMTC_TopicModeling 2017-12-19 17:12:35 +01:00			`from datetime import datetime`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`import configparser as ConfigParser`
aufgeräumt 2017-10-16 14:01:38 +02:00			`import csv`
			`import functools`
			`import logging`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`import random`
aufgeräumt 2017-10-16 14:01:38 +02:00			`import re`
			`import sys`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`from pathlib import Path`
			`import pickle`
aufgeräumt 2017-10-16 14:01:38 +02:00			`import spacy`
			`import textacy`
			`from scipy import *`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`import os`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`import glob, os`
			`from textacy.fileio import open_sesame`
			`import json`
			`from spacy.tokens.doc import Doc as SpacyDoc`
. 2017-11-29 16:31:30 +01:00			`import operator`
aufgeräumt 2017-10-16 14:01:38 +02:00
			`csv.field_size_limit(sys.maxsize)`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"`
aufgeräumt 2017-10-16 14:01:38 +02:00


			`# load config`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`config_ini = FILEPATH + "config.ini"`
aufgeräumt 2017-10-16 14:01:38 +02:00
			`config = ConfigParser.ConfigParser()`
			`with open(config_ini) as f:`
			`config.read_file(f)`



			`# config logging`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`filename = FILEPATH + config.get("logging","filename")`
aufgeräumt 2017-10-16 14:01:38 +02:00			`level = config.get("logging","level")`
			`if level == "INFO":`
			`level = logging.INFO`
			`elif level == "DEBUG":`
			`level = logging.DEBUG`
			`elif level == "WARNING":`
			`level = logging.WARNING`
			`logging.basicConfig(filename=filename, level=level)`



topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`def logprint(string, level="INFO"):`
aufgeräumt 2017-10-16 14:01:38 +02:00			`"""log and prints"""`
last commit. zu verworren geworden neue version ist IMTC_TopicModeling 2017-12-19 17:12:35 +01:00			`string = "{}\t".format(datetime.now()) + str(string)`
aufgeräumt 2017-10-16 14:01:38 +02:00			`print(string)`
			`if level == "INFO":`
			`logging.info(string)`
			`elif level == "DEBUG":`
			`logging.debug(string)`
			`elif level == "WARNING":`
			`logging.warning(string)`


			`def compose(*functions):`
			`def compose2(f, g):`
			`return lambda x: f(g(x))`

			`return functools.reduce(compose2, functions, lambda x: x)`


			`def get_calling_function():`
			`"""finds the calling function in many decent cases.`
			`https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name`
			`"""`
			`fr = sys._getframe(1) # inspect.stack()[1][0]`
			`co = fr.f_code`
			`for get in (`
			`lambda: fr.f_globals[co.co_name],`
			`lambda: getattr(fr.f_locals['self'], co.co_name),`
			`lambda: getattr(fr.f_locals['cls'], co.co_name),`
			`lambda: fr.f_back.f_locals[co.co_name], # nested`
			`lambda: fr.f_back.f_locals['func'], # decorators`
			`lambda: fr.f_back.f_locals['meth'],`
			`lambda: fr.f_back.f_locals['f'],`
			`):`
			`try:`
			`func = get()`
			`except (KeyError, AttributeError):`
			`pass`
			`else:`
			`if func.__code__ == co:`
			`return func`
			`raise AttributeError("func not found")`


			`def save_obj(obj, path):`
			`with open(path , 'wb') as f:`
			`pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)`

			`def load_obj(path):`
			`with open(path, 'rb') as f:`
			`return pickle.load(f)`

topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00
preprocessing überarbeitet 2017-12-08 11:06:07 +01:00			`def replaceRockDots_lambda():`
			`return lambda string : re.sub(r'[ß]', "ss",`
			`(re.sub(r'[ö]', "oe",`
			`(re.sub(r'[Ö]', "Oe",`
			`(re.sub(r'[ü]', "ue",`
			`(re.sub(r'[Ü]', "Ue",`
			`(re.sub(r'[ä]', "ae",`
			`(re.sub(r'[Ä]', "Ae",`
			`string)))))))))))))`

			`def replaceRockDots(string):`
			`return re.sub(r'[ß]', "ss",`
			`(re.sub(r'[ö]', "oe",`
			`(re.sub(r'[Ö]', "Oe",`
			`(re.sub(r'[ü]', "ue",`
			`(re.sub(r'[Ü]', "Ue",`
			`(re.sub(r'[ä]', "ae",`
			`(re.sub(r'[Ä]', "Ae",`
			`string)))))))))))))`

aufgeräumt 2017-10-16 14:01:38 +02:00
			`def list_from_files(*paths):`
			`"""`
			`create string-list from file like`
			`n1`
			`n2`
			`n3`

			`:param paths: list(str) or str if single path`
			`:return: list(str)`
			`"""`

			`listlist = []`
			`for path in paths:`
			`listlist.append(list(textacy.fileio.read_file_lines(path)))`

			`#liste von listen zu einer liste`
			`liste = [item for sublist in listlist for item in sublist]`

			`return list(map(textacy.preprocess.normalize_whitespace, liste))`

llda mit subjects und keywords korrigiert 2017-11-21 10:14:37 +01:00			`def breakpoint():`
refactored 2017-11-17 11:46:57 +01:00			`pass`

. 2017-11-29 16:31:30 +01:00			`def sort_dictionary(dict):`
			`return sorted(dict.items(), key=operator.itemgetter(1))`


last commit. zu verworren geworden neue version ist IMTC_TopicModeling 2017-12-19 17:12:35 +01:00			`def normalize_str(string):`
			`"""`
			`replaceRockDots`
			`textacy.preprocess.normalize_whitespace`
			`:param string: str`
			`:return: str`
			`"""`
			`return textacy.preprocess.normalize_whitespace(replaceRockDots(string))`
refactored 2017-11-17 11:46:57 +01:00
aufgeräumt 2017-10-16 14:01:38 +02:00
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`def deprecated(func):`
			`"""This is a decorator which can be used to mark functions`
			`as deprecated. It will result in a warning being emmitted`
			`when the function is used."""`

			`@functools.wraps(func)`
			`def new_func(args, *kwargs):`
			`warnings.simplefilter('always', DeprecationWarning) #turn off filter`
			`warnings.warn("Call to deprecated function {}.".format(func.__name__), category=DeprecationWarning, stacklevel=2)`
			`warnings.simplefilter('default', DeprecationWarning) #reset filter`
			`return func(args, *kwargs)`
aufgeräumt 2017-10-16 14:01:38 +02:00
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`return new_func`
aufgeräumt 2017-10-16 14:01:38 +02:00
llda mit subjects und keywords korrigiert 2017-11-21 10:14:37 +01:00			`def flatten(liste):`
			`return [item for sublist in liste for item in sublist]`

aufgeräumt 2017-10-16 14:01:38 +02:00
			`def printRandomDoc(textacyCorpus):`
			`"""`
			`printlogss random doc out of a textacy-Corpus`
			`:param textacyCorpus:`
			`"""`
			`print()`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`if len(textacyCorpus) == 0:`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`logprint("NO DOCS IN CORPUS")`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`else:`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`#printlog("len(textacyCorpus) = %i" % len(textacyCorpus))`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`randIndex = int((len(textacyCorpus) - 1) * random.random())`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`logprint("Index: {0} \n Text: {1} \n categoryName: {2}\n".format(randIndex, textacyCorpus[randIndex].text,`
			`textacyCorpus[randIndex].metadata['categoryName']))`
aufgeräumt 2017-10-16 14:01:38 +02:00
			`print()`

lauffähige version 2017-11-06 12:54:59 +01:00			`def get_list_from_config(section,option):`
			`return list(map(textacy.preprocess.normalize_whitespace,config.get(section,option).split(",")))`

			`def corpus2Text(corpus):`
			`for doc in corpus:`
			`yield doc.text`

			`def corpus2Meta(corpus):`
			`for doc in corpus:`
			`yield doc.metadata`

last commit. zu verworren geworden neue version ist IMTC_TopicModeling 2017-12-19 17:12:35 +01:00			`def savelabledCorpiLines_cat(corpus, filepath):`
lauffähige version 2017-11-06 12:54:59 +01:00			`textacy.fileio.write_file_lines(gen_labledLines(corpus), filepath=filepath)`

last commit. zu verworren geworden neue version ist IMTC_TopicModeling 2017-12-19 17:12:35 +01:00

			`def gen_labledLines(corpus, label ="categoryName"):`
lauffähige version 2017-11-06 12:54:59 +01:00			`for doc in corpus:`
			`# generate [topic1, topic2....] tok1 tok2 tok3 out of corpi`
last commit. zu verworren geworden neue version ist IMTC_TopicModeling 2017-12-19 17:12:35 +01:00			`yield "[" + doc.metadata[label] + "] " + doc.text`



aufgeräumt 2017-10-16 14:01:38 +02:00

			`def save_corpus(corpus, corpus_path, corpus_name):`
			`"""`
			`saves a textacy-corpus including spacy-parser`
			`:param corpus: textacy-Corpus`
			`:param corpus_path: str`
			`:param corpus_name: str (should content the language like "_de_")`
			`"""`

			`# save parser`
			`parser = corpus.spacy_lang`
			`parserpath = corpus_path + str(parser.lang) + '_parser'`
			`parser.save_to_directory(parserpath)`

commit vor refactoring 2017-11-03 11:49:26 +01:00			`# save plain content + meta`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`plainpath = corpus_path + corpus_name + "_content.json"`
commit vor refactoring 2017-11-03 11:49:26 +01:00			`textacy.fileio.write_json_lines(gen_dicts(corpus), plainpath)`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00

commit vor refactoring 2017-11-03 11:49:26 +01:00			`def gen_dicts(corpus):`
termiteplot für lda 2017-10-30 12:56:52 +01:00			`for doc in corpus:`
commit vor refactoring 2017-11-03 11:49:26 +01:00			`dict = {"index" : doc.corpus_index, "content" : doc.text}`
			`dict.update(doc.metadata)`
			`yield dict`
aufgeräumt 2017-10-16 14:01:38 +02:00
last commit. zu verworren geworden neue version ist IMTC_TopicModeling 2017-12-19 17:12:35 +01:00

preprocessing überarbeitet 2017-12-08 11:06:07 +01:00			`def multisub(subs, subject):`
			`#https://stackoverflow.com/questions/764360/a-list-of-string-replacements-in-python`
			`"Simultaneously perform all substitutions on the subject string."`
			`pattern = '\|'.join('(%s)' % re.escape(p) for p, s in subs)`
			`substs = [s for p, s in subs]`
			`replace = lambda m: substs[m.lastindex - 1]`
			`return re.sub(pattern, replace, subject)`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00
aufgeräumt 2017-10-16 14:01:38 +02:00			`def load_corpus(corpus_path, corpus_name, lang="de"):`
			`"""`
			`Load textacy-Corpus including spacy-parser out from file`
			`:param corpus_path: str`
			`:param corpus_name: str (should content the language like "_de_")`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`:param lang: str (language code) ir spacy.Language`
aufgeräumt 2017-10-16 14:01:38 +02:00			`:return: texracy.Corpus, spacy.language`
			`"""`

			`#ckeck for language`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`if "de_" in corpus_name:`
aufgeräumt 2017-10-16 14:01:38 +02:00			`lang="de"`
weiter aufgeräumt 2017-10-17 10:13:49 +02:00			`elif "en_" in corpus_name:`
aufgeräumt 2017-10-16 14:01:38 +02:00			`lang ="en"`


			`# load parser`
			`parser = spacy.load(lang)`

			`stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'`
			`with open(stringstorepath) as file:`
			`parser.vocab.strings.load(file)`

			`vocabpath = Path(corpus_path + str(lang) + '_parser'+'/vocab/lexemes.bin')`
			`parser.vocab.load_lexemes(vocabpath)`

			`#load corpus`
			`corpus = textacy.Corpus(parser)`

topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00			`plainpath = corpus_path + corpus_name + "_content.json"`
termiteplot für lda 2017-10-30 12:56:52 +01:00			`plain_stream = textacy.fileio.read_json_lines(plainpath) # yields {int : str}`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00
commit vor refactoring 2017-11-03 11:49:26 +01:00			`for plain in plain_stream:`
			`meta = {}`
			`for key,value in plain.items():`
			`if key != "content" and key != "index":`
			`meta[key] = value`
lauffähige version 2017-11-06 12:54:59 +01:00			`corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))`
topicmodeling jgibbsllda lauffähig 2017-10-25 09:46:44 +02:00
lauffähige version 2017-11-06 12:54:59 +01:00			`return corpus, corpus.spacy_lang`