2017-10-16 14:01:38 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
2017-10-17 10:13:49 +02:00
|
|
|
import configparser as ConfigParser
|
2017-10-16 14:01:38 +02:00
|
|
|
import csv
|
|
|
|
import functools
|
|
|
|
import logging
|
2017-10-17 10:13:49 +02:00
|
|
|
import random
|
2017-10-16 14:01:38 +02:00
|
|
|
import re
|
|
|
|
import sys
|
2017-10-17 10:13:49 +02:00
|
|
|
from pathlib import Path
|
|
|
|
import pickle
|
2017-10-16 14:01:38 +02:00
|
|
|
import spacy
|
|
|
|
import textacy
|
|
|
|
from scipy import *
|
2017-10-17 10:13:49 +02:00
|
|
|
import os
|
2017-10-25 09:46:44 +02:00
|
|
|
import glob, os
|
|
|
|
from textacy.fileio import open_sesame
|
|
|
|
import json
|
|
|
|
from spacy.tokens.doc import Doc as SpacyDoc
|
2017-11-29 16:31:30 +01:00
|
|
|
import operator
|
2017-10-16 14:01:38 +02:00
|
|
|
|
|
|
|
# Raise the CSV field size limit to the platform maximum: corpus documents
# stored in CSV cells can exceed the default 128 KiB limit.
csv.field_size_limit(sys.maxsize)

# Absolute directory of this module, with trailing slash; used to resolve
# config.ini and the log file relative to the module's own location rather
# than the current working directory.
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
|
2017-10-16 14:01:38 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# load config
# config.ini lives next to this module (see FILEPATH above).
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
# read_file raises FileNotFoundError via open() if config.ini is missing,
# which is intentional: the module cannot run without its configuration.
with open(config_ini) as f:
    config.read_file(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# config logging
# Log file path and verbosity both come from the [logging] section of config.ini.
filename = FILEPATH + config.get("logging","filename")
level = config.get("logging","level")
# Map the level name from config.ini onto the logging module's constant.
# NOTE(review): an unrecognized name leaves `level` as a plain string, which
# logging.basicConfig() would reject — presumably config.ini only ever
# contains INFO/DEBUG/WARNING; confirm before adding new level names.
if level == "INFO":
    level = logging.INFO
elif level == "DEBUG":
    level = logging.DEBUG
elif level == "WARNING":
    level = logging.WARNING
logging.basicConfig(filename=filename, level=level)
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-25 09:46:44 +02:00
|
|
|
def logprint(string, level="INFO"):
|
2017-10-16 14:01:38 +02:00
|
|
|
"""log and prints"""
|
|
|
|
print(string)
|
|
|
|
if level == "INFO":
|
|
|
|
logging.info(string)
|
|
|
|
elif level == "DEBUG":
|
|
|
|
logging.debug(string)
|
|
|
|
elif level == "WARNING":
|
|
|
|
logging.warning(string)
|
|
|
|
|
|
|
|
|
|
|
|
def compose(*functions):
    """Compose callables right-to-left: ``compose(f, g)(x) == f(g(x))``.

    With no arguments, returns the identity function.
    """
    composed = lambda x: x
    for func in functions:
        # Each step wraps the accumulated function around the next one,
        # binding both via default-argument capture of the factory call.
        composed = (lambda outer, inner: lambda x: outer(inner(x)))(composed, func)
    return composed
|
|
|
|
|
|
|
|
|
|
|
|
def get_calling_function():
    """finds the calling function in many decent cases.

    Walks one frame up the call stack and tries several lookup strategies
    to recover the actual function object that is executing there.

    :return: the calling function object
    :raises AttributeError: if no strategy finds a function whose code
        object matches the calling frame

    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)  # inspect.stack()[1][0]
    co = fr.f_code
    # Candidate lookups, tried in order; each may raise KeyError/AttributeError.
    for get in (
        lambda: fr.f_globals[co.co_name],                   # plain module-level function
        lambda: getattr(fr.f_locals['self'], co.co_name),   # bound method
        lambda: getattr(fr.f_locals['cls'], co.co_name),    # classmethod
        lambda: fr.f_back.f_locals[co.co_name],             # nested
        lambda: fr.f_back.f_locals['func'],                 # decorators
        lambda: fr.f_back.f_locals['meth'],
        lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            # Accept only a candidate whose code object is the one executing
            # in the calling frame — guards against same-named lookups.
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")
|
|
|
|
|
|
|
|
|
|
|
|
def save_obj(obj, path):
    """Pickle *obj* to the file at *path* using the highest protocol."""
    with open(path, 'wb') as outfile:
        pickle.dump(obj, outfile, pickle.HIGHEST_PROTOCOL)
|
|
|
|
|
|
|
|
def load_obj(path):
    """Unpickle and return the object stored at *path*."""
    with open(path, 'rb') as infile:
        loaded = pickle.load(infile)
    return loaded
|
|
|
|
|
2017-10-25 09:46:44 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def replaceRockDots():
    """Return a callable that lowercases a string and folds German
    umlauts and eszett to ASCII digraphs (ä→ae, ö→oe, ü→ue, ß→ss)."""
    def _fold(string):
        folded = string.lower()
        for umlaut, ascii_pair in (("ä", "ae"), ("ü", "ue"), ("ö", "oe"), ("ß", "ss")):
            folded = folded.replace(umlaut, ascii_pair)
        return folded
    return _fold
|
|
|
|
|
|
|
|
def list_from_files(*paths):
    """
    create string-list from file like
    n1
    n2
    n3

    :param paths: list(str) or str if single path
    :return: list(str)
    """
    # Read every file line-by-line and flatten into one list of lines.
    lines = [line
             for path in paths
             for line in textacy.fileio.read_file_lines(path)]

    # Normalize whitespace of every line before returning.
    return list(map(textacy.preprocess.normalize_whitespace, lines))
|
|
|
|
|
2017-11-21 10:14:37 +01:00
|
|
|
def breakpoint():
    """No-op hook for attaching debugger breakpoints.

    NOTE(review): this shadows the Python 3.7+ builtin ``breakpoint()``
    for any module doing ``from <this module> import *`` — consider
    renaming if the builtin is ever needed.
    """
    pass
|
|
|
|
|
2017-11-29 16:31:30 +01:00
|
|
|
def sort_dictionary(dict):
    """Return the mapping's (key, value) pairs as a list sorted ascending by value.

    NOTE: the parameter name shadows the builtin ``dict`` inside this function.
    """
    pairs = list(dict.items())
    pairs.sort(key=lambda kv: kv[1])
    return pairs
|
|
|
|
|
|
|
|
|
2017-11-17 11:46:57 +01:00
|
|
|
def normalize(string):
    """Lowercase *string*, fold German umlauts/eszett to ASCII digraphs
    (ß→ss, ö→oe, ü→ue, ä→ae) and collapse whitespace via textacy."""
    # replaceRockDots equivalent, done with plain str.replace
    folded = string.lower()
    folded = folded.replace("ß", "ss")
    folded = folded.replace("ö", "oe")
    folded = folded.replace("ü", "ue")
    folded = folded.replace("ä", "ae")
    return textacy.preprocess.normalize_whitespace(folded)
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
|
2017-10-25 09:46:44 +02:00
|
|
|
def deprecated(func):
    """This is a decorator which can be used to mark functions
    as deprecated. It will result in a warning being emitted
    when the function is used.

    :param func: the function to wrap
    :return: wrapper that warns with DeprecationWarning, then delegates
    """
    # BUGFIX: the module never imports `warnings`, so the original wrapper
    # raised NameError on first call. Function-scope import keeps the fix local.
    import warnings

    @functools.wraps(func)
    def new_func(*args, **kwargs):
        warnings.simplefilter('always', DeprecationWarning)  # turn off filter
        warnings.warn("Call to deprecated function {}.".format(func.__name__),
                      category=DeprecationWarning, stacklevel=2)
        warnings.simplefilter('default', DeprecationWarning)  # reset filter
        return func(*args, **kwargs)

    return new_func
|
2017-10-16 14:01:38 +02:00
|
|
|
|
2017-11-21 10:14:37 +01:00
|
|
|
def flatten(liste):
    """Flatten one nesting level: a list of lists becomes a flat list."""
    flat = []
    for sublist in liste:
        flat.extend(sublist)
    return flat
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
|
|
|
|
def printRandomDoc(textacyCorpus):
    """
    Print and log a random doc out of a textacy corpus.

    :param textacyCorpus: textacy.Corpus (each doc needs .text and
        .metadata['categoryName'])
    """
    print()

    if len(textacyCorpus) == 0:
        logprint("NO DOCS IN CORPUS")
    else:
        # BUGFIX: randrange covers every index 0..len-1 uniformly; the
        # previous int((len-1) * random.random()) could never select the
        # last document.
        randIndex = random.randrange(len(textacyCorpus))
        logprint("Index: {0} \n Text: {1} \n categoryName: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                                         textacyCorpus[randIndex].metadata['categoryName']))

    print()
|
|
|
|
|
2017-11-06 12:54:59 +01:00
|
|
|
def get_list_from_config(section, option):
    """Read a comma-separated value from config.ini and return it as a
    list of whitespace-normalized strings."""
    raw_items = config.get(section, option).split(",")
    return [textacy.preprocess.normalize_whitespace(item) for item in raw_items]
|
|
|
|
|
|
|
|
def corpus2Text(corpus):
    """Generator yielding the plain text of every doc in *corpus*."""
    for document in corpus:
        yield document.text
|
|
|
|
|
|
|
|
def corpus2Meta(corpus):
    """Generator yielding the metadata mapping of every doc in *corpus*."""
    for document in corpus:
        yield document.metadata
|
|
|
|
|
|
|
|
def savelabledCorpiLines(corpus,filepath):
    # Write one "[categoryName] text" line per doc (produced by
    # gen_labledLines) to the file at filepath.
    textacy.fileio.write_file_lines(gen_labledLines(corpus), filepath=filepath)
|
|
|
|
|
|
|
|
def gen_labledLines(corpus):
    """Yield each doc as a labeled line: ``[<categoryName>] <text>``."""
    for document in corpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[{}] {}".format(document.metadata["categoryName"], document.text)
|
2017-10-16 14:01:38 +02:00
|
|
|
|
|
|
|
|
|
|
|
def save_corpus(corpus, corpus_path, corpus_name):
    """
    saves a textacy-corpus including spacy-parser

    :param corpus: textacy-Corpus
    :param corpus_path: str (directory prefix the files are written under)
    :param corpus_name: str (should contain the language like "_de_")
    """
    # save parser: the spacy model's vocab/strings go into a sibling
    # "<lang>_parser" directory so load_corpus can restore them.
    parser = corpus.spacy_lang
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    # save plain content + meta as one JSON object per line
    # (see gen_dicts for the per-doc record layout)
    plainpath = corpus_path + corpus_name + "_content.json"
    textacy.fileio.write_json_lines(gen_dicts(corpus), plainpath)
|
2017-10-25 09:46:44 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
|
|
|
|
|
2017-11-03 11:49:26 +01:00
|
|
|
def gen_dicts(corpus):
    """Yield one JSON-serializable record per doc: its corpus index, raw
    text under "content", plus all of the doc's metadata key/values.

    :param corpus: iterable of docs with .corpus_index, .text, .metadata
    """
    for doc in corpus:
        # FIX: renamed local from `dict`, which shadowed the builtin.
        record = {"index": doc.corpus_index, "content": doc.text}
        record.update(doc.metadata)
        yield record
|
2017-10-16 14:01:38 +02:00
|
|
|
|
|
|
|
|
2017-10-25 09:46:44 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
def load_corpus(corpus_path, corpus_name, lang="de"):
    """
    Load textacy-Corpus including spacy-parser out from file

    :param corpus_path: str
    :param corpus_name: str (should contain the language like "_de_")
    :param lang: str (language code) or spacy.Language
    :return: textacy.Corpus, spacy.language
    """

    # check for language: the corpus name overrides the lang argument
    if "de_" in corpus_name:
        lang="de"
    elif "en_" in corpus_name:
        lang ="en"

    # load parser (base spacy model for the language)
    parser = spacy.load(lang)

    # restore the string store saved by save_corpus
    stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)

    # restore the lexemes saved alongside the string store
    vocabpath = Path(corpus_path + str(lang) + '_parser'+'/vocab/lexemes.bin')
    parser.vocab.load_lexemes(vocabpath)

    # load corpus content: one JSON object per line as written by save_corpus
    corpus = textacy.Corpus(parser)

    plainpath = corpus_path + corpus_name + "_content.json"
    plain_stream = textacy.fileio.read_json_lines(plainpath) # yields {int : str}

    # rebuild each doc: everything except "content"/"index" is metadata
    for plain in plain_stream:
        meta = {}
        for key,value in plain.items():
            if key != "content" and key != "index":
                meta[key] = value
        corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))

    return corpus, corpus.spacy_lang
|