2017-10-11 17:16:04 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
import csv
|
|
|
|
import sys
|
2017-10-16 14:01:38 +02:00
|
|
|
from miscellaneous import *
|
|
|
|
import time
|
|
|
|
from datetime import datetime
|
|
|
|
import re
|
|
|
|
import textacy
|
2017-10-18 17:37:20 +02:00
|
|
|
from textacy.preprocess import normalize_whitespace
|
2017-10-16 14:01:38 +02:00
|
|
|
from scipy import *
|
2017-10-17 10:13:49 +02:00
|
|
|
import os
|
2017-10-11 17:16:04 +02:00
|
|
|
|
2017-10-17 10:13:49 +02:00
|
|
|
# Raise the csv module's per-field size limit to sys.maxsize so that very
# large cells do not abort parsing.
csv.field_size_limit(sys.maxsize)

# Directory that contains this script (symlinks resolved), with a trailing
# "/" so that file names can be appended by plain string concatenation.
FILEPATH = "{0}/".format(os.path.dirname(os.path.realpath(__file__)))

# load config
# NOTE(review): ConfigParser is not imported by name in this file; presumably
# it is provided by the star import from `miscellaneous` - verify.
config_ini = "{0}config.ini".format(FILEPATH)
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-12 15:57:56 +02:00
|
|
|
def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    Yield the content cell of every data row of a semicolon-delimited CSV.

    The first row is treated as the header and is never yielded. It is
    scanned for a cell equal to ``content_collumn_name``; if several cells
    match, the last one is used, and if none matches, column 0 is used.

    :param path2csv: path to the CSV file
    :param content_collumn_name: header name of the column holding the text
    :return: generator of strings, one per data row
    """
    rows = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')

    column_index = 0  # fallback when the header has no matching cell

    for row_number, row in enumerate(rows):
        if row_number > 0:
            yield row[column_index]
            continue

        # Header row: locate the requested column.
        for position, header in enumerate(row):
            if header == content_collumn_name:
                column_index = position
|
|
|
|
|
|
|
|
|
2017-10-12 15:57:56 +02:00
|
|
|
def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
    """
    Yield one metadata dict per data row of a semicolon-delimited CSV.

    The first row is treated as the header and is never yielded. Every name
    in ``metalist`` is compared with the header cells after all characters
    other than ASCII letters have been removed from both sides (so
    ``"category_name"`` matches a header cell ``"categoryname"``; the
    comparison stays case-sensitive).

    Each name is resolved to its own column index. The previous version
    collected the matching indices in CSV column order and zipped them with
    ``metalist``, which bound names to the wrong columns whenever the two
    orders differed, a name was missing, or a header cell matched twice.

    :param path2csv: path to the CSV file
    :param metalist: names of the columns to extract
    :return: generator of dicts mapping each name from ``metalist`` that was
        found in the header to that row's cell value; names without a
        matching header cell are omitted, and if several header cells match
        a name the first one is used
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')

    non_letters = re.compile('[^a-zA-Z]+')

    column_of = {}  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}

    for i, lst in enumerate(stream):
        if i == 0:
            # Header row: normalise each cell once, then look up every
            # requested name independently of the CSV column order.
            normalized_header = [non_letters.sub('', col) for col in lst]
            for key in metalist:
                wanted = non_letters.sub('', key)
                for j, col in enumerate(normalized_header):
                    if col == wanted:
                        column_of[key] = j
                        break
        else:
            # A fresh dict per row, so consumers may keep or mutate it.
            yield {key: lst[j] for key, j in column_of.items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
##################################################################################################
|
|
|
|
|
|
|
|
|
2017-10-11 17:16:04 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
# Ticket CSV layout: name of the text column and the metadata columns to keep.
# NOTE(review): get_list_from_config is not defined in this file; presumably
# it comes from the star import of `miscellaneous` - verify.
content_collumn_name = config.get("tickets", "content_collumn_name")
metaliste = get_list_from_config("tickets", "metaliste")

# German corpus: input CSV and corpus output location, relative to FILEPATH.
path2de_csv, corpus_de_path = [
    FILEPATH + config.get("de_corpus", option) for option in ("input", "path")
]

# English corpus: input CSV and corpus output location, relative to FILEPATH.
path2en_csv, corpus_en_path = [
    FILEPATH + config.get("en_corpus", option) for option in ("input", "path")
]
|
2017-10-11 17:16:04 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
2017-10-17 10:13:49 +02:00
|
|
|
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):
    """
    Build a raw textacy corpus from a ticket CSV and save it.

    The CSV is read twice in parallel: once for the text column and once for
    the metadata columns. Documents of length 0 are removed before saving.

    :param path2_csv: path to the ticket CSV file
    :param corpus_path: location handed to ``save_corpus``
    :param content_collumn_name: header name of the column holding the text
    :param metaliste: header names of the metadata columns to attach
    :param lang: language passed to ``textacy.Corpus``; also the prefix of
        the saved corpus name (``<lang>_raw_ticket``)
    :param printrandom: how many times ``printRandomDoc`` is called on the
        finished corpus
    :return: None
    """
    # NOTE(review): logprint, printRandomDoc and save_corpus are not defined
    # in this file; presumably they come from `miscellaneous` - verify.

    # Announce which file is being processed (last "/"-separated component).
    filename = path2_csv.split("/")[-1]
    logprint("Corporization of {0} at {1}".format(filename, datetime.now()))

    raw_corpus = textacy.Corpus(lang)

    # Add the ticket texts together with their metadata to the corpus.
    texts = ticketcsv_to_textStream(path2_csv, content_collumn_name)
    metadatas = ticket_csv_to_DictStream(path2_csv, metaliste)
    raw_corpus.add_texts(texts, metadatas)

    # Drop empty documents from the corpus.
    raw_corpus.remove(lambda doc: len(doc) == 0)

    logprint("corpus-lenght: {}".format(len(raw_corpus)))

    # Print the requested number of random documents.
    for _ in range(printrandom):
        printRandomDoc(raw_corpus)

    # Save the corpus.
    save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=lang + "_raw_ticket")
    logprint("Done")
|
2017-10-17 10:13:49 +02:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Build the German raw ticket corpus and log the elapsed time in minutes."""
    started = time.time()

    ticketcsv2Corpus(path2de_csv, corpus_de_path, content_collumn_name, metaliste, lang="de")

    # The English corpus is currently disabled:
    # ticketcsv2Corpus(path2en_csv, corpus_en_path, content_collumn_name, metaliste, lang="en")

    elapsed_minutes = (time.time() - started) / 60
    logprint("Time Elapsed Corporization:{0} min".format(elapsed_minutes))


if __name__ == "__main__":
    main()
|