# -*- coding: utf-8 -*-
"""Build a raw textacy corpus from a semicolon-separated ticket CSV export.

The ticket text of every CSV row becomes one document; selected columns are
attached to it as metadata. Paths and column names are read from
``config.ini`` located next to this file.
"""
import csv
import sys
from miscellaneous import *
import time
from datetime import datetime

import re
import textacy
from textacy.preprocess import normalize_whitespace
from scipy import *

import os

# Ticket bodies can exceed the csv module's default field size limit.
csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
# NOTE(review): `ConfigParser` is not imported by name in this file;
# presumably it is provided by `from miscellaneous import *` -- confirm.
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
    """Yield the content field of every data row of a ticket CSV.

    The first row is treated as the header and is used to locate the column
    named ``content_collumn_name``. If no header cell matches, column 0 is
    used; if several match, the last one wins.

    :param path2csv: path of the ';'-delimited CSV file
    :param content_collumn_name: exact header name of the text column
    :return: string-generator, one item per non-header row
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")
    content_collumn = 0  # default value

    for i, lst in enumerate(stream):
        if i == 0:
            # look for desired column
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]


def _letters_only(name):
    """Return ``name`` with everything except ASCII letters removed."""
    return re.sub('[^a-zA-Z]+', '', name)


def _match_metadata_columns(header, metalist):
    """Map each key of ``metalist`` to the index of its column in ``header``.

    Names are compared after stripping every non-letter character. When
    several header cells normalise to the same name, the first one is used.
    Keys without a matching column are left out of the result.
    """
    positions = {}
    for idx, col in enumerate(header):
        positions.setdefault(_letters_only(col), idx)

    matched = {}
    for key in metalist:
        idx = positions.get(_letters_only(key))
        if idx is not None:
            matched[key] = idx
    return matched


def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
    """Yield one metadata dict per data row of a ticket CSV.

    The first row is treated as the header. Every key of ``metalist`` is
    bound to the column whose header matches it (ignoring non-letter
    characters), independent of the order of the CSV columns or of
    ``metalist``. Keys that have no matching column are omitted from the
    yielded dicts.

    :param path2csv: path of the ';'-delimited CSV file
    :param metalist: list of strings, the metadata keys to extract
    :return: dict-generator, one item per non-header row
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")
    column_of_key = {}

    for i, lst in enumerate(stream):
        if i == 0:
            # e.g. {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}
            column_of_key = _match_metadata_columns(lst, metalist)
        else:
            yield {key: lst[idx] for key, idx in column_of_key.items()}


##################################################################################################


content_collumn_name = config.get("tickets", "content_collumn_name")
metaliste = get_list_from_config("tickets", "metaliste")

path2de_csv = FILEPATH + config.get("de_corpus", "input")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")

path2en_csv = FILEPATH + config.get("en_corpus", "input")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")


def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):
    """Read a ticket CSV into a textacy corpus and save it.

    :param path2_csv: path of the ticket CSV file
    :param corpus_path: target location handed to ``save_corpus``
    :param content_collumn_name: header name of the column holding the text
    :param metaliste: list of header names to store as document metadata
    :param lang: language handed to ``textacy.Corpus``; also used as prefix
        of the saved corpus name
    :param printrandom: number of times ``printRandomDoc`` is called on the
        finished corpus
    """
    # last path component, used for logging only
    filename = path2_csv.split("/")[-1]

    logprint("Corporization of {0} at {1}".format(filename, datetime.now()))

    raw_corpus = textacy.Corpus(lang)

    # add texts and their metadata; both generators yield one item per
    # non-header CSV row
    raw_corpus.add_texts(
        ticketcsv_to_textStream(path2_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2_csv, metaliste)
    )

    # drop empty docs from the corpus
    raw_corpus.remove(lambda doc: len(doc) == 0)

    logprint("corpus-lenght: {}".format(len(raw_corpus)))

    # print random docs
    for i in range(printrandom):
        printRandomDoc(raw_corpus)

    # save corpus
    raw_name = lang + "_raw_ticket"
    save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)

    logprint("Done")


def main():
    """Build the German raw ticket corpus and log the elapsed time."""
    start = time.time()

    ticketcsv2Corpus(path2de_csv, corpus_de_path, content_collumn_name, metaliste, lang="de")

    # English corpus is currently disabled:
    # ticketcsv2Corpus(path2en_csv, corpus_en_path, content_collumn_name, metaliste, lang="en")

    end = time.time()
    logprint("Time Elapsed Corporization:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()