# -*- coding: utf-8 -*-
"""Build a raw textacy corpus from a semicolon-delimited ticket CSV.

Paths and the name of the text column are read from ``config.ini``, which is
expected to live next to this file.
"""

# NOTE: the import order is kept exactly as it was. The star imports below can
# shadow each other and the plain imports, so reordering them could silently
# change which object a name resolves to.
import csv
import sys
from miscellaneous import *
import time
from datetime import datetime
import re
import textacy
from textacy.preprocess import normalize_whitespace
from scipy import *
import os

# Allow arbitrarily large CSV fields (ticket bodies can exceed the default).
csv.field_size_limit(sys.maxsize)

# Directory containing this file, with a trailing slash so that relative
# config values can simply be appended.
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
# NOTE(review): ``ConfigParser`` is not imported explicitly in this file;
# presumably it is provided by ``from miscellaneous import *`` -- confirm.
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


def ticketcsv_to_textStream(path2csv, content_collumn_name):
    """
    Yield the text of one column for every data row of a ``;``-delimited CSV.

    The first row is treated as the header and is never yielded. If no header
    cell equals ``content_collumn_name``, column 0 is used; if several cells
    match, the last one wins.

    :param path2csv: string
    :param content_collumn_name: string
    :return: string-generator
    """
    rows = iter(textacy.fileio.read_csv(path2csv, delimiter=";"))  # ,encoding='utf8')

    header = next(rows, None)
    if header is None:
        # Empty file: nothing to yield.
        return

    content_index = 0  # default value
    # look for desired column
    for position, column_name in enumerate(header):
        if column_name == content_collumn_name:
            content_index = position

    for row in rows:
        yield row[content_index]


def ticket_csv_to_DictStream(path2csv, content_collumn_name):
    """
    Yield one metadata dict per data row of a ``;``-delimited CSV.

    Every header cell becomes a key and the row's cell at the same position
    becomes its value, e.g. ``{'Subject': ..., 'categoryName': ...}``. The
    header row itself is never yielded.

    :param path2csv: string
    :param content_collumn_name: string; not used by this function, kept so
        the signature mirrors ``ticketcsv_to_textStream``
    :return: dict-generator
    """
    rows = iter(textacy.fileio.read_csv(path2csv, delimiter=";"))  # ,encoding='utf8')

    header = next(rows, None)
    if header is None:
        # Empty file: nothing to yield.
        return

    # Map column name -> column index,
    # e.g. {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}
    column_index = {}
    for position, column_name in enumerate(header):
        # Fix up the .csv header: any cell containing "icketNumb" is
        # normalised to "TicketNumber" (presumably the raw cell carries a
        # BOM or similar prefix -- TODO confirm).
        # TODO: if this has to be hard-coded anyway, drop config.ini as well?
        if "icketNumb" in column_name:
            column_name = "TicketNumber"
        column_index[str(column_name)] = position

    for row in rows:
        yield {name: row[position] for name, position in column_index.items()}


##################################################################################################

# Module-level settings consumed by main().
content_collumn_name = config.get("tickets", "content_collumn_name")

path2de_csv = FILEPATH + config.get("de_corpus", "input")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")


def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0):
    # TODO: this could be the main
    """
    Use textacy to create a Corpus out of the ITMC-Ticket.csv

    The CSV is read twice in lockstep: once for the document texts and once
    for the per-document metadata. Empty documents are removed, and the
    corpus is saved under the name ``<lang>_raw``.

    :param path2_csv: str
    :param corpus_path: str
    :param content_collumn_name: str the Collumn which is used as the Docs text
    :param lang: str standard 2-letter language
    :param printrandom: print n random Documents (currently not used by this
        function)
    :return: textacy.Corpus
    """
    # Log only the file name, not the whole path.
    filename = os.path.basename(path2_csv)

    logprint("Corporization of {0}".format(filename))  # , datetime.now()))

    raw_corpus = textacy.Corpus(lang)

    # Add the rows to the textacy corpus.
    # TODO: clean here, and do not forget the metadata dict.
    raw_corpus.add_texts(
        ticketcsv_to_textStream(path2_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2_csv, content_collumn_name)
    )

    # Drop empty docs from the corpus.
    raw_corpus.remove(lambda doc: len(doc) == 0)

    logprint("corpus-length: {}".format(len(raw_corpus)))

    # save corpus
    raw_name = lang + "_raw"
    save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)

    return raw_corpus


def main():
    """Build the German raw corpus from the configured CSV and log the run time.

    :return: textacy.Corpus
    """
    start = time.time()

    raw_corpus = ticketcsv2Corpus(path2de_csv, corpus_de_path, content_collumn_name, lang="de")

    end = time.time()
    logprint("Time Elapsed Corporization:{0} min".format((end - start) / 60))
    return raw_corpus


if __name__ == "__main__":
    main()