# -*- coding: utf-8 -*-
"""Build a raw textacy corpus from a semicolon-delimited ticket CSV export.

Input/output paths and the relevant column names are read from the
``config.ini`` file located next to this script.
"""

# NOTE: import order is kept exactly as in the original file, because the
# star imports may shadow each other's names.
import csv
import sys
from miscellaneous import *
import time
from datetime import datetime
import re
import textacy
from textacy.preprocess import normalize_whitespace
from scipy import *
import os

# Raise the csv module's per-field size limit to the maximum so that
# oversized fields do not abort parsing.
csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
# NOTE(review): ConfigParser is not imported explicitly in this file;
# presumably it is re-exported by the star import from miscellaneous - confirm.
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    Yield the content cell of every data row of a ticket CSV.

    The first row is treated as the header and is not yielded. If no header
    cell equals ``content_collumn_name``, column 0 is used; if several cells
    match, the last one wins.

    :param path2csv: path to a ";"-delimited CSV file
    :param content_collumn_name: header name of the column holding the text
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default value

    for i, lst in enumerate(stream):
        if i == 0:
            # look for desired column
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]


def ticket_csv_to_DictStream(path2csv: str, metalist: list):
    """
    Yield one metadata dict per data row of a ticket CSV.

    The first row is treated as the header. Header cells and requested keys
    are compared after stripping everything except ASCII letters, so e.g.
    surrounding whitespace or a byte-order mark does not prevent a match.
    Every key is mapped to the index of *its own* matching column, so the
    result does not depend on the CSV column order matching ``metalist``.
    Keys without a matching column are left out of the yielded dicts.

    :param path2csv: path to a ";"-delimited CSV file
    :param metalist: list of strings, the metadata column names to extract
    :return: dict-generator, e.g. {'Subject': ..., 'categoryName': ...}
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    key_to_index = {}

    for i, lst in enumerate(stream):
        if i == 0:
            non_letters = re.compile('[^a-zA-Z]+')

            # normalized header name -> index of the first column carrying it
            header_lookup = {}
            for j, col in enumerate(lst):
                header_lookup.setdefault(non_letters.sub('', col), j)

            # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}
            for key in metalist:
                index = header_lookup.get(non_letters.sub('', key))
                if index is not None:
                    key_to_index[key] = index
        else:
            yield {key: lst[index] for key, index in key_to_index.items()}


##################################################################################################

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"

# Example of the values expected in config.ini:
#
# content_collumn_name = "Description"
# metaliste = [
#     "TicketNumber",
#     "Subject",
#     "CreatedDate",
#     "categoryName",
#     "Impact",
#     "Urgency",
#     "BenutzerID",
#     "VerantwortlicherID",
#     "EigentuemerID",
#     "Solution"
# ]

content_collumn_name = config.get("tickets", "content_collumn_name")

# comma-separated list in the config; whitespace around each entry is normalized
metaliste = list(map(normalize_whitespace, config.get("tickets", "metaliste").split(",")))

path2de_csv = FILEPATH + config.get("de_corpus", "input")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")

path2en_csv = FILEPATH + config.get("en_corpus", "input")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")


def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):
    """
    Read a ticket CSV into a textacy corpus, drop empty docs and save it.

    NOTE(review): printlog, printRandomDoc and save_corpus are not defined in
    this file; presumably they come from the star import of miscellaneous.

    :param path2_csv: path to the ";"-delimited ticket CSV
    :param corpus_path: location handed to save_corpus
    :param content_collumn_name: header name of the column holding the text
    :param metaliste: list of header names to attach as metadata
    :param lang: language passed to textacy.Corpus; also prefixes the saved corpus name
    :param printrandom: how many times printRandomDoc is called on the corpus
    """
    # log only the file name, not the whole path
    filename = path2_csv.split("/")[-1]

    printlog("Corporization of {0} at {1}".format(filename, datetime.now()))

    raw_corpus = textacy.Corpus(lang)

    # add texts and their metadata to the textacy corpus
    # printlog("Add texts to {0}_textacy-corpi".format(lang))
    raw_corpus.add_texts(
        ticketcsv_to_textStream(path2_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2_csv, metaliste)
    )

    # drop empty docs from the corpus
    raw_corpus.remove(lambda doc: len(doc) == 0)

    # print random docs
    for i in range(printrandom):
        printRandomDoc(raw_corpus)

    # save corpus
    raw_name = lang + "_raw_ticket"
    save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)

    printlog("Done")


def main():
    """Build the German raw ticket corpus and log the elapsed time in minutes."""
    start = time.time()

    ticketcsv2Corpus(path2de_csv, corpus_de_path, content_collumn_name, metaliste, lang="de")

    # English corpus is currently disabled:
    # ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en")

    end = time.time()
    printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()