# -*- coding: utf-8 -*-
"""Build raw German and English textacy corpora from ticket CSV exports.

The script reads its settings from ``config.ini``, streams the ticket text
and the ticket metadata out of the configured CSV files, adds them to one
textacy corpus per language, drops empty documents and saves both corpora.
"""
# NOTE: the import order below is deliberate.  Two star imports are involved
# (``miscellaneous`` and ``scipy``); the explicit imports that follow
# ``from miscellaneous import *`` and the final ``import sys`` make sure the
# intended modules win over anything a star import may have bound.
from collections import Counter
import pickle
import configparser as ConfigParser

from stop_words import get_stop_words
from nltk.corpus import stopwords as nltk_stopwords

from miscellaneous import *

import csv
import functools
import logging
import re
import time
import xml.etree.ElementTree as ET
from datetime import datetime

import spacy
import textacy
from nltk.corpus import stopwords

from scipy import *

import sys

# Ticket descriptions can be very long; lift the csv module's field limit.
csv.field_size_limit(sys.maxsize)

# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)

# Header names are compared with everything except ASCII letters removed.
_NON_LETTERS = re.compile('[^a-zA-Z]+')


def _parse_name_list(raw: str):
    """Split a comma-separated config value into a list of stripped names.

    :param raw: option value as returned by ``config.get``
    :return: list of non-empty names, in the order they appear in ``raw``
    """
    return [name.strip() for name in raw.split(",") if name.strip()]


def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    Yield the content cell of every data row of a ';'-separated ticket CSV.

    The first row is treated as the header and is not yielded. If no header
    cell equals ``content_collumn_name``, column 0 is used.

    :param path2csv: string, path to the csv file
    :param content_collumn_name: string, exact header name of the text column
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default column

    for i, lst in enumerate(stream):
        if i == 0:
            # look for the desired column in the header row
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
                    break
        else:
            yield lst[content_collumn]


def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
    """
    Yield one metadata dict per data row of a ';'-separated ticket CSV.

    The first row is treated as the header. Each name in ``metalist`` is
    matched against the header cells after removing all non-letter characters
    from both sides. Every matched name is mapped to its own column, so the
    result does not depend on the order of the CSV columns. Names without a
    matching column are left out of the yielded dicts.

    :param path2csv: string, path to the csv file
    :param metalist: list of strings, metadata column names to extract
    :return: dict-generator, e.g. {'Subject': ..., 'categoryName': ..., 'Solution': ...}
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    column_of = {}  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}

    for i, lst in enumerate(stream):
        if i == 0:
            header = [_NON_LETTERS.sub('', col) for col in lst]
            for key in metalist:
                wanted = _NON_LETTERS.sub('', key)
                if wanted in header:
                    # first matching column wins
                    column_of[key] = header.index(wanted)
        else:
            yield {key: lst[index] for key, index in column_of.items()}


##################################################################################################

# Remote run:
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"

# Example values for the two ticket options:
#   content_collumn_name = "Description"
#   metaliste = [
#       "TicketNumber", "Subject", "CreatedDate", "categoryName", "Impact",
#       "Urgency", "BenutzerID", "VerantwortlicherID", "EigentuemerID", "Solution"
#   ]

content_collumn_name = config.get("tickets", "content_collumn_name")

# config.get returns a plain string; iterating it would yield single
# characters, so it is split into column names here.
# NOTE(review): assumes the option is a comma-separated list - confirm against config.ini.
metaliste = _parse_name_list(config.get("tickets", "metaliste"))

path2de_csv = config.get("de_corpus", "input")
corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")

path2en_csv = config.get("en_corpus", "input")
corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")


def main():
    """Create, fill, clean and save the raw German and English corpora."""
    start = time.time()
    printlog("Corporization: {0}".format(datetime.now()))

    # log the file names of both input CSVs
    printlog(path2de_csv.split("/")[-1])
    printlog(path2en_csv.split("/")[-1])

    DE_PARSER = spacy.load("de")
    EN_PARSER = spacy.load("en")

    raw_de_corpus = textacy.Corpus(DE_PARSER)
    raw_en_corpus = textacy.Corpus(EN_PARSER)

    # add the texts and their metadata to the textacy corpora;
    # both streams yield exactly one item per CSV data row
    printlog("Add texts to textacy-corpi")

    raw_de_corpus.add_texts(
        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2de_csv, metaliste)
    )

    raw_en_corpus.add_texts(
        ticketcsv_to_textStream(path2en_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2en_csv, metaliste)
    )

    # drop empty docs from the corpora
    raw_de_corpus.remove(lambda doc: len(doc) == 0)
    raw_en_corpus.remove(lambda doc: len(doc) == 0)

    # for i in range(20):
    #     printRandomDoc(raw_de_corpus)
    #     printRandomDoc(raw_en_corpus)

    # save corpora
    save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
    save_corpus(corpus=raw_en_corpus, corpus_path=corpus_en_path, corpus_name=raw_en_name)

    end = time.time()
    printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()