# -*- coding: utf-8 -*-
import csv
import logging
import re
import sys
import time
from datetime import datetime
from typing import List

import spacy
import textacy

csv.field_size_limit(sys.maxsize)

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"

# smaller exports, kept for switching between test runs:
# path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
# path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
# path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"

content_collumn_name = "Description"

metaliste = [
    "TicketNumber",
    "Subject",
    "CreatedDate",
    "categoryName",
    "Impact",
    "Urgency",
    "BenutzerID",
    "VerantwortlicherID",
    "EigentuemerID",
    "Solution"
]

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_corpus"

logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"

# todo: configuration file?
"""
import configparser

config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = configparser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""

# configure logging
logging.basicConfig(filename=logfile, level=logging.INFO)
# logging.basicConfig(filename=config.get("filepath", "logfile"), level=logging.INFO)


def printlog(string, level="INFO"):
    """Print and log simultaneously."""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def printRandomDoc(textacyCorpus):
    import random
    print()
    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    # randrange covers the full corpus; the old int((n - 1) * random()) could never pick the last doc
    randIndex = random.randrange(len(textacyCorpus))
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(
        randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()


def csv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default: first column
    for i, lst in enumerate(stream):
        if i == 0:
            # look for the desired column in the header row
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]


def csv_to_DictStream(path2csv: str, metalist: List[str]):
    """
    :param path2csv: string
    :param metalist: list of strings
    :return: dict-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    metadata_temp = {}
    for i, lst in enumerate(stream):
        if i == 0:
            # Map each metadata key to its column index; comparing only the letters
            # makes the match robust against stray characters in the header.
            # Surely there are more efficient ways, but this runs only once per file.
            for j, col in enumerate(lst):
                for key in metalist:
                    if re.sub('[^a-zA-Z]+', '', key) == re.sub('[^a-zA-Z]+', '', col):
                        metadata_temp[key] = j
            # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}
            # (assigning per key fixes the old zip(metalist, metaindices), which
            # silently mispaired keys when the CSV column order differed from metaliste)
        else:
            metadata = metadata_temp.copy()
            for key, value in metadata.items():
                metadata[key] = lst[value]
            yield metadata


def save_corpus(corpus, corpus_path, corpus_name, parser):
    """
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)
    # todo: save vocab?
    """
    # save parser
    parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)


##################################################################################################


def cleanTextstream(textstream):
    """
    :param textstream: string-gen
    :yield: string-gen
    """
    for txt in textstream:
        yield textacy.preprocess.normalize_whitespace(txt)


def cleanDictstream(dictstream):
    """
    :param dictstream: dict-gen
    :yield: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():
            result[key] = textacy.preprocess.normalize_whitespace(value)
        yield result
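

# Hedged sketch, not called by main() below: the read-side inverse of save_corpus().
# It assumes the read counterparts of the textacy.fileio writers used above
# (read_spacy_docs, read_json_lines) and that textacy.Doc accepts an already-parsed
# spacy doc plus metadata -- verify both against the installed textacy version.
def load_corpus(corpus_path, corpus_name, parser):
    corpus = textacy.Corpus(parser)
    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"
    # both streams were written in corpus order, so doc i belongs to metadata line i
    spacy_docs = textacy.fileio.read_spacy_docs(parser.vocab, contentpath)
    metadata_stream = textacy.fileio.read_json_lines(metapath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(textacy.Doc(spacy_doc, lang=parser.lang, metadata=metadata))
    return corpus
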
def main():
    printlog("Corporization: {0}".format(datetime.now()))

    path_csv_split = path2de_csv.split("/")
    printlog(path_csv_split[-1])
    path_csv_split = path2en_csv.split("/")
    printlog(path_csv_split[-1])

    start = time.time()

    DE_PARSER = spacy.load("de")
    EN_PARSER = spacy.load("en")

    de_corpus = textacy.Corpus(DE_PARSER)
    en_corpus = textacy.Corpus(EN_PARSER)

    # add texts to textacy-corpus
    printlog("Add texts to textacy-corpus")
    de_corpus.add_texts(
        cleanTextstream(csv_to_textStream(path2de_csv, content_collumn_name)),
        cleanDictstream(csv_to_DictStream(path2de_csv, metaliste))
    )

    # kick empty docs out of the corpus
    de_corpus.remove(lambda doc: len(doc) == 0)

    for i in range(20):
        printRandomDoc(de_corpus)

    # save corpus
    save_corpus(corpus=de_corpus, corpus_path=corpus_path, corpus_name=corpus_name, parser=DE_PARSER)

    # todo: do the same for en_corpus (see the commented sketch at the end of this file)

    end = time.time()
    printlog("Time Elapsed Corporization: {0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()
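

# Sketch for the todo in main(), assuming en_tickets.csv shares the German export's
# column layout ("Description" as content column, same metaliste headers); the
# corpus name "en_raw_corpus" is a hypothetical choice, not taken from the source.
# The block would go inside main(), after the German corpus has been saved:
#
#     en_corpus.add_texts(
#         cleanTextstream(csv_to_textStream(path2en_csv, content_collumn_name)),
#         cleanDictstream(csv_to_DictStream(path2en_csv, metaliste))
#     )
#     en_corpus.remove(lambda doc: len(doc) == 0)
#     save_corpus(corpus=en_corpus, corpus_path=corpus_path,
#                 corpus_name="en_raw_corpus", parser=EN_PARSER)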