# -*- coding: utf-8 -*-
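"""
corporization.py

Reads the M42 ticket CSV exports, wraps each ticket's text ("Description")
and selected metadata columns into a textacy corpus, and serializes the
corpus (parser, contents, metadata) to disk for later topic-modeling steps.
"""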

import csv
import logging
import re
import sys
import time
from datetime import datetime

import spacy
import textacy

# raise the limit so very long ticket descriptions survive CSV parsing
csv.field_size_limit(sys.maxsize)


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"

# input CSV exports; the last uncommented assignment wins
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"

# name of the CSV column that holds the ticket text
content_collumn_name = "Description"

# metadata columns to keep for every ticket
metaliste = [
    "TicketNumber",
    "Subject",
    "CreatedDate",
    "categoryName",
    "Impact",
    "Urgency",
    "BenutzerID",
    "VerantwortlicherID",
    "EigentuemerID",
    "Solution"
]

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_corpus"

logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"

# todo: move these paths into a configuration file?
"""
import configparser

config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = configparser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""

# configure logging
logging.basicConfig(filename=logfile, level=logging.INFO)
# once the config file exists:
# logging.basicConfig(filename=config.get("filepath", "logfile"), level=logging.INFO)


def printlog(string, level="INFO"):
    """Print to stdout and write to the logfile."""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def printRandomDoc(textacyCorpus):
    """Print one randomly chosen document (text and metadata) from the corpus."""
    import random
    print()

    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    # random.randint is inclusive on both ends, so the last doc can be drawn too
    randIndex = random.randint(0, len(textacyCorpus) - 1)
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))
    print()


def csv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    Yield the content column of every CSV row (the header row is skipped).

    :param path2csv: path to the CSV file
    :param content_collumn_name: name of the column that holds the text
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default value

    for i, lst in enumerate(stream):
        if i == 0:
            # look for the desired column in the header row
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]


def csv_to_DictStream(path2csv: str, metalist: [str]):
    """
    Yield a metadata dict for every CSV row (the header row is skipped).

    :param path2csv: path to the CSV file
    :param metalist: list of metadata column names
    :return: dict-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')

    metadata_temp = {}
    for i, lst in enumerate(stream):
        if i == 0:
            # Map each metadata field to its column index; comparing only the
            # letters tolerates stray characters in the header. Building the
            # dict directly (rather than zipping two lists) keeps keys and
            # indices aligned even if a field is missing from the header.
            # Could surely be done more efficiently, but it only runs once.
            for j, col in enumerate(lst):
                for key in metalist:
                    if re.sub('[^a-zA-Z]+', '', key) == re.sub('[^a-zA-Z]+', '', col):
                        metadata_temp[key] = j  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}
        else:
            metadata = metadata_temp.copy()
            for key, value in metadata.items():
                metadata[key] = lst[value]
            yield metadata


def save_corpus(corpus, corpus_path, corpus_name, parser):
    """
    Serialize the parser, the documents' contents and their metadata
    to corpus_path.

    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    # todo save vocab?
    """

    # save parser
    parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
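

# Counterpart sketch for re-loading a saved corpus. This is an untested
# assumption: it presumes this textacy version offers fileio.read_spacy_docs /
# fileio.read_json_lines as mirrors of the write_* calls above, and that the
# passed parser's vocab matches the one used at save time (see the stringstore
# todo in save_corpus).
def load_corpus(corpus_path, corpus_name, parser):
    corpus = textacy.Corpus(parser)
    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"
    for spacy_doc, meta in zip(textacy.fileio.read_spacy_docs(parser.vocab, contentpath),
                               textacy.fileio.read_json_lines(metapath)):
        corpus.add_doc(textacy.Doc(spacy_doc, metadata=meta))
    return corpus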


##################################################################################################


def cleanTextstream(textstream):
    """
    Normalize the whitespace of every text in a string-generator.

    :param textstream: string-generator
    :yield: string-generator
    """
    for txt in textstream:
        yield textacy.preprocess.normalize_whitespace(txt)


def cleanDictstream(dictstream):
    """
    Normalize the whitespace of every metadata value in a dict-generator.

    :param dictstream: dict-generator
    :yield: dict-generator
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():
            result[key] = textacy.preprocess.normalize_whitespace(value)
        yield result
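

# If heavier cleaning is ever needed, textacy's preprocessing helpers could be
# chained in the same generator style. A minimal sketch, assuming the
# preprocess_text flags below exist in the installed textacy version
# (verify before enabling):
def cleanTextstream_extended(textstream):
    for txt in textstream:
        yield textacy.preprocess.preprocess_text(txt, fix_unicode=True,
                                                 no_urls=True, no_emails=True)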


def main():
    printlog("Corporization: {0}".format(datetime.now()))

    # log which export files are being processed
    printlog(path2de_csv.split("/")[-1])
    printlog(path2en_csv.split("/")[-1])

    start = time.time()

    # both language models have to be installed for spacy.load to succeed
    DE_PARSER = spacy.load("de")
    EN_PARSER = spacy.load("en")

    de_corpus = textacy.Corpus(DE_PARSER)
    en_corpus = textacy.Corpus(EN_PARSER)

    # add texts to the textacy corpus; both generators read the same CSV,
    # so each text stays aligned with its metadata row by row
    printlog("Add texts to textacy-corpus")

    de_corpus.add_texts(
        cleanTextstream(csv_to_textStream(path2de_csv, content_collumn_name)),
        cleanDictstream(csv_to_DictStream(path2de_csv, metaliste))
    )

    # kick empty docs out of the corpus
    de_corpus.remove(lambda doc: len(doc) == 0)

    # spot-check a few random documents
    for i in range(20):
        printRandomDoc(de_corpus)

    # save corpus
    save_corpus(corpus=de_corpus, corpus_path=corpus_path, corpus_name=corpus_name, parser=DE_PARSER)

    # todo: do the same for en_corpus
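    # Sketch of that todo, mirroring the German pipeline above; it assumes the
    # English export uses the same column layout, and "en_raw_corpus" is a
    # hypothetical corpus name, so it stays disabled until verified:
    # en_corpus.add_texts(
    #     cleanTextstream(csv_to_textStream(path2en_csv, content_collumn_name)),
    #     cleanDictstream(csv_to_DictStream(path2en_csv, metaliste))
    # )
    # en_corpus.remove(lambda doc: len(doc) == 0)
    # save_corpus(corpus=en_corpus, corpus_path=corpus_path,
    #             corpus_name="en_raw_corpus", parser=EN_PARSER)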

    end = time.time()
    printlog("Time Elapsed Corporization: {0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()