topicModelingTickets/corporization.py

153 lines
3.9 KiB
Python
Raw Normal View History

2017-10-11 17:16:04 +02:00
# -*- coding: utf-8 -*-
import csv
import sys
2017-10-16 14:01:38 +02:00
from miscellaneous import *
import time
from datetime import datetime
import re
import textacy
2017-10-18 17:37:20 +02:00
from textacy.preprocess import normalize_whitespace
2017-10-16 14:01:38 +02:00
from scipy import *
2017-10-17 10:13:49 +02:00
import os
2017-10-11 17:16:04 +02:00
2017-10-17 10:13:49 +02:00
# Raise the csv module's maximum field size to sys.maxsize.
csv.field_size_limit(sys.maxsize)

# Directory that contains this script, with a trailing "/" so that
# relative paths can be appended by plain string concatenation.
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# Load config.ini, which is expected next to this script.
config_ini = FILEPATH + "config.ini"

# NOTE(review): ConfigParser is not imported explicitly in this file;
# presumably it is provided by `from miscellaneous import *` -- confirm.
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
2017-12-08 11:06:07 +01:00
def ticketcsv_to_textStream(path2csv, content_collumn_name):
    """
    Yield the content cell of every data row of a ";"-delimited CSV file.

    The first row is treated as the header and is never yielded. The
    column whose header equals ``content_collumn_name`` is used; if the
    name occurs more than once the last occurrence wins, and if it does
    not occur at all column 0 is used.

    :param path2csv: string
    :param content_collumn_name: string

    :return: string-generator
    """
    rows = iter(textacy.fileio.read_csv(path2csv, delimiter=";"))  # ,encoding='utf8')

    header = next(rows, None)
    if header is None:
        # No rows at all: nothing to yield.
        return

    text_index = 0  # default value when the header has no matching column
    for position, name in enumerate(header):
        if name == content_collumn_name:
            text_index = position

    for row in rows:
        yield row[text_index]
2017-12-08 11:06:07 +01:00
def ticket_csv_to_DictStream(path2csv, content_collumn_name):
    """
    Yield one dict per data row of a ";"-delimited CSV file.

    The first row is treated as the header; every following row is
    yielded as ``{column name: cell value}`` covering all header columns.

    :param path2csv: string
    :param content_collumn_name: string (not used by this function)

    :return: dict-generator
    """
    rows = iter(textacy.fileio.read_csv(path2csv, delimiter=";"))  # ,encoding='utf8')

    header = next(rows, None)
    if header is None:
        # No rows at all: nothing to yield.
        return

    # Map column name -> column position,
    # e.g. {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}
    column_positions = {}
    for position, name in enumerate(header):
        # Fix up the .csv header: any column containing "icketNumb" is
        # renamed to "TicketNumber".
        # TODO: if this has to be hard-coded here anyway, drop config.ini too?
        if "icketNumb" in name:
            name = "TicketNumber"
        column_positions[str(name)] = position

    for row in rows:
        yield {key: row[position] for key, position in column_positions.items()}
2017-10-16 14:01:38 +02:00
##################################################################################################
2017-10-11 17:16:04 +02:00
2017-10-16 14:01:38 +02:00
# Header name of the CSV column whose cells become the document texts.
content_collumn_name = config.get("tickets", "content_collumn_name")

# Input CSV and corpus location from the "de_corpus" section, both taken
# relative to the directory of this script.
path2de_csv = FILEPATH + config.get("de_corpus", "input")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
2017-10-11 17:16:04 +02:00
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0):  # TODO: this could be the main entry point
    """
    Use textacy to create a Corpus out of the ticket CSV and save it.

    Documents of length 0 are removed before the corpus is saved under
    the name ``lang + "_raw"``.

    :param path2_csv: str
    :param corpus_path: str, handed to save_corpus as corpus_path
    :param content_collumn_name: str, the column which is used as the docs' text
    :param lang: str, handed to textacy.Corpus; also prefixes the saved corpus name
    :param printrandom: currently not used by this function
    :return: textacy.Corpus
    """
    # Log only the file name, i.e. the text after the last "/".
    filename = path2_csv.rsplit("/", 1)[-1]
    logprint("Corporization of {0}".format(filename))  # , datetime.now()))

    raw_corpus = textacy.Corpus(lang)

    # Add the texts and their metadata to the textacy corpus.
    # TODO: clean here, don't forget the dict
    texts = ticketcsv_to_textStream(path2_csv, content_collumn_name)
    metadatas = ticket_csv_to_DictStream(path2_csv, content_collumn_name)
    raw_corpus.add_texts(texts, metadatas)

    # Drop empty docs from the corpus.
    def is_empty(doc):
        return len(doc) == 0

    raw_corpus.remove(is_empty)

    logprint("corpus-length: {}".format(len(raw_corpus)))

    # save corpus
    save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=lang + "_raw")

    return raw_corpus
2017-10-17 10:13:49 +02:00
def main():
    """
    Build the lang="de" raw corpus from the configured CSV, log the
    elapsed time in minutes, and return the corpus.

    :return: the corpus returned by ticketcsv2Corpus
    """
    started = time.time()

    corpus = ticketcsv2Corpus(path2de_csv, corpus_de_path, content_collumn_name, lang="de")

    elapsed_minutes = (time.time() - started) / 60
    logprint("Time Elapsed Corporization:{0} min".format(elapsed_minutes))

    return corpus
2017-10-11 17:16:04 +02:00
# Run the corporization only when this file is executed as a script.
if __name__ == "__main__":
    main()