topicModelingTickets/corporization.py

201 lines
4.8 KiB
Python
Raw Normal View History

2017-10-11 17:16:04 +02:00
# -*- coding: utf-8 -*-
from datetime import datetime
2017-10-16 14:01:38 +02:00
import time
2017-10-11 17:16:04 +02:00
import logging
2017-10-16 14:01:38 +02:00
from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
2017-10-11 17:16:04 +02:00
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
2017-10-16 14:01:38 +02:00
import pickle
import configparser as ConfigParser
from miscellaneous import *
2017-10-11 17:16:04 +02:00
2017-10-16 14:01:38 +02:00
import time
2017-10-11 17:16:04 +02:00
2017-10-16 14:01:38 +02:00
from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
2017-10-11 17:16:04 +02:00
2017-10-16 14:01:38 +02:00
# load config
2017-10-11 17:16:04 +02:00
# Absolute, machine-specific location of the project's INI configuration file.
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
# The configuration is parsed at import time; open() raises if the file is missing.
with open(config_ini) as f:
    config.read_file(f)
def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    Lazily yield the text cell of every data row of a semicolon-separated CSV.

    The first row is treated as the header and is never yielded. The column
    whose header cell equals ``content_collumn_name`` exactly is used; if the
    name occurs several times the last occurrence wins, and if it does not
    occur at all the first column (index 0) is used.

    :param path2csv: path of the CSV file
    :param content_collumn_name: header name of the column holding the text
    :return: string-generator
    """
    rows = iter(textacy.fileio.read_csv(path2csv, delimiter=";"))

    header = next(rows, None)
    if header is None:
        # empty file: nothing to yield
        return

    # default to the first column when the requested name is not in the header
    text_index = 0
    for position, column_name in enumerate(header):
        if column_name == content_collumn_name:
            text_index = position

    for row in rows:
        yield row[text_index]
def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
    """
    Lazily yield one metadata dict per data row of a semicolon-separated CSV.

    The first row is treated as the header and is never yielded. Every name in
    ``metalist`` is compared with the header cells after all characters other
    than ASCII letters have been removed from both sides, so differences in
    digits, whitespace or punctuation do not prevent a match.

    Each requested name is bound to the index of *its own* matching column
    (the first one, if several match). The mapping therefore stays correct
    when the CSV column order differs from the order of ``metalist`` or when
    some requested names are absent from the header.

    :param path2csv: path of the CSV file
    :param metalist: list of column names to extract
    :return: dict-generator; each dict maps a requested name to the cell value
             of that row, e.g. {'Subject': ..., 'categoryName': ...}. Names
             without a matching header column are left out.
    """
    non_letters = re.compile('[^a-zA-Z]+')
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")

    column_of = {}  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}
    for i, lst in enumerate(stream):
        if i == 0:
            # resolve the requested names against the header exactly once
            normalized_header = [non_letters.sub('', col) for col in lst]
            for key in metalist:
                wanted = non_letters.sub('', key)
                if wanted in normalized_header:
                    column_of[key] = normalized_header.index(wanted)
        else:
            yield {key: lst[index] for key, index in column_of.items()}
2017-10-16 14:01:38 +02:00
##################################################################################################
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"
"""
content_collumn_name = "Description"
metaliste = [
"TicketNumber",
"Subject",
"CreatedDate",
"categoryName",
"Impact",
"Urgency",
"BenutzerID",
"VerantwortlicherID",
"EigentuemerID",
"Solution"
]
"""
2017-10-11 17:16:04 +02:00
2017-10-16 14:01:38 +02:00
# Header name of the CSV column that holds the ticket text.
content_collumn_name = config.get("tickets", "content_collumn_name")

# config.get() returns one plain string, but ticket_csv_to_DictStream() iterates
# over `metaliste` expecting one column name per element. Iterating the raw
# string would walk over single characters, so the option is parsed into a list.
# NOTE(review): assumes the option is a comma- or newline-separated list of
# column names (optionally wrapped in brackets/quotes) - confirm against config.ini.
metaliste = [
    entry.strip(" \t\r'\"")
    for entry in re.split(r"[,\n]+", config.get("tickets", "metaliste").strip(" \t\r\n[]"))
    if entry.strip(" \t\r'\"")
]

# German corpus: input CSV, output directory and name of the raw corpus.
path2de_csv = config.get("de_corpus", "input")
corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")

# English corpus: input CSV, output directory and name of the raw corpus.
path2en_csv = config.get("en_corpus", "input")
corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")
2017-10-11 17:16:04 +02:00
def main():
    """
    Build the raw German and English textacy corpora from the ticket CSVs and save them.

    Steps, in order: log a start timestamp and the file names of both input
    CSVs, load the spaCy models "de" and "en", create one textacy corpus per
    language, fill each corpus with the ticket texts and their metadata dicts,
    remove documents of length 0, save both corpora via ``save_corpus`` and
    log the elapsed time in minutes.
    """
    t_start = time.time()
    printlog("Corporization: {0}".format(datetime.now()))

    # log only the file name (last path component) of each input CSV
    for csv_path in (path2de_csv, path2en_csv):
        printlog(csv_path.split("/")[-1])

    parser_de = spacy.load("de")
    parser_en = spacy.load("en")

    corpus_de = textacy.Corpus(parser_de)
    corpus_en = textacy.Corpus(parser_en)

    # (corpus, input csv) pairs, German first, then English
    corpus_inputs = (
        (corpus_de, path2de_csv),
        (corpus_en, path2en_csv),
    )

    # add the ticket texts together with their metadata to the textacy corpora
    printlog("Add texts to textacy-corpi")
    for corpus, csv_path in corpus_inputs:
        corpus.add_texts(
            ticketcsv_to_textStream(csv_path, content_collumn_name),
            ticket_csv_to_DictStream(csv_path, metaliste),
        )

    # drop empty documents from both corpora
    for corpus, _ in corpus_inputs:
        corpus.remove(lambda doc: len(doc) == 0)

    # save corpora
    save_corpus(corpus=corpus_de, corpus_path=corpus_de_path, corpus_name=raw_de_name)
    save_corpus(corpus=corpus_en, corpus_path=corpus_en_path, corpus_name=raw_en_name)

    t_end = time.time()
    printlog("Time Elapsed Corporization:{0} min".format((t_end - t_start) / 60))
# Run the corporization only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()