# -*- coding: utf-8 -*-

import matplotlib
matplotlib.use('Agg')

from datetime import datetime

import draw
import draw1

import time
import numpy as np
import operator
import re  # used below to parse the JGibbsLLDA output

import csv
import sys

import json
import os.path
import subprocess

from textacy import Vectorizer, viz

from miscellaneous import *

import textacy
from scipy import *

import os


csv.field_size_limit(sys.maxsize)

FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
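
# The config.ini is expected to provide at least the corpus path read in __main__ below
# (a minimal sketch; only the [de_corpus] section and its "path" option are used here,
# and the value shown is an assumed example):
#
#   [de_corpus]
#   path = corpi/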

def textacyTopicModeling(corpus,
                         n_topics=15, top_topic_words=7, top_document_labels_per_topic=5,
                         ngrams=1, min_df=1, max_df=0.9,
                         topicModel='lda'):

    n_terms = int(n_topics * top_topic_words)
    sort_terms_by = 'seriation'  # 'seriation', 'weight', 'index', 'alphabetical'
    rank_terms_by = 'corpus'  # 'corpus', 'topic'

    logprint("#### Topic Modeling {0}".format(topicModel))
    logprint(str("ngrams: {0}".format(ngrams)))
    logprint(str("min_df: {0}".format(min_df)))
    logprint(str("max_df: {0}".format(max_df)))
    logprint(str("n_topics: {0}".format(n_topics)))
    logprint("\n")

    start = time.time()

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')

    ###### vectorize corpus

    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    #id2term = vectorizer.__getattribute__("id_to_term")

    ####### Initialize and train a topic model

    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)

    # log the top terms per topic
    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words, weights=True):
        logprint('{0}: {1}'.format(topic_idx, str(top_terms)))

    # log the categories of the most representative documents per topic
    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
        logprint(topic_idx)
        for j in top_docs:
            logprint(corpus[j].metadata['categoryName'])

    ####################### termite plot ###################################################################
    grams_label = "uni" if ngrams == 1 else "bi"

    draw1.termite_plot(model, doc_term_matrix, vectorizer.id_to_term,
                       n_terms=n_terms,
                       sort_terms_by=sort_terms_by,
                       rank_terms_by=rank_terms_by + '_weight',
                       save=FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label, topicModel, n_topics,
                                                                              n_terms, sort_terms_by, rank_terms_by))

    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}: {0} min\n\n".format((end - start) / 60, topicModel))


def textacyTopicModeling_v2(doc_term_matrix, id_to_term,
                            n_topics=15, top_topic_words=3,
                            topicModel='lda'):

    n_terms = int(n_topics * top_topic_words)
    sort_terms_by = 'seriation'  # 'seriation', 'weight', 'index', 'alphabetical'
    rank_terms_by = 'corpus'  # 'corpus', 'topic'

    logprint("#### Topic Modeling {0}".format(topicModel))
    logprint(str("n_topics: {0}".format(n_topics)))
    logprint("\n")

    start = time.time()

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')

    ####### Initialize and train a topic model
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)

    for topic_idx, top_terms in model.top_topic_terms(id_to_term, top_n=top_topic_words, weights=True):
        logprint('{0}: {1}'.format(topic_idx, str(top_terms)))

    ####################### termite plot ###################################################################
    draw1.termite_plot(model, doc_term_matrix, id_to_term,
                       n_terms=n_terms,
                       sort_terms_by=sort_terms_by,
                       rank_terms_by=rank_terms_by + '_weight',
                       save=FILEPATH + "results/{}.png".format(topicModel))

    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}: {0} min\n\n".format((end - start) / 60, topicModel))


def create_ticket2label_dict(ticket2chunk_dict, corpus):
    """
    Creates a dictionary that maps a TicketNumber to a label string.

    :param ticket2chunk_dict: e.g. {TicketNumber: KB_entries}
    :return: {TicketNumber: label}
    """
    labelist = ticket2chunk_dict.values()
    labelist = flatten(labelist)

    labeldict = create_labeldict(labelist, min_label_freq=1, add_default_label=True)

    ticket2label = {}
    for doc in corpus:
        ticketID = doc.metadata["TicketNumber"]

        keywords = ticket2chunk_dict.get(ticketID, ['DEFAULT'])

        label = ""
        for kw in keywords:
            label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " "

        ticket2label.update({ticketID: label})

    return ticket2label
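
# Sketch of the mapping produced above (hypothetical values; each number is the label index
# assigned by create_labeldict, concatenated into a space-separated string):
#   {'INC44526': '0 ', 'INC67205': '0 3 ', ...}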


def create_labeldict(labelist, min_label_freq=1, add_default_label=True):

    # keep only labels that occur at least min_label_freq times
    labelist = [l for l in labelist if labelist.count(l) >= min_label_freq]

    in_labelist_ = {k: labelist.count(k) for k in labelist}      # {label1: 3, label2: 5, label3: 1}
    labelist = sort_dictionary(in_labelist_)                     # [(label3, 1), (label1, 3), (label2, 5)]
    labelist.reverse()                                           # [(label2, 5), (label1, 3), (label3, 1)]
    labeldict = {elem[0]: i for i, elem in enumerate(labelist)}  # {label2: 0, label1: 1, label3: 2}

    if add_default_label:
        if 'DEFAULT' not in labeldict.keys():
            labeldict.update({'DEFAULT': len(labelist)})  # {label2: 0, label1: 1, label3: 2, DEFAULT: 3}
    return labeldict
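
# Minimal usage sketch for create_labeldict (hypothetical labels; assumes sort_dictionary
# sorts by count ascending, as the inline comments above indicate):
#   create_labeldict(['a', 'b', 'b', 'c', 'b', 'a'])
#   -> {'b': 0, 'a': 1, 'c': 2, 'DEFAULT': 3}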

#todo
def jgibbsLLDAv2(labeled_lines_path, ticket2kbs_dict, cleaned_corpus, path2save_results, top_topic_words=7):

    ticket2label_dict = create_ticket2label_dict(ticket2kbs_dict, cleaned_corpus)

    # reduce ticket2label_dict
    labeldict = {}
    label_list = list(set(ticket2label_dict.values()))
    lbl_dict = {elem: i for i, elem in enumerate(label_list)}
    labeldict = {k: lbl_dict[v] for k, v in ticket2label_dict.items()}
    labeldict.update({"DEFAULT": len(labeldict)})

    def gen_lines_from_labeled_lines(input, ticket2label_dict):
        line_gen = textacy.fileio.read_file_lines(input)

        for line in line_gen:
            label = re.findall(r'\[(.*?)\]', line)

            new_label = "[ "
            for lbl in label:
                new_label = new_label + str(ticket2label_dict.get(str(lbl), "")).strip() + " "
            new_label = new_label + "] "

            result = new_label + str(line.rpartition("]")[2])
            # new_label = str([ticket2label_dict.get(str(lbl),"") for lbl in label])
            # result = "[ " + new_label + " ] " + line.rpartition("]")[2]
            #print(result)
            yield result

    labeldict_rev = {v: k for k, v in labeldict.items()}

    #line_gen = gen_lines_from_labeled_lines(labeled_lines_path, ticket2label_dict)
    line_gen = gen_lines_from_labeled_lines(labeled_lines_path, labeldict)

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

    textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)

    # wait for the file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    # run the JGibbsLLDA jar
    n_topics = len(labeldict)  #+1 #default-topic

    FNULL = open(os.devnull, 'w')  # suppress output
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
                       "tickets.gz",
                       "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
    subprocess.call(cmd_jgibbs_java, stdout=FNULL)

    # NOTE: the output files are hidden; they can be found in models/
    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")

    topic_regex = re.compile(r'Topic [0-9]*')

    #####################################
    # todo save results to a file derived from `results`
    result = []

    for line in output.splitlines():
        findall = topic_regex.findall(line)
        if len(findall) != 0:
            try:
                index = int(findall[0].split()[1])
                result.append("Topic {} {}:".format(index, str(ticket2kbs_dict[labeldict_rev[index]])))
            except:
                result.append(line)
        else:
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results + ".txt")
    #####################################

    results = []
    res_dict = {}
    count = 0
    for line in output.splitlines():
        findall = topic_regex.findall(line)

        if len(findall) != 0:
            if len(res_dict) != 0:
                results.append(res_dict)  # append the previous topic dict (it is complete at this point)

            index = int(findall[0].split()[1])
            res_dict = {index: str(labeldict_rev[index])}
        else:
            splitted = line.split()
            res_dict[splitted[0]] = float(splitted[1])

    if len(res_dict) != 0:
        results.append(res_dict)  # append the last topic dict

    # collect every term in the results into a list
    terms = []
    for res in results:
        for key, value in res.items():
            if not isinstance(key, int) and key not in terms:
                terms.append(key)

    term2id = {t: i for i, t in enumerate(terms)}  # and into a dict

    ################# termite plot #####################################################################
    topic_labels = list(range(len(labeldict)))
    term_labels = list(range(len(term2id)))  # tuple([key for key in term2id.keys()])

    term_topic_weights = np.zeros((len(term2id), len(topic_labels)))

    for i, res in enumerate(results):
        for key, value in res.items():
            if not isinstance(key, int):
                term_topic_weights[term2id[key]][i] = value
                term_labels[term2id[key]] = key
            else:
                topic_labels[i] = labeldict_rev[key]

    draw.draw_termite(
        term_topic_weights, topic_labels, term_labels, save=path2save_results + ".png")

    draw.draw_termite(
        term_topic_weights, topic_labels, term_labels, save=path2save_results + "_spaced.png", pow_x=0.78, pow_y=0.87)

    # save labeldict
    labeldict_path = path2save_results + "_labeldict.json"
    with open(labeldict_path, 'w') as file:
        file.write(json.dumps(labeldict))


def jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=7):

    #labeldict = {k : labelist.count(k) for k in labelist}
    #max=0
    #for v in labeldict.values():
    #    max = v if v > max else max
    #labelist = sort_dictionary(labeldict)
    #labeldict.update({'DEFAULT' : max+1})

    labeldict_rev = {v: k for k, v in labeldict.items()}

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

    textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)

    # wait for the file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    # run the JGibbsLLDA jar
    n_topics = len(labeldict)  #+1 #default-topic

    FNULL = open(os.devnull, 'w')  # suppress output
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
                       "tickets.gz",
                       "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
    subprocess.call(cmd_jgibbs_java, stdout=FNULL)

    # NOTE: the output files are hidden; they can be found in models/
    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")

    topic_regex = re.compile(r'Topic [0-9]*')
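
    # The decompressed .twords output is parsed below via topic_regex. It is assumed to look
    # roughly like this (a sketch inferred from the parsing code, not from the JGibbsLLDA
    # documentation; the terms and weights are placeholders):
    #
    #   Topic 0th:
    #       term_a   0.0421
    #       term_b   0.0398
    #   Topic 1th:
    #       ...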

    #####################################
    # todo save results to a file derived from `results`

    result = []

    for line in output.splitlines():
        findall = topic_regex.findall(line)

        if len(findall) != 0:
            try:
                index = int(findall[0].split()[1])
                result.append("Topic {} {}:".format(index, labeldict_rev[index]))
            except:
                result.append(line)
        else:
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results + ".txt")
    #####################################

    results = []
    res_dict = {}
    count = 0
    for line in output.splitlines():
        findall = topic_regex.findall(line)

        if len(findall) != 0:
            if len(res_dict) != 0:
                results.append(res_dict)  # append the previous topic dict (it is complete at this point)

            index = int(findall[0].split()[1])
            res_dict = {index: str(labeldict_rev[index])}
        else:
            splitted = line.split()
            res_dict[splitted[0]] = float(splitted[1])

    if len(res_dict) != 0:
        results.append(res_dict)  # append the last topic dict

    # collect every term in the results into a list
    terms = []
    for res in results:
        for key, value in res.items():
            if not isinstance(key, int) and key not in terms:
                terms.append(key)

    term2id = {t: i for i, t in enumerate(terms)}  # and into a dict

    ################# termite plot #####################################################################
    topic_labels = list(range(len(labeldict)))
    term_labels = list(range(len(term2id)))  # tuple([key for key in term2id.keys()])

    term_topic_weights = np.zeros((len(term2id), len(topic_labels)))

    for i, res in enumerate(results):
        for key, value in res.items():
            if not isinstance(key, int):
                term_topic_weights[term2id[key]][i] = value
                term_labels[term2id[key]] = key
            else:
                topic_labels[i] = labeldict_rev[key]

    draw.draw_termite(
        term_topic_weights, topic_labels, term_labels, save=path2save_results + ".png")

    draw.draw_termite(
        term_topic_weights, topic_labels, term_labels, save=path2save_results + "_spaced.png", pow_x=0.78, pow_y=0.87)

    # save labeldict
    labeldict_path = path2save_results + "_labeldict.json"
    with open(labeldict_path, 'w') as file:
        file.write(json.dumps(labeldict))
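
    # The saved labeldict can be read back later, e.g. (sketch):
    #   with open(path2save_results + "_labeldict.json") as f:
    #       labeldict = json.load(f)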


def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):

    start = time.time()
    logprint("")
    logprint("start Category-LLDA:")

    # labeldict ############################################################################################
    # build dictionary of ticket categories
    labelist = []

    for doc in corpus:
        category = normalize_str(doc.metadata["categoryName"])
        labelist.append(category)

    x = 50  # question: use only labels that occur more than x times, drop the rest?
    labelist = [l for l in labelist if labelist.count(l) > x]

    in_labelist_ = {k: labelist.count(k) for k in labelist}
    labelist = sort_dictionary(in_labelist_)
    labelist.reverse()
    labeldict = {elem[0]: i for i, elem in enumerate(labelist)}

    #for elem in labelist:
    #    l = elem[0]
    #    c = elem[1]
    #labeldict = {elem[0] : len(labelist)-(i+1) for i, elem in enumerate(labelist)}
    #labelist = list(set(labelist))
    #labeldict = {k: v for v, k in enumerate(labelist)}

    labeldict.update({'DEFAULT': len(labelist)})

    ##############################################################################################
    def gen_cat_lines(textacyCorpus, labeldict):
        """ generates  [topic1, topic2, ...] tok1 tok2 tok3  lines out of the corpus """

        for doc in textacyCorpus:
            label = labeldict.get(normalize_str(doc.metadata["categoryName"]), labeldict['DEFAULT'])

            # only yield documents that got a real category label, not the DEFAULT one
            if label != labeldict['DEFAULT']:
                yield "[" + str(label) + "] " + doc.text

    line_gen = gen_cat_lines(corpus, labeldict)

    path2save_results = path2save_results + "_kb_cat_llda_{}".format("top" + str(top_topic_words))

    jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words)

    end = time.time()
    logprint("\n\n\nTime Elapsed Category-LLDA: {0} min\n\n".format((end - start) / 60))


@deprecated
def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=False):
    """ ticket_ID -> KB_ID -> keywords / subject -> llda """

    start = time.time()
    logprint("")
    logprint("start {}-LLDA:".format("Keyword" if kb_keywords else "Subject"))

    # labeldict ############################################################################################
    # ticket2kb_dict
    kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")

    ticket2kb_dict = {}

    for line in kb2ticket_gen:
        ticket_id = line[0]
        kb_id = line[1]

        ticket2kb_dict[ticket_id] = kb_id
    # {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', 'INC43025': 'KBA10056', ...}

    kb_entries_used = len(list(set(ticket2kb_dict.values())))
    print("kb_entries_used: {}".format(kb_entries_used))

    # kb2keywords_dict
    kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
    next(kb2keywords_gen, None)  # skip first line ("ArticleID";"Subject";"Keywords";...)

    kb2keywords_dict = {}

    for line in kb2keywords_gen:
        kb_id = line[0]
        subject = line[1]
        keywords = line[2]
        keywords_list = [normalize_str(x) for x in str(keywords).split(",")]

        if kb_id not in kb2keywords_dict.keys():
            kb2keywords_dict[kb_id] = []

        if kb_keywords:
            for item in keywords_list:
                if item != "":
                    kb2keywords_dict[kb_id].append(item)
        else:
            kb2keywords_dict[kb_id].append(subject)

    # remove all empty items
    kb2keywords_dict = {k: v for k, v in kb2keywords_dict.items() if len(v) != 0}
    # {'KBA10091': ['citavi'], 'KBA10249': ['"beschaedigte unicard"', 'risse', '"defekte karte"'], ...}

    # keywords2kb_dict
    keywords2kb_dict = {}
    for kb_id, lst in kb2keywords_dict.items():
        for l in lst:
            if l not in keywords2kb_dict.keys():
                keywords2kb_dict[l] = [kb_id]
            else:
                keywords2kb_dict[l].append(kb_id)
    # {'unicard namensaenderung': ['KBA10276'], 'vpn': ['KBA10063'], 'outlook_exchange': ['KBA10181'], ...}

    # look for keywords that are actually used
    used_keywords = []
    for doc in corpus:
        ticket_number = doc.metadata["TicketNumber"]
        kb_id = ticket2kb_dict.get(ticket_number, None)
        keywords = kb2keywords_dict.get(kb_id, None)

        if keywords and kb_id:
            used_keywords.append(list(map(normalize_str, keywords)))

    labelist = [item for sublist in used_keywords for item in sublist]  # flatten list
    labelist = list(set(labelist))
    print("len(labelist): {}".format(len(labelist)))

    labeldict = {k: v for v, k in enumerate(labelist)}
    ##############################################################################################

    def gen_KB_lines(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict):

        for doc in corpus:
            ticket_number = doc.metadata["TicketNumber"]

            kb_number = ticket2kb_dict.get(ticket_number, None)
            keywords = kb2keywords_dict.get(kb_number, None)

            if keywords:
                label = ""
                for kw in keywords:
                    label = label + str(labeldict.get(normalize_str(str(kw)), len(labeldict))) + " "

                yield "[" + label + "] " + doc.text

    line_gen = gen_KB_lines(corpus, labeldict, ticket2kb_dict, kb2keywords_dict)

    path2save_results = path2save_results + "_kb_{}_llda_{}".format("keys" if kb_keywords else "subs",
                                                                    "top" + str(top_topic_words))

    jgibbsLLDA(labeldict, line_gen, path2save_results, top_topic_words=top_topic_words)

    end = time.time()
    logprint("\n\n\nTime Elapsed {1}-LLDA: {0} min\n\n".format((end - start) / 60, "Keyword" if kb_keywords else "Subject"))


def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words=7):

    start = time.time()
    logprint("")
    logprint("start LLDA:")

    # labeldict ############################################################################################

    # kb2keywords_dict / kb2subjects_dict --> {str : [str]}
    kb2keywords_dict = {}
    kb2subjects_dict = {}

    kb_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
    next(kb_gen, None)  # skip first line "ArticleID";"Subject";"Keywords";...

    for line in kb_gen:
        kb_id = line[0]
        subject = normalize_str(line[1])
        keywords = [normalize_str(x) for x in str(line[2]).split(",")]

        if kb_id not in kb2keywords_dict.keys():
            kb2keywords_dict[kb_id] = keywords if keywords != [''] else ["DEFAULT"]
        else:
            kb2keywords_dict[kb_id] = kb2keywords_dict[kb_id] + keywords

        if kb_id not in kb2subjects_dict.keys():
            kb2subjects_dict[kb_id] = [normalize_str(subject) if subject != '' else "DEFAULT"]
        else:
            kb2subjects_dict[kb_id].append(normalize_str(subject))

    # ticket2kbs_dict --> {str : [str]}
    ticket2kbs_dict = {}
    kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
    next(kb2ticket_gen, None)  # skip first line "TicketNumber";"ArticleID"

    for line in kb2ticket_gen:
        ticket_id = line[0]
        kb_id = line[1]

        if ticket_id not in ticket2kbs_dict.keys():
            ticket2kbs_dict[ticket_id] = [kb_id]
        else:
            ticket2kbs_dict[ticket_id].append(kb_id)

    # ticket2keywords --> {str : [str]}
    ticket2keywords_dict = {}

    for ticket_id, kb_ids in ticket2kbs_dict.items():
        if ticket_id not in ticket2keywords_dict.keys():
            ticket2keywords_dict[ticket_id] = []

        for kb_id in kb_ids:
            ticket2keywords_dict[ticket_id].append(kb2keywords_dict[kb_id])

        ticket2keywords_dict[ticket_id] = flatten(ticket2keywords_dict[ticket_id])

    # ticket2subjects --> {str : [str]}
    ticket2subjects_dict = {}

    for ticket_id, kb_ids in ticket2kbs_dict.items():
        if ticket_id not in ticket2subjects_dict.keys():
            ticket2subjects_dict[ticket_id] = []

        for kb_id in kb_ids:
            ticket2subjects_dict[ticket_id].append(kb2subjects_dict[kb_id])

        ticket2subjects_dict[ticket_id] = flatten(ticket2subjects_dict[ticket_id])

    # kb2keywords_dict      {'KBA10230': ['DEFAULT'], 'KBA10129': ['DEFAULT'], 'KBA10287': ['sd_ansys_informationen'], ...} len = 260
    # kb2subjects_dict      {'KBA10230': ['unicard nochmal beantragen'], 'KBA10129': ['sd_entsperrung unicard nach verlust/wiederfinden'], ...} len = 260
    # ticket2kbs_dict       {'INC44526': ['KBA10056'], 'INC67205': ['KBA10056'], ...} len = 4832
    # ticket2keywords_dict  {'INC44526': ['DEFAULT'], 'INC67205': ['DEFAULT'], 'INC71863': ['DEFAULT'], 'INC44392': ['asknet'], ...} len = 4832
    # ticket2subjects_dict  {'INC44526': ['sd_telefon (antrag: neuanschluss, umzug, aenderung erledigt)'], ...} len = 4832

    count_dict = {}
    for v in ticket2kbs_dict.values():
        for kb in v:
            if kb in count_dict.keys():
                count_dict[kb] += 1
            else:
                count_dict[kb] = 1

    sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))

    """
    for k, v in sorted_dict:
        subs = kb2subjects_dict[k]
        keys = kb2keywords_dict[k]
        print(subs, keys, v)  # question: how many tickets per topic?

    print("kb_entries used: {}".format(len(sorted_dict)))  # question: how many KB entries are used in total?: 155
    """

    labelist = ticket2keywords_dict.values()
    labelist = flatten(labelist)
    labelist = list(set(labelist))
    labeldict = {k: v for v, k in enumerate(labelist)}
    ##############################################################################################

    def gen_key_lines(textacyCorpus, labeldict, ticket2keywords_dict):

        for doc in corpus:
            ticket_number = doc.metadata["TicketNumber"]

            keywords = ticket2keywords_dict.get(ticket_number, ['DEFAULT'])

            if keywords != ['DEFAULT']:
                label = ""
                for kw in keywords:
                    label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " "

                yield "[" + label + "] " + doc.text

    keys_line_gen = gen_key_lines(corpus, labeldict, ticket2keywords_dict)

    path2save_keys_results = path2save_results + "_kb_keys_llda_{}".format("top" + str(top_topic_words))

    jgibbsLLDA(labeldict, keys_line_gen, path2save_keys_results, top_topic_words=top_topic_words)

    labelist = ticket2subjects_dict.values()
    labelist = flatten(labelist)
    labelist = list(set(labelist))
    labeldict = {k: v for v, k in enumerate(labelist)}
    labeldict.update({'DEFAULT': len(labeldict)})

    subj_line_gen = gen_key_lines(corpus, labeldict, ticket2subjects_dict)

    path2save_subj_results = path2save_results + "_kb_subj_llda_{}".format("top" + str(top_topic_words))

    jgibbsLLDA(labeldict, subj_line_gen, path2save_subj_results, top_topic_words=top_topic_words)

    end = time.time()
    logprint("\n\n\nTime Elapsed LLDA: {0} min\n\n".format((end - start) / 60))


def load_from_labled_lines(path):
    path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/pre_labled_lines_wo_lemma_061217.txt"

    # idea
    # load the cleaned corpus, load the preprocessed corpus
    # unigrams and number/word bigrams for the doc-term matrix
    #   question: how does LLDA handle bigrams? idea: join bigrams with _
    #   keep only ngrams where at least one token occurs in the preprocessed corpus


def main(cleaned_corpus, pre_corpus, algorithm="llda"):

    logprint("Topic Modeling: {0}".format(datetime.now()))

    # todo load from labeled_lines??
    # idea: apply thesaurus before id2term
    # todo keep acronyms & abbreviations
    # todo compute bigrams on cleaned, not on pre
    # todo keep numbers; bigrams: NUM-word combinations
    # todo Levenshtein/Hamming distance instead of autocorrect  # idea: or word2vec
    # todo include the ticket subject

    resultspath = FILEPATH + "results/pre"

    de_corpus = pre_corpus

    if algorithm == "llda":

        top_topic_words = 3

        jgibbsLLDA_category(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)

        jgibbsLLDA_KB_v2(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words)

        """
        kb_keywords = False
        jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)

        kb_keywords = True
        jgibbsLLDA_KB(de_corpus, path2save_results=resultspath, top_topic_words=top_topic_words, kb_keywords=kb_keywords)


        top_topic_words = 10
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        top_topic_words = 15
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        top_topic_words = 20
        path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
        """

    else:

        textacyTopicModeling(ngrams=1,
                             topicModel=algorithm,
                             corpus=de_corpus)
        """
        textacyTopicModeling(ngrams=1,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=20,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams=1,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=25,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams=1,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=30,
                             corpus=de_corpus)
        """

        textacyTopicModeling(ngrams=(1, 2),
                             topicModel=algorithm,
                             corpus=de_corpus)
        """
        textacyTopicModeling(ngrams=(1, 2),
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=20,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams=(1, 2),
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=25,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams=(1, 2),
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=30,
                             corpus=de_corpus)
        """


if __name__ == "__main__":

    # load corpus
    corpus_de_path = FILEPATH + config.get("de_corpus", "path")

    pre_corpus_name = "de" + "_pre"
    pre_corpus, parser = load_corpus(corpus_name=pre_corpus_name, corpus_path=corpus_de_path)
    logprint("Corpus loaded: {0}".format(pre_corpus_name))

    cleaned_corpus_name = "de" + "_raw"
    #cleaned_corpus, parser = load_corpus(corpus_name=cleaned_corpus_name, corpus_path=corpus_de_path)
    logprint("Corpus loaded: {0}".format(cleaned_corpus_name))
    cleaned_corpus = None

    main(pre_corpus=pre_corpus, cleaned_corpus=cleaned_corpus, algorithm="llda")

    main(pre_corpus=pre_corpus, cleaned_corpus=cleaned_corpus, algorithm="lda")