# topicModelingTickets/topicModeling_1711_0846.py

# -*- coding: utf-8 -*-

from datetime import datetime
import draw
import draw1
import time
import numpy as np

import csv
import sys
import json
import os.path
import subprocess
from textacy import Vectorizer, viz

from miscellaneous import *
import textacy
from scipy import *

import os

# Raise the csv module's per-field size limit to sys.maxsize.
csv.field_size_limit(sys.maxsize)
# Directory that contains this file, with a trailing slash; every other path
# in this module is built by plain string concatenation onto it.
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"


# load config
config_ini = FILEPATH + "config.ini"

# NOTE(review): ConfigParser is not imported explicitly in this file;
# presumably it is provided by one of the star imports above -- confirm.
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


def label2ID(label, labeldict):
    """Return the ID stored for ``label`` in ``labeldict``.

    A label that is not a key of ``labeldict`` is mapped to
    ``len(labeldict)``, i.e. the first ID after the known ones.
    """
    if label in labeldict:
        return labeldict[label]
    return len(labeldict)


def generate_lablelID_lines(textacyCorpus, labeldict):
    """Yield one labeled input line per document of ``textacyCorpus``.

    Every line has the form ``[<label id>] <document text>``, where the label
    ID is looked up from the document's ``categoryName`` metadata entry via
    ``label2ID``.
    """
    for document in textacyCorpus:
        category_id = label2ID(document.metadata["categoryName"], labeldict)
        # "[<id>] tok1 tok2 tok3 ..." built from the corpus document
        yield "".join(["[", str(category_id), "] ", document.text])

"""
def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
    logprint(str("ngrams: {0}".format(ngrams)))
    logprint(str("min_df: {0}".format(min_df)))
    logprint(str("max_df: {0}".format(max_df)))
    logprint(str("named_entities: {0}".format(named_entities)))

    # printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.__getattribute__("id_to_term")

    for t in terms_list:
        print(t)
    logprint("doc_term_matrix: {0}".format(doc_term_matrix))
    logprint("id2term: {0}".format(id2term))
"""

def textacyTopicModeling(corpus,
                         n_topics = 15, top_topic_words = 7, top_document_labels_per_topic = 5,
                         ngrams = 1, min_df=1, max_df=1.0,
                         topicModel='lda'):
    """Vectorize ``corpus``, fit a textacy topic model and save a termite plot.

    The top terms of every topic and the categories of its top documents are
    logged through ``logprint``. The plot is saved below
    ``FILEPATH + "results/"`` under a name built from the run parameters.

    Args:
        corpus: Iterable and indexable collection of documents that offer
            ``to_terms_list()`` and a ``metadata['categoryName']`` entry.
        n_topics: Number of topics handed to ``textacy.tm.TopicModel``.
        top_topic_words: Number of top terms logged per topic; multiplied by
            ``n_topics`` it also gives the number of terms in the plot.
        top_document_labels_per_topic: Number of top documents per topic
            whose category is logged.
        ngrams: Passed to ``doc.to_terms_list``; the value ``1`` labels the
            plot file "uni", every other value "bi".
        min_df: Passed to ``Vectorizer``.
        max_df: Passed to ``Vectorizer``.
        topicModel: Model name for ``textacy.tm.TopicModel``; ``'lda'`` is
            vectorized with 'tf' weighting, anything else with 'tfidf'.
    """
    # plot settings
    plot_term_count = int(n_topics * top_topic_words)
    term_sorting = 'seriation'  # 'seriation', 'weight', 'index', 'alphabetical'
    term_ranking = 'corpus'  # 'corpus', 'topic'

    logprint(
        "############### Topic Modeling {0}   ###########################".format(
            topicModel))
    for setting in ("ngrams: {0}".format(ngrams),
                    "min_df: {0}".format(min_df),
                    "max_df: {0}".format(max_df),
                    "n_topics: {0}".format(n_topics)):
        logprint(setting)
    logprint("\n")

    started_at = time.time()

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    if topicModel == 'lda':
        weighting = 'tf'
    else:
        weighting = 'tfidf'

    # ---- vectorize the corpus ----
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
    doc_terms = (
        doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True)
        for doc in corpus
    )
    doc_term_matrix = vectorizer.fit_transform(doc_terms)
    id2term = vectorizer.id_to_term

    # ---- initialize and train the topic model ----
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)

    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
        logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))

    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
        logprint(topic_idx)
        for doc_idx in top_docs:
            logprint(corpus[doc_idx].metadata['categoryName'])

    # ---- termite plot (drawn by draw1.termite_plot, not model.termite_plot) ----
    if ngrams == 1:
        grams_label = "uni"
    else:
        grams_label = "bi"

    plot_file = "results/{}_{}_{}_{}_{}_{}.png".format(
        grams_label, topicModel, n_topics, plot_term_count, term_sorting, term_ranking)

    draw1.termite_plot(model, doc_term_matrix, id2term,
                       n_terms=plot_term_count,
                       sort_terms_by=term_sorting,
                       rank_terms_by=term_ranking + '_weight',
                       save=FILEPATH + plot_file)

    elapsed_minutes = (time.time() - started_at) / 60
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format(elapsed_minutes, topicModel))


def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
    """Run JGibbLabeledLDA with the ticket categories as labels.

    Steps:
      1. Map every distinct ``categoryName`` of ``corpus`` to an integer ID
         (in order of first appearance) and dump that mapping as JSON to
         ``results/labeldict.txt``.
      2. Write the labeled documents to ``models/tickets/tickets.gz`` below
         the ``java_LabledLDA/`` directory.
      3. Run the Java ``jgibblda.LDA`` estimator with one topic per category.
      4. Read the resulting ``.twords.gz`` through ``gzip -dc``, replace the
         topic numbers by category names and write the text to
         ``path2save_results + ".txt"``.
      5. Draw a termite plot of the term/topic weights to
         ``path2save_results + ".png"``.

    Args:
        corpus: Iterable of documents with ``metadata["categoryName"]`` and
            ``text``.
        path2save_results: Output path without extension.
        top_topic_words: Value passed to the Java tool as ``-twords``.

    Raises:
        subprocess.CalledProcessError: If ``gzip -dc`` fails, e.g. because the
            Java run did not produce ``.twords.gz``.
    """
    start = time.time()

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

    # Build the list of ticket categories in order of first appearance.
    # A set gives O(1) membership tests and avoids depending on corpus.get()
    # evaluating its filter lazily.
    labelist = []
    seen_labels = set()
    for texdoc in corpus:
        category = texdoc.metadata["categoryName"]
        if category not in seen_labels:
            seen_labels.add(category)
            labelist.append(category)

    labeldict = {label: idx for idx, label in enumerate(labelist)}
    reverse_labeldict = {idx: label for label, idx in labeldict.items()}

    # ... and save it
    labeldict_path = FILEPATH + "results/labeldict.txt"
    with open(labeldict_path, 'w') as labeldict_file:
        labeldict_file.write(json.dumps(labeldict))

    n_topics = len(labeldict)  # +1  # default-topic

    # create file with label_IDs (input for llda)
    textacy.fileio.write_file_lines(generate_lablelID_lines(corpus, labeldict), filepath=LLDA_filepath)

    # wait for file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    logprint("")
    logprint("start LLDA:")

    # run JGibbsLLDA; its stdout is discarded and the devnull handle is
    # closed again as soon as the process has finished
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
                       "tickets.gz",
                       "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
    with open(os.devnull, 'w') as devnull:
        subprocess.call(cmd_jgibbs_java, stdout=devnull)

    # NOTE: the result files are hidden (dot-prefixed); they are located in models/
    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")
    output_lines = output.splitlines()

    topic_regex = re.compile(r'Topic [0-9]*')

    # ---- text result: topic headers get the category name appended ----
    # TODO: derive this file from `results` below instead of parsing twice
    result = []
    for line in output_lines:
        findall = topic_regex.findall(line)
        if not findall:
            result.append(line)
            continue
        try:
            index = int(findall[0].split()[1])
            result.append("Topic {} {}:".format(index, reverse_labeldict[index]))
        except (IndexError, ValueError, KeyError):
            # header without a usable / known topic number: keep it verbatim
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results + ".txt")

    # ---- structured result: one dict per topic ----
    # e.g. {0: 'betrieb', 'service': 0.2416, 'support': 0.2416, 'telefon': 0.0024}
    results = []
    res_dict = {}
    for line in output_lines:
        findall = topic_regex.findall(line)
        if findall:
            if res_dict:
                results.append(res_dict)  # the previous topic is complete now
            index = int(findall[0].split()[1])
            res_dict = {index: str(reverse_labeldict[index])}
        else:
            splitted = line.split()
            if len(splitted) < 2:
                continue  # blank or truncated line: nothing to record
            res_dict[splitted[0]] = float(splitted[1])

    if res_dict:
        results.append(res_dict)  # append the last topic

    # Give every term found in the results an ID, in order of first appearance.
    term2id = {}
    for res in results:
        for key in res:
            if not isinstance(key, int):
                term2id.setdefault(key, len(term2id))

    # ---- termite plot ----
    # term_topic_weights.shape == (number of terms, number of topics)
    topic_labels = list(range(len(labelist)))
    term_labels = list(range(len(term2id)))
    term_topic_weights = np.zeros((len(term2id), len(topic_labels)))

    for i, res in enumerate(results):
        for key, value in res.items():
            if isinstance(key, int):
                # the single int key of a topic dict is the topic ID
                topic_labels[i] = reverse_labeldict[key]
            else:
                term_id = term2id[key]
                term_topic_weights[term_id][i] = value
                term_labels[term_id] = key

    draw.draw_termite(
        term_topic_weights, topic_labels, term_labels, save=path2save_results + ".png")

    end = time.time()
    logprint("Time Elapsed Topic Modeling JGibbsLLDA:{0} min\n".format((end - start) / 60))


def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=False):
    """Run JGibbLabeledLDA with knowledge-base (KB) derived labels.

    The idea: ticket -> KB article -> keywords or subject -> LLDA labels.
    Tickets are linked to KB articles via ``M42-Export/KB2Ticket_2017-09-13.csv``
    and the articles' labels are read from ``M42-Export/KB_2017-09-13.csv``.
    Only documents whose ticket maps to a KB article with at least one label
    are written to the LLDA input file.

    Results are written to ``path2save_results + ".txt"`` (top words per
    topic, topic numbers replaced by labels) and ``path2save_results + ".png"``
    (termite plot).

    Args:
        corpus: Iterable of documents with ``metadata["TicketNumber"]`` and
            ``text``.
        path2save_results: Output path without extension.
        top_topic_words: Value passed to the Java tool as ``-twords``.
        kb_keywords: If true, the non-empty entries of the comma-separated
            third CSV column are the labels of a KB article; otherwise the
            second column (the subject) is its only label.

    Raises:
        subprocess.CalledProcessError: If ``gzip -dc`` fails, e.g. because the
            Java run did not produce ``.twords.gz``.
    """
    start = time.time()

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

    # ticket2kb_dict, e.g. {'INC55646': 'KBA10065', 'INC65776': 'KBA10040', ...}
    kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
    ticket2kb_dict = {row[0]: row[1] for row in kb2ticket_gen}

    # kb2keywords_dict, e.g. {'KBA10091': ['citavi'], 'KBA10249': ['risse', ...], ...}
    # columns: "ArticleID";"Subject";"Keywords";.....
    kb2keywords_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB_2017-09-13.csv", delimiter=";")
    next(kb2keywords_gen, None)  # skip the header row

    # presumably replaceRockDots_lambda() is a pure factory returning a
    # str -> str callable, so one instance is shared by all rows -- confirm
    replace_rock_dots = replaceRockDots_lambda()

    kb2keywords_dict = {}
    for row in kb2keywords_gen:
        kb_id, subject, keywords = row[0], row[1], row[2]
        if kb_keywords:
            cleaned = (replace_rock_dots(x).lower().strip() for x in str(keywords).split(","))
            labels = [item for item in cleaned if item != ""]
        else:
            labels = [subject]
        # a later row with the same ID replaces an earlier one
        kb2keywords_dict[kb_id] = labels

    # remove all KB articles without labels
    kb2keywords_dict = {k: v for k, v in kb2keywords_dict.items() if v}

    # Collect the labels that are actually used by the corpus, in order of
    # first appearance, so the label IDs are the same on every run.
    labelist = []
    seen_labels = set()
    for doc in corpus:
        kb_number = ticket2kb_dict.get(doc.metadata["TicketNumber"], None)
        keywords = kb2keywords_dict.get(kb_number, None)
        if keywords and kb_number:
            for label in map(normalize, keywords):
                if label not in seen_labels:
                    seen_labels.add(label)
                    labelist.append(label)

    kb_entries_used = len(set(ticket2kb_dict.values()))
    print("kb_entries_used: {}".format(kb_entries_used))
    print("len(labelist): {}".format(len(labelist)))

    labeldict = {label: idx for idx, label in enumerate(labelist)}
    labeldict_rev = {idx: label for label, idx in labeldict.items()}
    print("labledict created")

    def genos_linos(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict):
        """Yield "[ id1 id2 ... ] text" for every document that has KB labels."""
        for doc in textacyCorpus:
            kb_number = ticket2kb_dict.get(doc.metadata["TicketNumber"], None)
            keywords = kb2keywords_dict.get(kb_number, None)
            if keywords and kb_number:
                label_ids = " ".join(
                    str(labeldict.get(normalize(str(kw)), len(labeldict))) for kw in keywords)
                yield "[ " + label_ids + " ] " + doc.text

    line_gen = genos_linos(corpus, labeldict, ticket2kb_dict, kb2keywords_dict)
    textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)

    # wait for file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    logprint("")
    logprint("start LLDA:")

    # run JGibbsLLDA; its stdout is discarded and the devnull handle is
    # closed again as soon as the process has finished
    n_topics = len(labeldict)  # +1  # default-topic

    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
                       "tickets.gz",
                       "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
    with open(os.devnull, 'w') as devnull:
        subprocess.call(cmd_jgibbs_java, stdout=devnull)

    # NOTE: the result files are hidden (dot-prefixed); they are located in models/
    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")
    output_lines = output.splitlines()

    topic_regex = re.compile(r'Topic [0-9]*')

    # ---- text result: topic headers get the label appended ----
    # TODO: derive this file from `results` below instead of parsing twice
    result = []
    for line in output_lines:
        findall = topic_regex.findall(line)
        if not findall:
            result.append(line)
            continue
        try:
            index = int(findall[0].split()[1])
            result.append("Topic {} {}:".format(index, labeldict_rev[index]))
        except (IndexError, ValueError, KeyError):
            # header without a usable / known topic number: keep it verbatim
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results + ".txt")

    # ---- structured result: one dict per topic ({topic_id: label, term: weight, ...}) ----
    results = []
    res_dict = {}
    for line in output_lines:
        findall = topic_regex.findall(line)
        if findall:
            if res_dict:
                results.append(res_dict)  # the previous topic is complete now
            index = int(findall[0].split()[1])
            res_dict = {index: str(labeldict_rev[index])}
        else:
            splitted = line.split()
            if len(splitted) < 2:
                continue  # blank or truncated line: nothing to record
            res_dict[splitted[0]] = float(splitted[1])

    if res_dict:
        results.append(res_dict)  # append the last topic

    # Give every term found in the results an ID, in order of first appearance.
    term2id = {}
    for res in results:
        for key in res:
            if not isinstance(key, int):
                term2id.setdefault(key, len(term2id))

    # ---- termite plot ----
    topic_labels = list(range(len(labelist)))
    term_labels = list(range(len(term2id)))
    term_topic_weights = np.zeros((len(term2id), len(topic_labels)))

    for i, res in enumerate(results):
        for key, value in res.items():
            if isinstance(key, int):
                # the single int key of a topic dict is the topic ID
                topic_labels[i] = labeldict_rev[key]
            else:
                term_id = term2id[key]
                term_topic_weights[term_id][i] = value
                term_labels[term_id] = key

    draw.draw_termite(
        term_topic_weights, topic_labels, term_labels, save=path2save_results + ".png")

    # log the run time, as jgibbsLLDA_category does
    end = time.time()
    logprint("Time Elapsed Topic Modeling JGibbsLLDA:{0} min\n".format((end - start) / 60))


def main(use_cleaned=False, algorithm="llda"):
    """Load the German ticket corpus and run the selected topic model.

    Args:
        use_cleaned: If true, load the corpus named "de_clean_ticket" and
            prefix result files with "results/clean"; otherwise load
            "de_pre_ticket" and prefix them with "results/pre".
        algorithm: ``"llda"`` runs the labeled LDA three times (categories,
            KB subjects, KB keywords as labels). Any other value is passed to
            ``textacyTopicModeling`` as the model name, once with unigrams
            and once with uni- plus bigrams.
    """
    logprint("Topic Modeling: {0}".format(datetime.now()))

    # Only the German corpus is used here, so only its path is read from the
    # config; an "en_corpus" section is not required for this script.
    corpus_de_path = FILEPATH + config.get("de_corpus", "path")

    if use_cleaned:
        preCorpus_name = "de" + "_clean_ticket"
        resultspath = FILEPATH + "results/clean"
    else:
        preCorpus_name = "de" + "_pre_ticket"
        resultspath = FILEPATH + "results/pre"

    # load the corpus; the second value returned by load_corpus is not needed
    de_corpus, _ = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
    logprint("Corpus loaded: {0}".format(de_corpus.lang))

    if algorithm == "llda":
        # Earlier experiments also used top_topic_words of 10, 15 and 20.
        top_topic_words = 5
        top_label = "top" + str(top_topic_words)

        path2save_results = resultspath + "_cat_{}_{}".format(algorithm, top_label)
        jgibbsLLDA_category(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)

        # first with the KB subjects, then with the KB keywords as labels
        for kb_keywords in (False, True):
            path2save_results = resultspath + "_kb_{}_{}_{}".format(
                "keys" if kb_keywords else "subs", algorithm, top_label)
            jgibbsLLDA_KB(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                          kb_keywords=kb_keywords)
    else:
        # Earlier experiments also used n_topics of 20, 25 and 30.
        for ngrams in (1, (1, 2)):
            textacyTopicModeling(ngrams=ngrams,
                                 min_df=1,
                                 max_df=0.9,
                                 topicModel=algorithm,
                                 n_topics=15,
                                 corpus=de_corpus)


# Run the pipeline with main()'s default arguments when executed as a script.
if __name__ == "__main__":
    main()