topicModelingTickets/topicModeling.py

# -*- coding: utf-8 -*-

from datetime import datetime

import time

import csv
import sys
import json
import os.path
import subprocess
from textacy import Vectorizer, viz

from miscellaneous import *
import textacy
from scipy import *

import os

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"


# load config
config_ini = FILEPATH + "config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


def label2ID(label, labeldict):
    return labeldict.get(label, len(labeldict))


def generate_labled_lines(textacyCorpus, labeldict):
    for doc in textacyCorpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text


def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
    logprint(str("ngrams: {0}".format(ngrams)))
    logprint(str("min_df: {0}".format(min_df)))
    logprint(str("max_df: {0}".format(max_df)))
    logprint(str("named_entities: {0}".format(named_entities)))

    # printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in de_corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.__getattribute__("id_to_term")

    for t in terms_list:
        print(t)
    logprint("doc_term_matrix: {0}".format(doc_term_matrix))
    logprint("id2term: {0}".format(id2term))


def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda', named_entities=False):
    logprint(
        "############################################ Topic Modeling {0}   #############################################".format(
            topicModel))
    print("\n\n")
    logprint(str("ngrams: {0}".format(ngrams)))
    logprint(str("min_df: {0}".format(min_df)))
    logprint(str("max_df: {0}".format(max_df)))
    logprint(str("n_topics: {0}".format(n_topics)))
    logprint(str("named_entities: {0}".format(named_entities)))

    start = time.time()

    top_topic_words = 7
    top_document_labels_per_topic = 5

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')


    ####################'####################


    # printlog("vectorize corpi...")
    vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

    terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    id2term = vectorizer.__getattribute__("id_to_term")

    # printlog("terms_list: {0}".format(list(terms_list)))
    # printlog("doc_term_matrix: {0}".format(doc_term_matrix))


    #####################     LSA, LDA, NMF         Topic Modeling via Textacy         ##############################################

    # Initialize and train a topic model
    # printlog("Initialize and train a topic model..")
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)

    # Transform the corpi and interpret our model:
    # printlog("Transform the corpi and interpret our model..")
    doc_topic_matrix = model.transform(doc_term_matrix)
    print()

    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
        logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))

    print()
    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
        logprint(topic_idx)
        for j in top_docs:
            logprint(corpus[j].metadata['categoryName'])
        print()

    #####################################################################################################################
    print()
    print()


    # termite plot
    n_terms = int(n_topics*top_topic_words)
    sort_terms_by = 'seriation' #'seriation', 'weight', 'index', 'alphabetical'
    rank_terms_by = 'corpus'  # 'corpus', 'topic'
    model.termite_plot(doc_term_matrix, id2term,

                       n_terms=n_terms,
                       sort_terms_by=sort_terms_by,
                       rank_terms_by=rank_terms_by+'_weight',

    save="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/results/{}_{}_{}_{}_{}.png".format(topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by))


    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))


def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=False):
    #####################   LLDA           Topic Modeling via JGibbsLabledLDA     ##############################################

    start = time.time()

    # build dictionary of ticketcategories
    labelist = []

    for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
        labelist.append(texdoc.metadata["categoryName"])

    labeldict = {k: v for v, k in enumerate(labelist)}

    if add_default_topic:
        n_topics = len(labeldict) + 1  # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
    else:
        n_topics = len(labeldict)  # + 1  # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic

    jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/"

    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
    #dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
    dict_path = FILEPATH +"results/labeldict.txt".format(jgibbsLLDA_root)

    # printlog(str("LABELDICT: {0}".format(labeldict)))
    #logprint(str("LABELDICT-length: {0}".format(len(labeldict))))
    with open(dict_path, 'w') as file:
        file.write(json.dumps(labeldict))

    # for line in generate_labled_lines(de_corpus,labeldict):
    #    print(line)

    # create file
    textacy.fileio.write_file_lines(generate_labled_lines(corpus, labeldict), filepath=LLDA_filepath)

    # wait for file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)
    #top_topic_words=1
    logprint("")
    logprint("start LLDA:")
    # run JGibsslda file
    FNULL = open(os.devnull, 'w')  # supress output
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
                       "tickets.gz",
                       "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
    subprocess.call(cmd_jgibbs_java, stdout=FNULL)

    # ANMERKUNG: Dateien sind versteckt. zu finden in models/

    # twords
    """
    subprocess.call(["gzip",
                     "-dc",
                     "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
    """

    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    """
    proc = subprocess.Popen(cmd_gzip, stdout=subprocess.PIPE)

    process = subprocess.Popen(cmd_gzip, shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    # wait for the process to terminate
    out, err = process.communicate()
    errcode = process.returncode

    result = subprocess.check_output(cmd_gzip)

    #result = proc.stdout.read()
    result = proc.communicate()
    out=[]
    for line in result:
        out.append(line)
    """

    output = subprocess.check_output(cmd_gzip).decode("utf-8")

    reverse_labeldict = {v: k for k, v in labeldict.items()}
    result = []
    regex = re.compile(r'Topic [0-9]*')
    for line in output.splitlines():

        findall = regex.findall(line)
        if len(findall) != 0:
            try:
                index = int(findall[0].split()[1])
                result.append("Topic {} {}:".format(index, reverse_labeldict[index]))

            except:
                result.append(line)

        else:
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results)
    #####################################################################################################################

    #todo llda termite plot
    """
    topic_inds=[] #<class 'list'>: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

    # get topic and term labels
    # <class 'tuple'>: ('topic 0', 'topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5', 'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10', 'topic 11', 'topic 12', 'topic 13', 'topic 14')
    topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)

    # <class 'tuple'>: ('hardware', 'raum', 'adresse', 'gebaeude', 'tu', 'uni', 'ticket', 'email', 'account', 'nummer', 'mail', 'outlook', 'karte', 'drucker', 'server', 'service', 'antwort', 'verbindung', 'herzliche', 'einrichten', 'vergessen', 'wenden', 'ews', 'anhang', 'form', 'konto', 'nachricht', 'unterstuetzung', 'passwort', 'unicard', 'semester', 'system', 'aenderung', 'rueckmeldung', 'meldung', 'zugreifen', 'login', 'adressat', 'sender', 'kurs', 'made', 'mittwoch', 'note', 'our', 'korrespondenz', 'unbeschadet', 'boss', 'unterrichten', 'telefax', 'zugang', 'probleme', 'zugriff', 'mitarbeiterin', 'internet', 'daten', 'anmeldung', 'aendern', 'unterschrift', 'loeschen', 'anmelden', 'datei', 'message', 'laptop', 'benoetigt', 'link', 'montag', 'programm', 'ordner', 'personal', 'rechner', 'veranstaltung', 'august', 'lizenz', 'anschluss', 'mitarbeiter', 'erwuenscht', 'umzug', 'pc', 'uniaccount', 'amt', 'fax', 'it', 'institut', 'nutzer', 'bild', 'type', 'prof', 'verantwortlicher', 'bemerkung', 'antragsteller', 'element', 'hahn', 'eintrag', 'telefonbuch', 'ansprechpartner', 'universitaet', 'physik', 'abteilung', 'fakultaet', 'software', 'dezernat', 'einrichtung', 'telefon', 'lehrstuhl', 'buero')
    term_labels = tuple(id2term[term_ind] for term_ind in term_inds)

    # get topic-term weights to size dots
    #[[ 0.02721858 -0.03898025  0.00047936 ...,  0.05862538 -0.07742336  0.04761928]
    # [ 0.14977875 -0.24192522 -0.00620335 ..., -0.0497216   0.08269951    -0.05715901]
    # [ 0.04977951  0.02296709  0.01214562 ...,  0.11444371 -0.15212482     0.21481788]
    # ..., 
    # [ 
    term_topic_weights = np.array([self.model.components_[topic_ind][term_inds]
                                   for topic_ind in topic_inds]).T

    viz.draw_termite_plot(
        term_topic_weights, topic_labels, term_labels, save=path2save_results)
    """
    logprint("")

    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))


def main(use_raw=False, algorithm="llda"):
    logprint("Topic Modeling: {0}".format(datetime.now()))

    corpus_de_path = FILEPATH + config.get("de_corpus", "path")

    corpus_en_path = FILEPATH + config.get("en_corpus", "path")


    if use_raw:
        # fehler Unknown document label ( X ) for document 352.
        preCorpus_name = "de" + "_raw_ticket"
        resultspath = FILEPATH + "results/raw"

    else:
        preCorpus_name = "de" + "_pre_ticket"
        resultspath = FILEPATH + "results/pre"

    # load raw corpus and create new one
    de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
    logprint("Corpus loaded: {0}".format(de_corpus.lang))

    # idee http://bigartm.org/
    # idee http://wiki.languagetool.org/tips-and-tricks
    # idee https://en.wikipedia.org/wiki/Noisy_text_analytics
    # idee https://gate.ac.uk/family/

    # todo llda topics zusammenfassen
    # idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics
    # frage lda wieviele tickets pro topic?

    """
    ngrams = 1
    min_df = 1
    max_df = 1.0
    weighting = 'tf'
    # weighting ='tfidf'
    named_entities = False

  
    printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
    printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
    printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting)

    printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting)
    printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
    printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
    """

    if algorithm == "llda":
        top_topic_words = 5
        add_default_topic = False
        path2save_results = resultspath +  "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                   add_default_topic=add_default_topic)

        top_topic_words = 5
        add_default_topic = True
        path2save_results = resultspath +  "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                   add_default_topic=add_default_topic)

        top_topic_words = 10
        add_default_topic = False
        path2save_results = resultspath +  "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                   add_default_topic=add_default_topic)

        top_topic_words = 10
        add_default_topic = True
        path2save_results = resultspath +  "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                   add_default_topic=add_default_topic)

    # no_below = 20
    # no_above = 0.5


    # n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic


    else:

        # build dictionary of ticketcategories
        labelist = []

        for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
            labelist.append(texdoc.metadata["categoryName"])

        labeldict = {k: v for v, k in enumerate(labelist)}


        textacyTopicModeling(ngrams = 1,
                             min_df = 1,
                             max_df = 0.9,
                             topicModel = algorithm,
                             n_topics =15,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams=1,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=20,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams=1,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=25,
                             corpus=de_corpus)


        textacyTopicModeling(ngrams=1,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=30,
                             corpus=de_corpus)


        textacyTopicModeling(ngrams=(1, 2),
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=15,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams = (1,2),
                             min_df = 1,
                             max_df = 0.9,
                             topicModel = algorithm,
                             n_topics =20,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams = (1,2),
                             min_df = 1,
                             max_df = 0.9,
                             topicModel = algorithm,
                             n_topics =25,
                             corpus=de_corpus)


        textacyTopicModeling(ngrams = (1,2),
                             min_df = 1,
                             max_df = 0.9,
                             topicModel = algorithm,
                             n_topics =30,
                             corpus=de_corpus)


        """
        textacyTopicModeling(ngrams = (1,2),
                             min_df = 1,
                             max_df = 0.8,
                             topicModel = algorithm,
                             n_topics =n_topics,
                             corpus=de_corpus)

        """

        """
        textacyTopicModeling(ngrams = 1,
                             min_df = 0.1,
                             max_df = 0.6,
                             topicModel = algorithm,
                             n_topics =n_topics,
                             corpus=de_corpus)


        textacyTopicModeling(ngrams = (1,2),
                             min_df = 1,
                             max_df = 1.0,
                             topicModel = algorithm,
                             n_topics =n_topics,
                             corpus=de_corpus)

        textacyTopicModeling(ngrams = (1,2),
                             min_df = 0.1,
                             max_df = 0.6,
                             topicModel = algorithm,
                             n_topics =n_topics,
                             corpus=de_corpus)


        textacyTopicModeling(ngrams = (1,2),
                      min_df = 0.2,
                      max_df = 0.8,
                      topicModel = algorithm,
                      n_topics = 20,
                      corpus=de_corpus)
        """


if __name__ == "__main__":
    main()