topicModelingTickets/draw1.py

# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals

import logging

import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.externals import joblib

from textacy import viz
import draw

LOGGER = logging.getLogger(__name__)


def termite_plot(model, doc_term_matrix, id2term,
                 topics=-1, sort_topics_by='index', highlight_topics=None,
                 n_terms=25, rank_terms_by='topic_weight', sort_terms_by='seriation',
                 save=False, pow_x = 0.66,  pow_y = 0.8):


    if highlight_topics is not None:
        if isinstance(highlight_topics, int):
            highlight_topics = (highlight_topics,)
        elif len(highlight_topics) > 6:
            raise ValueError('no more than 6 topics may be highlighted at once')

    # get topics indices
    if topics == -1:
        topic_inds = tuple(range(model.n_topics))
    elif isinstance(topics, int):
        topic_inds = (topics,)
    else:
        topic_inds = tuple(topics)

    # get topic indices in sorted order
    if sort_topics_by == 'index':
        topic_inds = sorted(topic_inds)
    elif sort_topics_by == 'weight':
        topic_inds = tuple(topic_ind for topic_ind
                           in np.argsort(model.topic_weights(model.transform(doc_term_matrix)))[::-1]
                           if topic_ind in topic_inds)
    else:
        msg = 'invalid sort_topics_by value; must be in {}'.format(
            {'index', 'weight'})
        raise ValueError(msg)

    # get column index of any topics to highlight in termite plot
    if highlight_topics is not None:
        highlight_cols = tuple(i for i in range(len(topic_inds))
                               if topic_inds[i] in highlight_topics)
    else:
        highlight_cols = None

    # get top term indices
    if rank_terms_by == 'corpus_weight':
        term_inds = np.argsort(np.ravel(doc_term_matrix.sum(axis=0)))[:-n_terms - 1:-1]
    elif rank_terms_by == 'topic_weight':
        term_inds = np.argsort(model.model.components_.sum(axis=0))[:-n_terms - 1:-1]
    else:
        msg = 'invalid rank_terms_by value; must be in {}'.format(
            {'corpus_weight', 'topic_weight'})
        raise ValueError(msg)

    # get top term indices in sorted order
    if sort_terms_by == 'weight':
        pass
    elif sort_terms_by == 'index':
        term_inds = sorted(term_inds)
    elif sort_terms_by == 'alphabetical':
        term_inds = sorted(term_inds, key=lambda x: id2term[x])
    elif sort_terms_by == 'seriation':
        topic_term_weights_mat = np.array(
            np.array([model.model.components_[topic_ind][term_inds]
                      for topic_ind in topic_inds])).T
        # calculate similarity matrix
        topic_term_weights_sim = np.dot(topic_term_weights_mat, topic_term_weights_mat.T)
        # substract minimum of sim mat in order to keep sim mat nonnegative
        topic_term_weights_sim = topic_term_weights_sim - topic_term_weights_sim.min()
        # compute Laplacian matrice and its 2nd eigenvector
        L = np.diag(sum(topic_term_weights_sim, 1)) - topic_term_weights_sim
        D, V = np.linalg.eigh(L)
        D = D[np.argsort(D)]
        V = V[:, np.argsort(D)]
        fiedler = V[:, 1]
        # get permutation corresponding to sorting the 2nd eigenvector
        term_inds = [term_inds[i] for i in np.argsort(fiedler)]
    else:
        msg = 'invalid sort_terms_by value; must be in {}'.format(
            {'weight', 'index', 'alphabetical', 'seriation'})
        raise ValueError(msg)

    # get topic and term labels
    topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)
    term_labels = tuple(id2term[term_ind] for term_ind in term_inds)

    # get topic-term weights to size dots
    term_topic_weights = np.array([model.model.components_[topic_ind][term_inds]
                                   for topic_ind in topic_inds]).T

    return draw.draw_termite(
        term_topic_weights, topic_labels, term_labels,
        highlight_cols=highlight_cols, save=save, pow_x = pow_x,  pow_y = pow_y)
refactored 2017-11-17 11:46:57 +01:00			`# -- coding: utf-8 --`
			`from __future__ import absolute_import, division, print_function, unicode_literals`

			`import logging`

			`import numpy as np`
			`from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD`
			`from sklearn.externals import joblib`

			`from textacy import viz`
			`import draw`

			`LOGGER = logging.getLogger(__name__)`



			`def termite_plot(model, doc_term_matrix, id2term,`
			`topics=-1, sort_topics_by='index', highlight_topics=None,`
			`n_terms=25, rank_terms_by='topic_weight', sort_terms_by='seriation',`
			`save=False, pow_x = 0.66, pow_y = 0.8):`



			`if highlight_topics is not None:`
			`if isinstance(highlight_topics, int):`
			`highlight_topics = (highlight_topics,)`
			`elif len(highlight_topics) > 6:`
			`raise ValueError('no more than 6 topics may be highlighted at once')`

			`# get topics indices`
			`if topics == -1:`
			`topic_inds = tuple(range(model.n_topics))`
			`elif isinstance(topics, int):`
			`topic_inds = (topics,)`
			`else:`
			`topic_inds = tuple(topics)`

			`# get topic indices in sorted order`
			`if sort_topics_by == 'index':`
			`topic_inds = sorted(topic_inds)`
			`elif sort_topics_by == 'weight':`
			`topic_inds = tuple(topic_ind for topic_ind`
			`in np.argsort(model.topic_weights(model.transform(doc_term_matrix)))[::-1]`
			`if topic_ind in topic_inds)`
			`else:`
			`msg = 'invalid sort_topics_by value; must be in {}'.format(`
			`{'index', 'weight'})`
			`raise ValueError(msg)`

			`# get column index of any topics to highlight in termite plot`
			`if highlight_topics is not None:`
			`highlight_cols = tuple(i for i in range(len(topic_inds))`
			`if topic_inds[i] in highlight_topics)`
			`else:`
			`highlight_cols = None`

			`# get top term indices`
			`if rank_terms_by == 'corpus_weight':`
			`term_inds = np.argsort(np.ravel(doc_term_matrix.sum(axis=0)))[:-n_terms - 1:-1]`
			`elif rank_terms_by == 'topic_weight':`
			`term_inds = np.argsort(model.model.components_.sum(axis=0))[:-n_terms - 1:-1]`
			`else:`
			`msg = 'invalid rank_terms_by value; must be in {}'.format(`
			`{'corpus_weight', 'topic_weight'})`
			`raise ValueError(msg)`

			`# get top term indices in sorted order`
			`if sort_terms_by == 'weight':`
			`pass`
			`elif sort_terms_by == 'index':`
			`term_inds = sorted(term_inds)`
			`elif sort_terms_by == 'alphabetical':`
			`term_inds = sorted(term_inds, key=lambda x: id2term[x])`
			`elif sort_terms_by == 'seriation':`
			`topic_term_weights_mat = np.array(`
			`np.array([model.model.components_[topic_ind][term_inds]`
			`for topic_ind in topic_inds])).T`
			`# calculate similarity matrix`
			`topic_term_weights_sim = np.dot(topic_term_weights_mat, topic_term_weights_mat.T)`
			`# substract minimum of sim mat in order to keep sim mat nonnegative`
			`topic_term_weights_sim = topic_term_weights_sim - topic_term_weights_sim.min()`
			`# compute Laplacian matrice and its 2nd eigenvector`
			`L = np.diag(sum(topic_term_weights_sim, 1)) - topic_term_weights_sim`
			`D, V = np.linalg.eigh(L)`
			`D = D[np.argsort(D)]`
			`V = V[:, np.argsort(D)]`
			`fiedler = V[:, 1]`
			`# get permutation corresponding to sorting the 2nd eigenvector`
			`term_inds = [term_inds[i] for i in np.argsort(fiedler)]`
			`else:`
			`msg = 'invalid sort_terms_by value; must be in {}'.format(`
			`{'weight', 'index', 'alphabetical', 'seriation'})`
			`raise ValueError(msg)`

			`# get topic and term labels`
			`topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)`
			`term_labels = tuple(id2term[term_ind] for term_ind in term_inds)`

			`# get topic-term weights to size dots`
			`term_topic_weights = np.array([model.model.components_[topic_ind][term_inds]`
			`for topic_ind in topic_inds]).T`

			`return draw.draw_termite(`
			`term_topic_weights, topic_labels, term_labels,`
			`highlight_cols=highlight_cols, save=save, pow_x = pow_x, pow_y = pow_y)`