# -*- coding: utf-8 -*-
import time
start = time.time()

import logging

import csv
import functools
import os.path
import re
import subprocess
import xml.etree.ElementTree as ET
import sys

import spacy
import textacy
from scipy import *
from textacy import Vectorizer

import warnings
import configparser as ConfigParser

csv.field_size_limit(sys.maxsize)


# Load the configuration file
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


# config logging
logging.basicConfig(filename=config.get("filepath", "logfile"), level=logging.INFO)
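
# For reference, a minimal sketch of the config.ini this script expects. The section and key names
# are taken from the config.get() calls in this file; the values are placeholders, not the real
# project paths:
#
#   [filepath]
#   logfile = /path/to/topicModeling.log
#   thesauruspath = /path/to/thesaurus.csv
#   lemmas = /path/to/lemmatization-de.txt
#   path2xml = /path/to/tickets.xml
#   path2csv = /path/to/tickets.csv
#
#   [preprocessing]
#   custom_words = wort1,wort2,wort3
#   ents2keep = PER,ORG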


thesauruspath = config.get("filepath", "thesauruspath")
THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))


DE_PARSER = spacy.load("de")  # todo language detection; idea: different corpora for different languages

de_stop_words = list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)


LEMMAS = config.get("filepath", "lemmas")

############# misc

def printlog(string, level="INFO"):
    """logs and prints the given string"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


printlog("Load functions")


def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)


def get_calling_function():
    """finds the calling function in many decent cases.
    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)  # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],  # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")
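
# Note: get_calling_function() is used by the token-cleaning factories further below (keepPOS,
# removeWords, lemmatize, ...) so that the lambdas they return inherit the factory's return
# annotation; processTokens() later inspects that annotation to decide how each function is applied.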


def printRandomDoc(textacyCorpus):
    import random
    print()

    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))

    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))

    print()


############# load xml
def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
    """
    generates strings from XML
    :param path2xml:
    :param main_textfield:
    :yields: strings
    """
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        for field in ticket:
            if field.tag == main_textfield:
                yield field.text


def generateMetadatafromTicketXML(path2xml, leave_out=['Description']):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag not in leave_out:
                metadata[field.tag] = field.text

        yield metadata


############# load csv
def csv_to_contentStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default value

    for i, lst in enumerate(stream):
        if i == 0:
            # look for the desired column
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]


def csv_to_metaStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
    :return: dict-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')

    content_collumn = 0  # default value
    metaindices = []
    metadata_temp = {}

    for i, lst in enumerate(stream):
        if i == 0:
            # could probably be done more efficiently, but it only runs once
            for j, col in enumerate(lst):
                for key in metalist:
                    if key == col:
                        metaindices.append(j)
            metadata_temp = dict(zip(metalist, metaindices))  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}
        else:
            metadata = metadata_temp.copy()
            for key, value in metadata.items():
                metadata[key] = lst[value]
            yield metadata
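
# Expected CSV layout (a sketch, inferred from the two readers above): a ";"-separated file whose
# first row is a header naming the columns, e.g.
#
#   ...;Subject;categoryName;...;Description;...;Solution
#
# csv_to_contentStream() yields one content cell per row, csv_to_metaStream() one metadata dict per row.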

############################################ Preprocessing ##############################################


############# on str-gen
def processTokens(tokens, funclist, parser):
    # in: token list, function list
    # out: token list
    for f in funclist:
        # idea: sort funclist so that all string methods run first, then the text is re-parsed,
        # then token-level functions run, and finally (optionally) functions on the whole Doc

        if 'bool' in str(f.__annotations__):
            tokens = list(filter(f, tokens))

        elif 'str' in str(f.__annotations__):
            tokens = list(map(f, tokens))  # plain text
            doc = parser(" ".join(tokens))  # re-parse
            tokens = [tok for tok in doc]  # tokens only

        elif 'spacy.tokens.doc.Doc' in str(f.__annotations__):
            # todo feels hacky
            doc = parser(" ".join(tok.lower_ for tok in tokens))  # parsed
            tokens = f(doc)
            doc = parser(" ".join(tokens))  # re-parsed
            tokens = [tok for tok in doc]  # tokens only

        else:
            warnings.warn("Unknown annotation while preprocessing. Function: {0}".format(str(f)))

    return tokens
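

# A minimal sketch (hypothetical helper, not called anywhere) of how the annotation-based dispatch
# above is driven: the factories defined further below copy their own return annotation onto the
# lambdas they return, and processTokens() only inspects str(f.__annotations__) to decide how to
# apply each function.
def _demo_annotation_dispatch(parser=DE_PARSER):
    funclist = [
        removePOS(["PUNCT", "SPACE"]),  # annotated '-> bool' -> used as a filter over tokens
        lemmatize(),                    # annotated '-> str'  -> mapped over tokens, then re-parsed
        lower(),                        # annotated '-> Doc'  -> applied to the whole re-parsed Doc
    ]
    tokens = [tok for tok in parser("Ein kleines Beispiel, bitte!")]
    return processTokens(tokens, funclist, parser)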


def processTextstream(textstream, funclist, parser=DE_PARSER):
    """
    :param textstream: string-gen
    :param funclist: [func]
    :param parser: spacy-parser
    :return: string-gen
    """
    # input: str-stream, output: str-stream
    pipe = parser.pipe(textstream)

    for doc in pipe:
        tokens = []
        for tok in doc:
            tokens.append(tok)

        tokens = processTokens(tokens, funclist, parser)
        yield " ".join([tok.lower_ for tok in tokens])


def processDictstream(dictstream, funcdict, parser=DE_PARSER):
    """
    :param dictstream: dict-gen
    :param funcdict:
            clean_in_meta = {
            "Solution": funclist,
            ...
            }
    :param parser: spacy-parser
    :return: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():

            if key in funcdict:
                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]

                tokens = processTokens(tokens, funclist, parser)
                result[key] = " ".join([tok.lower_ for tok in tokens])

            else:
                result[key] = value

        yield result


############# return bool

def keepPOS(pos_list) -> bool:
    ret = lambda tok: tok.pos_ in pos_list

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def removePOS(pos_list) -> bool:
    ret = lambda tok: tok.pos_ not in pos_list

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def removeWords(words, keep=None) -> bool:
    if hasattr(keep, '__iter__'):
        for k in keep:
            try:
                words.remove(k)
            except ValueError:
                pass

    ret = lambda tok: tok.lower_ not in words

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def keepENT(ent_list) -> bool:
    ret = lambda tok: tok.ent_type_ in ent_list

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def removeENT(ent_list) -> bool:
    ret = lambda tok: tok.ent_type_ not in ent_list

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def remove_words_containing_Numbers() -> bool:
    ret = lambda tok: not bool(re.search(r'\d', tok.lower_))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def remove_words_containing_specialCharacters() -> bool:
    ret = lambda tok: not bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', tok.lower_))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def remove_words_containing_topLVL() -> bool:
    ret = lambda tok: not bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', tok.lower_))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def lemmatizeWord(word, filepath=LEMMAS):
    """http://www.lexiconista.com/datasets/lemmatization/"""
    for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
        if word.lower() == line.split()[1].strip().lower():
            return line.split()[0].strip().lower()
    return word.lower()  # if no lemma was found
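
# Assumed format of the lemma file (a sketch based on the split() indices above and the linked
# lexiconista dataset): one whitespace-separated pair per line, lemma first, inflected form second, e.g.
#
#   gehen   ging
#   Haus    Häuser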


def lemmatize() -> str:
    ret = lambda tok: lemmatizeWord(tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


############# return strings

mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)

topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)


def replaceEmails(replace_with="EMAIL") -> str:
    ret = lambda tok: emailFinder.sub(replace_with, tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replaceURLs(replace_with="URL") -> str:
    ret = lambda tok: textacy.preprocess.replace_urls(tok.lower_, replace_with=replace_with)
    # ret = lambda tok: urlFinder.sub(replace_with, tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replaceSpecialChars(replace_with=" ") -> str:
    ret = lambda tok: specialFinder.sub(replace_with, tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replaceTwitterMentions(replace_with="TWITTER_MENTION") -> str:
    ret = lambda tok: mentionFinder.sub(replace_with, tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replaceNumbers(replace_with="NUMBER") -> str:
    ret = lambda tok: textacy.preprocess.replace_numbers(tok.lower_, replace_with=replace_with)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
    ret = lambda tok: textacy.preprocess.replace_phone_numbers(tok.lower_, replace_with=replace_with)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replaceHardS(replace_with="ss") -> str:
    ret = lambda tok: hardSFinder.sub(replace_with, tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def fixUnicode() -> str:
    ret = lambda tok: textacy.preprocess.fix_bad_unicode(tok.lower_, normalization=u'NFC')

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def resolveAbbreviations():
    pass  # todo

# todo remove words with len < 2 (after resolving abbreviations, especially "tu" and "fh")
#      and with len > 35 or 50 (e.g. "Reiserücktrittskostenversicherung")


############# return docs

def keepUniqeTokens() -> spacy.tokens.Doc:
    ret = lambda doc: (set([tok.lower_ for tok in doc]))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def lower() -> spacy.tokens.Doc:
    ret = lambda doc: ([tok.lower_ for tok in doc])

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


################################################################################################################

path2xml = config.get("filepath", "path2xml")
path2csv = config.get("filepath", "path2csv")
path2csv = "M42-Export/Tickets_med.csv"  # overrides the config value

printlog("CSV: {0}".format(path2csv))

ticketcorpus = textacy.Corpus(DE_PARSER)

"""
maybe split categories into subcategories

general:
fix utf encoding, split words at special characters
remove names, remove addresses after greetings
remove emails, urls, numbers
maybe even remove everything that contains one of those (or contains a .toplvldomain or special
characters, or anything containing an @)
separate meaningful words from garbage: 8203;verfügung
remove abbreviations: m.a, o.ä.
nonsense examples: nr54065467 455 a33c5 tvt?= ------problem--------
"""

metaliste = [
    "Subject",
    "categoryName",
    "Solution"
]

clean_in_meta = {
    "Solution": [removePOS(["SPACE"]), lower()],
    "Subject": [removePOS(["SPACE", "PUNCT"]), lower()],
    "categoryName": [removePOS(["SPACE", "PUNCT"]), lower()]
}


printlog("Start Preprocessing")

clean_in_content = [
    replaceHardS(),
    replaceSpecialChars(),

    remove_words_containing_topLVL(),
    remove_words_containing_Numbers(),
    remove_words_containing_specialCharacters(),

    # removePOS(["SPACE", "PUNCT", "NUM"]),
    # removeENT("PERSON"),
    # keepPOS(["NOUN"]),

    # replaceURLs(),
    # replaceEmails(),
    # fixUnicode(),

    lemmatize(),
    removeWords(de_stop_words + config.get("preprocessing", "custom_words").split(",")),

    # keepUniqeTokens(),
    # keepENT(config.get("preprocessing", "ents2keep"))
]

# add the texts to the textacy corpus
printlog("add texts to textacy corpus")

ticketcorpus.add_texts(
    processTextstream(csv_to_contentStream(path2csv, "Description"), clean_in_content),
    processDictstream(csv_to_metaStream(path2csv, metaliste), clean_in_meta)
)

for i in range(10):
    printRandomDoc(ticketcorpus)


end = time.time()
printlog("Time Elapsed Preprocessing: {0} min".format((end - start) / 60))


############################################ Topic Modeling #############################################

print("\n\n")
start = time.time()

# build dictionary of ticket categories
labelist = []

for texdoc in ticketcorpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
    labelist.append(texdoc.metadata["categoryName"])

LABELDICT = {k: v for v, k in enumerate(labelist)}
print(LABELDICT)


def label2ID(label, labeldict=LABELDICT):
    return labeldict.get(label, len(labeldict))


def generate_labled_lines(textacyCorpus):
    for doc in textacyCorpus:
        # generate "[topic1, topic2, ...] tok1 tok2 tok3" lines out of the corpus
        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "]" + doc.text
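
# Each generated line is the document text prefixed with its numeric category label in square
# brackets; this is the labeled-line input consumed by the JGibbsLabledLDA run in the commented-out
# block at the end of this file. Roughly:
#
#   [3]erster token zweiter token ...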


####################'####################' todo: move all of this into the config

ngrams = 1

min_df = 0
max_df = 1.0
no_below = 20
no_above = 0.5

topicModel = 'lda'
# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
weighting = ('tf' if topicModel == 'lda' else 'tfidf')

top_topic_words = 7
top_document_labels_per_topic = 2

n_topics = len(LABELDICT)  # len(set(ticketcorpus[0].metadata.keys()))+1  # +1 for a default topic

####################'####################

printlog("vectorize corpus...")

vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in ticketcorpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.__getattribute__("id_to_term")
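
# doc_term_matrix is a sparse document-term matrix with one row per ticket and one column per term;
# id2term maps column indices back to the term strings (used when printing the top topic terms below).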


##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################

# Initialize and train a topic model
printlog("Initialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)

# Transform the corpus and interpret our model:
printlog("Transform the corpus and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)

print()

for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
    print('topic', topic_idx, ':', ' '.join(top_terms))

print()

for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
    print(topic_idx)
    for j in top_docs:
        print(ticketcorpus[j].metadata['categoryName'])

#####################################################################################################################
print()
print()


"""
##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################

jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

# create the input file
textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus), filepath=LLDA_filepath)

# wait for the file to exist
while not os.path.exists(LLDA_filepath):
    time.sleep(1)

print("\n\n")
printlog("start LLDA:")

# run the JGibbsLLDA jar
FNULL = open(os.devnull, 'w')  # suppress output
subprocess.call(["java",
                 "-cp", "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root),
                 "jgibblda.LDA",
                 "-est",
                 "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
                 "-dfile", "tickets.gz",
                 "-twords", str(top_topic_words),
                 "-ntopics", str(n_topics)], stdout=FNULL)

# NOTE: the output files are hidden; they can be found in models/

# twords
subprocess.call(["gzip",
                 "-dc",
                 "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
#####################################################################################################################
print()
print()
"""


end = time.time()
printlog("\n\n\nTime Elapsed Topic Modeling: {0}\n\n".format(end - start))