last commit. became too convoluted

new version is IMTC_TopicModeling
jannis.grundmann 2017-12-19 17:12:35 +01:00
parent 412f25d8d8
commit 1ae9d00c16
17 changed files with 1364 additions and 230 deletions

View File

@@ -49,15 +49,20 @@ def clean(stringstream):#, NOUNS):
     #string = textacy.preprocess.unidecode(string)
     # seperate_words_on_regex:
-    string = " ".join(re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]').split(string))
+    string = " ".join(re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]').split(string)) #todo bla vllt lassen wir das hier? achaj: für header und footer vllt englische-spracherkennung und adressen parsing und grußfromelerkennung
     #normalize whitespace
     string = textacy.preprocess.normalize_whitespace(string)
+    #todo bla textacy.preprocess.remove_accents(text, method=u'unicode')[source]
     #remove linebreaks
-    string = re.sub(r'[\n]', " ", string)
+    string = re.sub(r'[\n]', " ", string) #todo bla kann/soll raus? weil absätze vllt weas zu bedeuten haben
-    string = replaceRockDots(string)
+    string = replaceRockDots(string) #todo bla gehört zu normalize
     """
     # fehler großschreibung durch nomenliste zu korrigieren funzt nicht so richtig, da auch innerhalb des Statzes wörter verändert werden.
@@ -91,13 +96,70 @@ def clean(stringstream):#, NOUNS):
         yield string
 
+def processDictstream_v2(dictstream, keys_to_clean):
+    for dic in dictstream:
+        result = {k: re.sub(r'[.!?]', "", normalize_str(v).lower()) if k in keys_to_clean else v for k, v in dic.items()}
+        yield result
+
+def processDictstream(dictstream, funcdict, parser):
+    """
+    :param dictstream: dict-gen
+    :param funcdict:
+            clean_in_meta = {
+            "Solution":funclist,
+            ...
+            }
+    :param parser: spacy-parser
+    :return: dict-gen
+    """
+    for dic in dictstream:
+        result = {}
+        for key, value in dic.items():
+            if key in funcdict:
+                doc = parser(value)
+                tokens = [tok for tok in doc]
+                funclist = funcdict[key]
+                tokens = filterTokens(tokens, funclist)
+                result[key] = " ".join([tok.lower_ for tok in tokens])
+            else:
+                result[key] = value
+        yield result
+
+def filterTokens(tokens, funclist):
+    # in:tokenlist, funclist
+    # out: tokenlist
+    for f in funclist:
+        tokens = list(filter(f, tokens))
+    for tok in tokens:
+        if tok.pos_ == "NOUN":
+            x = 0
+    return tokens
 
 ##################################################################################################
 corpus_de_path = FILEPATH + config.get("de_corpus", "path")
 
-def cleanCorpus(corpus):
+def cleanCorpus(corpus,clean_in_meta):
     logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))
     """
@@ -122,9 +184,12 @@ def cleanCorpus(corpus):
     # Actually clean the corpus
     cleaned_corpus = textacy.Corpus(parser)
     cleaned_corpus.add_texts(
         clean(corpus2Text(raw_corpus)),
-        corpus2Meta(raw_corpus)
+        #processDictstream(corpus2Meta(cleaned_corpus), clean_in_meta, parser=parser)
+        processDictstream_v2(corpus2Meta(raw_corpus),clean_in_meta)
     )
@@ -143,14 +208,26 @@ def cleanCorpus(corpus):
     return cleaned_corpus
 
+def removePOS(pos_list):
+    return lambda tok: tok.pos_ not in pos_list
 
 def main(corpus):
     start = time.time()
 
+    clean_in_meta = {
+        "Solution": [removePOS(["SPACE"])],
+        "Subject": [removePOS(["SPACE", "PUNCT"])],
+        "categoryName": [removePOS(["SPACE", "PUNCT"])]
+    }
+    clean_in_meta = ["Subject", "categoryName" ]
 
-    cleaned_corpus = cleanCorpus(corpus)
+    cleaned_corpus = cleanCorpus(corpus, clean_in_meta)
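The new `_v2` path no longer parses metadata with spaCy; it only normalizes, lowercases and strips sentence punctuation in the whitelisted fields. A minimal standalone sketch of that behaviour (normalize_str is approximated here; the real helper lives in miscellaneous.py):

```python
import re

def normalize_str_approx(s):
    # stand-in for miscellaneous.normalize_str (replaceRockDots + whitespace normalization)
    for src, dst in (("ß", "ss"), ("ö", "oe"), ("ü", "ue"), ("ä", "ae")):
        s = s.replace(src, dst)
    return " ".join(s.split())

def processDictstream_v2(dictstream, keys_to_clean):
    # lowercase, normalize and strip .!? only in the whitelisted metadata fields
    for dic in dictstream:
        yield {k: re.sub(r'[.!?]', "", normalize_str_approx(v).lower()) if k in keys_to_clean else v
               for k, v in dic.items()}

meta = [{"Subject": "Drucker läuft nicht!", "TicketNumber": "INC12345"}]
print(list(processDictstream_v2(meta, ["Subject"])))
# [{'Subject': 'drucker laeuft nicht', 'TicketNumber': 'INC12345'}]
```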

View File

@@ -67,7 +67,7 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI
 autocorrect = false
 #true
-custom_words=aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderung,test,erwuenscht,antragsteller,bemerkung,tu,uni,prof,bezeichnung,gramm,type,eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok,januar,februar,maerz,april,mai,juni,juli,august,september,oktober,november,dezember
+custom_words=problem,without,aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderung,test,erwuenscht,antragsteller,bemerkung,tu,uni,prof,bezeichnung,gramm,type,eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok,januar,februar,maerz,april,mai,juni,juli,august,september,oktober,november,dezember
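custom_words is kept as one comma-separated value; the preprocessing step presumably loads it via configparser and splits on commas. A small sketch under that assumption (the section name below is assumed, adjust to the actual config.ini layout):

```python
import configparser

config = configparser.ConfigParser()
config.read("config.ini")  # path relative to the repo root, assumed

# turn the comma-separated value into a stopword list (section name assumed)
custom_words = [w.strip() for w in config.get("preprocessing", "custom_words").split(",") if w.strip()]
print(len(custom_words), custom_words[:3])
```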

View File

@@ -0,0 +1,24 @@
[neuanschluss] telefon dezernat uniaccount pavillon telefonbuch mail amt telefon benoetigt telefon speicherpl.f.die ausreichend moebel aufgestellt.weder
[neuanschluss] telefon dekanat bereich studium lehre uniaccount ct g2 telefonbuch mail amt anschluss faxgeraet rufnummer
[lan] service netzwerkanschluss kollegen intranet mail kollegen leitung vpn verbindung intranet netzwerk wlan netz netzwerk mitteilen wenden
[betrieb] support unicard browser service wochen
[elektronisches telefonbuch] telefon umzug astrid.gramm@tu-dortmund.de dezernat uniaccount dezernat telefonbuch mail
[verwaltung] laptop klaerung dezernat organisationsentwicklung mail
[umzug] telefon umzug uniaccount chemie telefonbuch mail anschluss fakultaet berufung telefonanlage gesondert b.
[umzug] telefon umzug uniaccount chemie telefonbuch mail anschluss fax anschluss
[umzug] telefon umzug uniaccount chemie telefonbuch mail anschluss hochschullehrer fakultaet telefonanlage anschlusses
[lsf] service semester semester stelle studium wahlmodulen experimentelle webseite erstellen vision stufen kommilitonen informatik fakultaet erfinden sichtweise daten boss schaffen aspekt studienplan uebersicht aktuelle ansicht semester modul stufe veranstaltungsinformationen studiengang fakultaet dozent semester turnus orientierung anhand semester automatisiert erstellen datenbank dozent turnus uhrzeit beschreibung boss programmierschnittstelle datenabfrage login benutzername passwort einzelne benutzer erlaubnis liste boss nummer format xml pdf gespraech klaeren
[sap] mitarbeiter schulung berechtigung budget kennung ort schulung
[gastaufenthalt] damen pruefung email adresse honorarprofessor ing vorlesungen bereich satellitenkommunikation fakultaet elektrotechnik informationstechnik mitarbeiter lehrstuhl hochfrequenztechnik lehrstuhl email adresse sinnvolle kommunikation hilfsmittel ews sinne email adresse
[sap] schulung dezernat zuhoeren berechtigung budget lage account
[fk raumplanung 09] pc modus
[sap] kolleginnen kollegen schulung anfaenger verwendung feld dezentral zugreifen uebersicht alternative budget berechtigung transaktionen fb60 dezernat sekretariaten kuerze fk05 statistik einsatz korrektur kurze rueckmeldung freischaltung einrichtungen
[fiona] mitarbeiter fachgebiet regionalsoziologie fakultaet raumplanung fachgebiet alte homepage homepage erscheinungsbild aktuell kenne programm umstellung einstiegsschulung vornehmen besprechen taeglich buero erreichen bescheid weber gb iii raumplanung waehlen mithilfe
[fk 12] hi zugang fk12-adresse aendern
[uniaccount] meldung zugangsdaten passwort rechtzeitig zugang problemlos account
[elektronisches telefonbuch] telefon umzug lehrstuhl uniaccount physik telefonbuch mail nr mitnehmen
[abmeldung] telefon abmeldung uniaccount telefonbuch mail besitzer nr
[telefon] telefon geraet display defekt telefon wenden -5886
[neuanschluss] telefon leitung uniaccount telefonbuch mail amt telefon auszubildende sekretariat azubi sekretariat
[uni mail] kenntnisnahme loesung alte passwort aendern erklaert passwort buero server absturz problemlos passwort unabhaengig telefonats service geloest erstmal rueckmeldung vorgehensweise kollegen geloest service antrag dienstreise passwort alte passwort mail dienstreisen antrag passwort system unding offenbar it sachverhalt systemausfall wochen reibungslos
[uni mail] webmailer text einfuegen
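Each line in this dump follows the `[label] tok1 tok2 ...` convention that the LLDA code consumes; splitting a line back into label and tokens uses the same regex/rpartition pattern as gen_lines_from_labeled_lines in topicModeling.py:

```python
import re

line = "[neuanschluss] telefon dezernat uniaccount pavillon telefonbuch mail"
labels = re.findall(r'\[(.*?)\]', line)    # ['neuanschluss']
tokens = line.rpartition("]")[2].split()   # ['telefon', 'dezernat', 'uniaccount', ...]
print(labels, tokens[:3])
```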

View File

@@ -60,8 +60,8 @@ def ticket_csv_to_DictStream(path2csv,content_collumn_name):
     for i, lst in enumerate(stream):
         if i == 0:
             for j, col in enumerate(lst):
-                if "icketNumb" in col:
+                if "icketNumb" in col: #korrigieren der .csv todo wenn hier sowieso hardgecodet werden muss, dann gleich auch config.ini raus?
                     col = "TicketNumber"
                 metalist.append(str(col))
                 metaindices.append(j)
             metadata_template = dict(
@@ -89,7 +89,7 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")
 
-def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0):
+def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0): #todo bla das kann hier die main sein
     """
     Use textacy to create a Corpus out of the ITMC-Ticket.csv
@@ -105,17 +105,19 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printra
     path_csv_split = path2_csv.split("/")
     filename = path_csv_split[len(path_csv_split) - 1]
 
-    logprint("Corporization of {0} at {1}".format(filename, datetime.now()))
+    logprint("Corporization of {0}".format(filename))#, datetime.now()))
 
     raw_corpus = textacy.Corpus(lang)
 
-    ## add files to textacy-corpi,
+    ## add files to textacy-corpi, todo bla hier cleanen, dict nich vergessn
     raw_corpus.add_texts(
         ticketcsv_to_textStream(path2_csv, content_collumn_name),
         ticket_csv_to_DictStream(path2_csv,content_collumn_name)
     )
 
     # leere docs aus corpi kicken
     raw_corpus.remove(lambda doc: len(doc) == 0)
@@ -147,4 +149,4 @@ def main():
 if __name__ == "__main__":
     main()
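ticketcsv2Corpus keeps the ticket text and its metadata aligned as two parallel generators handed to Corpus.add_texts. A rough standalone sketch of that pairing (older textacy API as used in this repo; path and column name are illustrative only):

```python
import csv
import textacy

def texts_and_meta(path2csv, content_col, delimiter=";"):
    # yield (text, metadata) pairs, metadata being every other column of the row
    with open(path2csv, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f, delimiter=delimiter):
            yield row.get(content_col, ""), {k: v for k, v in row.items() if k != content_col}

pairs = list(texts_and_meta("M42-Export/Tickets_small.csv", "Description"))  # path/column assumed
raw_corpus = textacy.Corpus("de")
raw_corpus.add_texts((text for text, meta in pairs), (meta for text, meta in pairs))
raw_corpus.remove(lambda doc: len(doc) == 0)  # kick empty docs, as above
```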

View File

@@ -281,7 +281,7 @@ def build_words_for_spellchecking(path2words):
 def main():
     start = time.time()
-    logprint("Init: {0}".format(datetime.now()))
+    logprint("Init")#: {0}".format(datetime.now()))
 
     ressources_path = FILEPATH + "ressources/"

main.py (18 changed lines)
View File

@@ -27,13 +27,13 @@ start = time.time()
 # idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics
 # frage welche mitarbeiter bearbeiteten welche Topics? idee topics mit mitarbeiternummern erstzen
 # idee word vorher mit semantischen netz abgleichen: wenn zu weit entfernt, dann ignore
-# idee lda2vec
 # todo modelle testen
+# todo ticket2kbkeys, subj, cats in init.py
 
-logprint("main.py started at {}".format(datetime.now()))
+logprint("main.py started")
 
 init.main()
@@ -45,9 +45,15 @@ logprint("")
 cleaned_corpus = cleaning.main(raw_corpus)
 logprint("")
 
-pre_corpus = preprocessing.main(cleaned_corpus)
+doc_term_matrix, id2term_dict = preprocessing.main(cleaned_corpus)
 logprint("")
 
+topicModeling.textacyTopicModeling_v2(doc_term_matrix, id2term_dict)
 
 """
 ticket_number = "INC40484"
 raw=""
@@ -89,11 +95,11 @@ logprint("")
 logprint("")
 """
 
-topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="llda")
+#topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="llda")
 logprint("")
 
-topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="lda")
+#topicModeling.main(pre_corpus=pre_corpus,cleaned_corpus=cleaned_corpus,algorithm="lda")
 logprint("")

View File

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from datetime import datetime
 import configparser as ConfigParser
 import csv
 import functools
@@ -47,6 +48,7 @@ logging.basicConfig(filename=filename, level=level)
 def logprint(string, level="INFO"):
     """log and prints"""
+    string = "{}\t".format(datetime.now()) + str(string)
     print(string)
     if level == "INFO":
         logging.info(string)
@@ -145,14 +147,14 @@ def sort_dictionary(dict):
     return sorted(dict.items(), key=operator.itemgetter(1))
 
-def normalize(string):
-    # replaceRockDots
-    string = re.sub(r'[ß]', "ss", string.lower())
-    string = re.sub(r'[ö]', "oe", string)
-    string = re.sub(r'[ü]', "ue", string)
-    string = re.sub(r'[ä]', "ae", string)
-    string = textacy.preprocess.normalize_whitespace(string)
-    return string
+def normalize_str(string):
+    """
+    replaceRockDots
+    textacy.preprocess.normalize_whitespace
+    :param string: str
+    :return: str
+    """
+    return textacy.preprocess.normalize_whitespace(replaceRockDots(string))
 
 def deprecated(func):
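The rewritten helper delegates the umlaut/ß transliteration to replaceRockDots and then normalizes whitespace; a quick standalone check of the intended behaviour (replaceRockDots is approximated here since its definition is not part of this diff, including whether it lowercases):

```python
import re

def replaceRockDots_approx(s):
    # assumed behaviour: lowercase and transliterate German umlauts/ß
    s = s.lower()
    for pat, repl in ((r'[ä]', "ae"), (r'[ö]', "oe"), (r'[ü]', "ue"), (r'[ß]', "ss")):
        s = re.sub(pat, repl, s)
    return s

def normalize_str(s):
    # replaceRockDots + whitespace normalization, as in the new one-liner above
    return " ".join(replaceRockDots_approx(s).split())

print(normalize_str("  Zugang   für   Büro  "))  # 'zugang fuer buero'
```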
@@ -200,14 +202,18 @@ def corpus2Meta(corpus):
     for doc in corpus:
         yield doc.metadata
 
-def savelabledCorpiLines(corpus,filepath):
+def savelabledCorpiLines_cat(corpus, filepath):
     textacy.fileio.write_file_lines(gen_labledLines(corpus), filepath=filepath)
 
-def gen_labledLines(corpus):
+def gen_labledLines(corpus, label ="categoryName"):
     for doc in corpus:
         # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
-        yield "[" + doc.metadata["categoryName"] + "] " + doc.text
+        yield "[" + doc.metadata[label] + "] " + doc.text
 
 def save_corpus(corpus, corpus_path, corpus_name):
@@ -235,6 +241,8 @@ def gen_dicts(corpus):
         dict.update(doc.metadata)
         yield dict
 
 def multisub(subs, subject):
     #https://stackoverflow.com/questions/764360/a-list-of-string-replacements-in-python
     "Simultaneously perform all substitutions on the subject string."

File diff suppressed because it is too large

test.py (207 changed lines)
View File

@@ -14,17 +14,220 @@ from scipy import *
import json
import draw
"""
import matplotlib
matplotlib.use('Agg')
import os
import time
from textacy import Vectorizer
from itertools import tee
start = time.time()
from gensim.models import Doc2Vec
from datetime import datetime
import textacy
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
from miscellaneous import *
from ressources.iir.lda.llda import *
import numpy as np
import re
import draw
# http://universaldependencies.org/u/pos/
#corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/", corpus_name="de_clean")
# array of zeros and ones interleaved
lrg = np.arange(2).reshape((2,-1)).repeat(1000000,-1).flatten()
flt = lrg[lrg==0]
flt = np.array(filter(lambda x:x==0, lrg))
lines_txt = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/corpi_vor_besprechung/labeled.txt"
labelset, corpus, labels = load_corp(lines_txt)
llda = LLDA(20, 0.001, 0.001)
llda.set_corpus(labelset, corpus, labels)
for i in range(10):
    llda.inference()
phi = llda.phi()
#print(llda.vocas)
#for v, voca in enumerate(llda.vocas):
# print ','.join([voca]+[str(x) for x in llda.n_z_t[:,v]])
#print(','.join([voca] + [str(x) for x in phi[:, v]]))
################# termite plot #####################################################################
topic_labels = list(labelset)
term_labels = list(llda.vocas)
term_topic_weights = phi.transpose()
threshmin = 0.005
from scipy.stats import threshold
thresholded = threshold(term_topic_weights, threshmin=threshmin)
draw.draw_termite(thresholded, topic_labels, term_labels, save="test.png")
exit()
KBA10184_text = "wenn Sie Ihr UniAccount Passwort ändern möchten, können Sie dies im Service Portal unter folgendem Link durchführen: https://service.tu-dortmund.de/uniaccount-passwort"
corpus = textacy.Corpus("de")
preprocess_replace_urls = textacy.preprocess.replace_urls(KBA10184_text,replace_with="URL")
print(preprocess_replace_urls)
preprocess_replace_urls = textacy.preprocess.transliterate_unicode(KBA10184_text)
print(preprocess_replace_urls)
#corpus.add_text(preprocess_replace_urls)
to_corr = "Sehr geehrtes ITMC Service Team, seit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen " \
"An das Intranet der BMP Mit der Dosennummer G1 303 04 12.05 G1 4 26-1 in Raum G1-426 nicht Mehr Zu funktionieren. " \
"Ich Wuerde Sie daher bitten diese Mail An den zustaendigen Kollegen weiterzuleiten," \
" Um Die Leitung Vielleicht Einmal Zu Pruefen. Des Weiteren haette Ich noch Eine Frage " \
"bezueglich der Moeglichkeit zur Nutzung Einer VPN Verbindung Aus unserem Intranet" \
" heraus Zu einem fremden Netzwerk. Dies ist zwar Ueber das WLAN-Netz moeglich, jedoch nicht Aus unserem Netzwerk heraus."
to_corr = "Wichtiger Hinweis: Die Information in dieser E-Mail ist vertraulich. Sie ist ausschließlich für den Adressaten bestimmt. Sollten Sie nicht der für diese E-Mail bestimmte Adressat sein, unterrichten Sie bitte den Absender und vernichten Sie diese Mail. Vielen Dank. Unbeschadet der Korrespondenz per E-Mail, sind unsere Erklärungen ausschließlich final rechtsverbindlich, wenn sie in herkömmlicher Schriftform (mit eigenhändiger Unterschrift) oder durch Übermittlung eines solchen Schriftstücks per Telefax erfolgen."
corpus.add_text(to_corr)
for doc in corpus:
    for sent in doc.sents:
        for tok in sent:
            print(tok.text, tok.pos_, tok.dep_)
corpus.add_text("dfsaf fdsa fdsa",metadata={ "x" : "test"})
corpus.add_text("iuzizi gbjh iuzit",metadata={ "x" : "testo"})
vectorizer = Vectorizer(weighting="tf", min_df=1, max_df=1.0)
# create tuples of term_list and metadata for each doc
terms_meta_gen, terms_meta_gen_copy = tee(((doc.to_terms_list(ngrams=(1,2), as_strings=True), doc.metadata) for doc in corpus))
terms_list_gen, terms_list_gen_copy = tee((term_meta[0] for term_meta in terms_meta_gen))
doc_term_matrix = vectorizer.fit_transform(terms_list_gen)
id2term = vectorizer.id_to_term
term2id = vectorizer.vocabulary
for k,v in term2id.items():
    print(k,doc_term_matrix[0,v])
#{0: 'dfsaf', 1: 'fdsa', 2: 'dfsaf fdsa', 3: 'fdsa fdsa', 4: 'iuzizi', 5: 'gbjh', 6: 'iuzit', 7: 'iuzizi gbjh', 8: 'gbjh iuzit'}
#line_gen = ( doc.metadata["x"] + " ".join([term for term in])
def gen_lines(docterm,term2id,corpus,label):
    for i,doc in enumerate(corpus):
        line = "[" + doc.metadata[label] + "]"
        for term, id_ in term2id.items():
            if doc_term_matrix[i, id_] != 0:
                term = term if len(term.split()) == 1 else "_".join(term.split())
                line = line + " " + term
        yield line

for line in gen_lines(doc_term_matrix,term2id,corpus,"x"):
    print(line)
#doc.to_terms_list(ngrams=2, as_strings=True)
# "" \
# "" \
# "" ".join(
#[term if term in id2term.values() else "" for term in terms_meta[0]]) for terms_meta in terms_meta_gen_copy)
label = "x"
#for line in line_gen:
# print(line)
#terms_meta_gen = ( (doc.to_terms_list(ngrams=2, as_strings=True),doc.metadata) for doc in corpus)
for x in terms_meta_gen:
    print(x)

#terms_list_gen = (term_meta[0] for term_meta in terms_meta_gen)
for x in terms_list_gen:
    print(x)

for doc in corpus:
    for term in doc.to_terms_list(ngrams=2, as_strings=True):
        print(type(term))
for doc in corpus:
    for span in textacy.extract.ngrams(doc,2,
                                       filter_stops=True, filter_punct=True,
                                       filter_nums=False, include_pos=None,
                                       exclude_pos=None, min_freq=1):
        print(span.text)
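For the two toy docs above, gen_lines should emit one labeled line per document with multi-word ngrams rejoined by underscores; a dependency-free stand-in that reproduces the expected output (term order follows first occurrence here, while the Vectorizer's vocabulary dict may order terms differently):

```python
docs = [("test", "dfsaf fdsa fdsa"), ("testo", "iuzizi gbjh iuzit")]
for label, text in docs:
    toks = text.split()
    ngrams = toks + [" ".join(pair) for pair in zip(toks, toks[1:])]         # unigrams + bigrams
    uniq = sorted(set(ngrams), key=ngrams.index)                             # keep first-seen order
    print("[" + label + "] " + " ".join("_".join(t.split()) for t in uniq))  # bigrams get "_"
# [test] dfsaf fdsa dfsaf_fdsa fdsa_fdsa
# [testo] iuzizi gbjh iuzit iuzizi_gbjh gbjh_iuzit
```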

View File

@@ -66,14 +66,13 @@ def textacyTopicModeling(corpus,
     ###### vectorize corpi
     vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
 
     terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
     doc_term_matrix = vectorizer.fit_transform(terms_list)
 
-    id2term = vectorizer.__getattribute__("id_to_term")
+    #id2term = vectorizer.__getattribute__("id_to_term")
@@ -113,8 +112,264 @@
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))


def textacyTopicModeling_v2(doc_term_matrix, id_to_term,
                            n_topics = 15, top_topic_words = 3,
                            topicModel='lda'):

    n_terms = int(n_topics * top_topic_words)
    sort_terms_by = 'seriation'  # 'seriation', 'weight', 'index', 'alphabetical'
    rank_terms_by = 'corpus'  # 'corpus', 'topic'

    logprint("#### Topic Modeling {0}".format(topicModel))
    logprint(str("n_topics: {0}".format(n_topics)))
    logprint("\n")

    start = time.time()

    # http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
    weighting = ('tf' if topicModel == 'lda' else 'tfidf')

    ####### Initialize and train a topic model
    model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)

    for topic_idx, top_terms in model.top_topic_terms(id_to_term, top_n=top_topic_words, weights=True):
        logprint('{0}: {1}'.format(topic_idx, str(top_terms)))

    ####################### termite plot ###################################################################
    draw1.termite_plot(model, doc_term_matrix, id_to_term,
                       n_terms=n_terms,
                       sort_terms_by=sort_terms_by,
                       rank_terms_by=rank_terms_by + '_weight',
                       save=FILEPATH + "results/{}.png".format(topicModel))

    end = time.time()
    logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
def create_ticket2label_dict(ticket2chunk_dict,corpus):
    """
    Creates a dictionary to map a TicketNumber to a label

    :param ticket2chunk_dict: e.g. { TicketNumber : KB_entries }
    :return: {TicketNumber : label }
    """

    labelist = ticket2chunk_dict.values()
    labelist = flatten(labelist)

    labeldict = create_labeldict(labelist, min_label_freq=1, add_default_label=True)

    ticket2label = {}
    for doc in corpus:
        ticketID = doc.metadata["TicketNumber"]
        keywords = ticket2chunk_dict.get(ticketID, ['DEFAULT'])

        label = ""
        for kw in keywords:
            label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " "

        ticket2label.update({ticketID: label})

    return ticket2label
def create_labeldict(labelist, min_label_freq=1, add_default_label=True):

    # nur die x häufigsten labels benutzen
    labelist = [l for l in labelist if labelist.count(l) >= min_label_freq]

    in_labelist_ = {k: labelist.count(k) for k in labelist}  # { label1 : 3 , label2 : 5, label3 : 1 }
    labelist = sort_dictionary(in_labelist_)  # [ (label3, 1), (label1, 3), (label2, 5) ]
    labelist.reverse()  # [ (label2, 5), (label1, 3), (label3, 1) ]
    labeldict = {elem[0]: i for i, elem in enumerate(labelist)}  # { label2 : 0, label1 : 1 , label3 : 2 }

    if add_default_label:
        if 'DEFAULT' not in labeldict.keys():
            labeldict.update({'DEFAULT': len(labelist)})  # { label2 : 0, label1 : 1 , label3 : 2 , DEFAULT : 3 }
    return labeldict
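A worked example of the ranking behaviour (run within this module, since it uses sort_dictionary): the most frequent label gets id 0, labels below min_label_freq are dropped, and DEFAULT is appended at the end:

```python
labelist = ["lan", "lan", "umzug", "umzug", "umzug", "sap"]
print(create_labeldict(labelist, min_label_freq=2))
# {'umzug': 0, 'lan': 1, 'DEFAULT': 2}   ('sap' occurs only once and is dropped)
```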
#todo
def jgibbsLLDAv2(labeled_lines_path,ticket2kbs_dict, cleaned_corpus, path2save_results, top_topic_words=7):

    ticket2label_dict = create_ticket2label_dict(ticket2kbs_dict, cleaned_corpus)

    # reduce ticket2label_dict
    labeldict = {}
    label_list = list(set(ticket2label_dict.values()))
    lbl_dict = {elem : i for i,elem in enumerate(label_list)}
    labeldict = {k : lbl_dict[v] for k,v in ticket2label_dict.items()}
    labeldict.update({"DEFAULT" : len(labeldict)})

    def gen_lines_from_labeled_lines(input,ticket2label_dict):
        line_gen = textacy.fileio.read_file_lines(input)
        for line in line_gen:
            label = re.findall(r'\[(.*?)\]',line)
            new_label = "[ "
            for lbl in label:
                new_label = new_label + str(ticket2label_dict.get(str(lbl),"")).strip() + " "
            new_label = new_label + "] "
            result = new_label + str(line.rpartition("]")[2])
            # new_label = str([ticket2label_dict.get(str(lbl),"") for lbl in label])
            # result = "[ " + new_label + " ] " + line.rpartition("]")[2]
            #print(result)
            yield result

    labeldict_rev = {v: k for k, v in labeldict.items()}

    #line_gen = gen_lines_from_labeled_lines(labeled_lines_path,ticket2label_dict)
    line_gen = gen_lines_from_labeled_lines(labeled_lines_path,labeldict)

    jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)

    textacy.fileio.write_file_lines(line_gen, filepath=LLDA_filepath)
    # wait for file to exist
    while not os.path.exists(LLDA_filepath):
        time.sleep(1)

    # run JGibbsLLDA file
    n_topics = len(labeldict) #+1 #default-topic

    FNULL = open(os.devnull, 'w')  # supress output
    cmd_jgibbs_java = ["java", "-cp",
                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
                           jgibbsLLDA_root),
                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
                       "tickets.gz",
                       "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
    subprocess.call(cmd_jgibbs_java, stdout=FNULL)

    # ANMERKUNG: Dateien sind versteckt. zu finden in models/
    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
    output = subprocess.check_output(cmd_gzip).decode("utf-8")

    topic_regex = re.compile(r'Topic [0-9]*')

    #####################################
    # todo save results in file aufgrund von results
    result = []

    for line in output.splitlines():
        findall = topic_regex.findall(line)
        if len(findall) != 0:
            try:
                index = int(findall[0].split()[1])
                result.append("Topic {} {}:".format(index, str(ticket2kbs_dict[labeldict_rev[index]])))
            except:
                result.append(line)
        else:
            result.append(line)

    textacy.fileio.write_file_lines(result, path2save_results+".txt")
    #####################################
    results = []
    res_dict = {}
    count = 0
    for line in output.splitlines():
        findall = topic_regex.findall(line)

        if len(findall) != 0:
            if len(res_dict) != 0:
                results.append(res_dict)  # vorheriges an die liste ran (ist ja dann fertig)
            index = int(findall[0].split()[1])
            res_dict = {index : str(labeldict_rev[index]) }
        else:
            splitted = line.split()
            res_dict[splitted[0]] = float(splitted[1])

    if len(res_dict) != 0:
        results.append(res_dict)  # letzes an die liste ran

    # every term in the resulsts to a list
    terms = []
    for res in results:
        for key, value in res.items():
            if not isinstance(key, int) and not key in terms:
                terms.append(key)
    term2id = {t:i for i,t in enumerate(terms)}  # and to dict

    ################# termite plot #####################################################################
    topic_labels = list(range(len(labeldict)))
    term_labels = list(range(len(term2id)))  # tuple([key for key in term2id.keys()])
    term_topic_weights = np.zeros((len(term2id), len(topic_labels)))

    for i, res in enumerate(results):
        for key, value in res.items():
            if not isinstance(key, int):
                term_topic_weights[term2id[key]][i] = value
                term_labels[term2id[key]] = key
            else:
                topic_labels[i] = labeldict_rev[key]

    draw.draw_termite(
        term_topic_weights, topic_labels, term_labels, save=path2save_results + ".png")

    draw.draw_termite(
        term_topic_weights, topic_labels, term_labels, save=path2save_results + "_spaced.png", pow_x=0.78, pow_y=0.87)

    # save labeldict
    labeldict_path = path2save_results + "_labeldict.json"
    with open(labeldict_path, 'w') as file:
        file.write(json.dumps(labeldict))
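The two parsing loops above assume the decompressed .twords output is a sequence of `Topic <n>` headers, each followed by whitespace-separated `term weight` rows; a self-contained stand-in to illustrate what ends up in results (the real file comes from JGibbsLLDA and may carry extra columns):

```python
import re

output = "Topic 0:\ntelefon 0.12\numzug 0.08\nTopic 1:\npasswort 0.21\nuniaccount 0.14"
topic_regex = re.compile(r'Topic [0-9]*')

results, res_dict = [], {}
for line in output.splitlines():
    hit = topic_regex.findall(line)
    if hit:
        if res_dict:
            results.append(res_dict)            # previous topic block is complete
        res_dict = {int(hit[0].split()[1]): hit[0]}
    else:
        term, weight = line.split()
        res_dict[term] = float(weight)
if res_dict:
    results.append(res_dict)

print(results)
# [{0: 'Topic 0', 'telefon': 0.12, 'umzug': 0.08}, {1: 'Topic 1', 'passwort': 0.21, 'uniaccount': 0.14}]
```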
@@ -259,15 +514,21 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
     logprint("start Category-LLDA:")
 
+    # labeldict ############################################################################################
     # build dictionary of ticketcategories
     labelist = []
     for doc in corpus:
-        category = normalize(doc.metadata["categoryName"])
+        category = normalize_str(doc.metadata["categoryName"])
         labelist.append(category)
 
-    # frage nur die x häufigsten labels benutzen, rest raus?
-    labelist = [l for l in labelist if labelist.count(l) > 50 ]
+    x = 50 # frage nur die x häufigsten labels benutzen, rest raus?
+    labelist = [l for l in labelist if labelist.count(l) > x ]
 
     in_labelist_ = {k: labelist.count(k) for k in labelist}
     labelist = sort_dictionary(in_labelist_)
@@ -290,13 +551,13 @@ def jgibbsLLDA_category(corpus, path2save_results, top_topic_words=7):
     labeldict.update({'DEFAULT': len(labelist)})
 
+    ##############################################################################################
 
     def gen_cat_lines(textacyCorpus, labeldict):
         """ generates [topic1, topic2....] tok1 tok2 tok3 out of corpi"""
         for doc in textacyCorpus:
-            label = labeldict.get(normalize(doc.metadata["categoryName"]), labeldict['DEFAULT'])
+            label = labeldict.get(normalize_str(doc.metadata["categoryName"]), labeldict['DEFAULT'])
 
             if label is not 'DEFAULT':
@@ -324,6 +585,11 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa
     logprint("")
     logprint("start {}-LLDA:".format("Keyword" if kb_keywords else "Subject"))
 
+    # labeldict ############################################################################################
     # ticket2kb_dict
     kb2ticket_gen = textacy.fileio.read_csv(FILEPATH + "M42-Export/KB2Ticket_2017-09-13.csv", delimiter=";")
@@ -358,7 +624,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa
         subject = line[1]
         keywords = line[2]
-        keywords_list = [normalize(x) for x in str(keywords).split(",")]
+        keywords_list = [normalize_str(x) for x in str(keywords).split(",")]
 
         if kb_id not in kb2keywords_dict.keys():
             kb2keywords_dict[kb_id] = []
@@ -406,7 +672,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa
         keywords = kb2keywords_dict.get(kb_id, None)
 
         if keywords and kb_id:
-            used_keywords.append(list(map(normalize,keywords)))
+            used_keywords.append(list(map(normalize_str, keywords)))
@@ -418,6 +684,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa
     labeldict = {k: v for v, k in enumerate(labelist)}
 
+    ##############################################################################################
 
     def gen_KB_lines(textacyCorpus, labeldict, ticket2kb_dict, kb2keywords_dict):
@@ -433,7 +700,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa
             label = ""
             for kw in keywords:
-                label = label + str(labeldict.get(normalize(str(kw)), len(labeldict))) + " "
+                label = label + str(labeldict.get(normalize_str(str(kw)), len(labeldict))) + " "
 
             yield "[ " + label + "] " + doc.text
@@ -451,7 +718,6 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words = 7, kb_keywords=Fa
     logprint("\n\n\nTime Elapsed {1}-LLDA :{0} min\n\n".format((end - start) / 60,"Keyword" if kb_keywords else "Subject"))
 
 def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
     start = time.time()
@@ -461,6 +727,10 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
+    # labeldict ############################################################################################
 
     # kb2keywords_dict / kb2subjects_dict --> {str : [str]}
@@ -476,9 +746,9 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
         kb_id = line[0]
 
-        subject = normalize(line[1])
-        keywords = [normalize(x) for x in str(line[2]).split(",")]
+        subject = normalize_str(line[1])
+        keywords = [normalize_str(x) for x in str(line[2]).split(",")]
 
         if kb_id not in kb2keywords_dict.keys():
@@ -488,9 +758,9 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
         if kb_id not in kb2subjects_dict.keys():
-            kb2subjects_dict[kb_id] = [normalize(subject) if subject != [''] else "DEFAULT"]
+            kb2subjects_dict[kb_id] = [normalize_str(subject) if subject != [''] else "DEFAULT"]
         else:
-            kb2subjects_dict[kb_id].append(normalize(subject))
+            kb2subjects_dict[kb_id].append(normalize_str(subject))
@@ -586,8 +856,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
     labelist = list(set(labelist))
     labeldict = {k: v for v, k in enumerate(labelist)}
 
-    ##############################################################################################
 
     def gen_key_lines(textacyCorpus, labeldict, ticket2keywords_dict):
@@ -601,7 +870,7 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
             label = ""
             for kw in keywords:
-                label = label + str(labeldict.get(normalize(str(kw)), labeldict['DEFAULT'])) + " "
+                label = label + str(labeldict.get(normalize_str(str(kw)), labeldict['DEFAULT'])) + " "
 
             yield "[ " + label + "] " + doc.text
@@ -642,10 +911,9 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
 def load_from_labled_lines(path):
     path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/pre_labled_lines_wo_lemma_061217.txt"
 
-    #idee plan
+    #idee
     # clean laden, pre laden
-    # unigramme und num/wort-bigramme doc-term # frage wie geht llda mit bigrammen um? idee bigramme mit _ verbinden
-    # nimm nur ngrams wo midn. ein token in pre vorkommt
+    # unigramme und num/wort-bigramme doc-term # frage wie geht llda mit bigrammen um? idee bigramme mit _ verbinden # nimm nur ngrams wo midn. ein token in pre vorkommt

View File

@@ -415,7 +415,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=Fals
         keywords = kb2keywords_dict.get(kb_number, None)
 
         if keywords and kb_number:
-            used_keywords.append(list(map(normalize,keywords)))
+            used_keywords.append(list(map(normalize_str, keywords)))
 
     kb_entries_used = (len(list(set([kb for kb in ticket2kb_dict.values()]))))
     print("kb_entries_used: {}".format(kb_entries_used))
@@ -447,7 +447,7 @@ def jgibbsLLDA_KB(corpus, path2save_results, top_topic_words=7, kb_keywords=Fals
             label = ""
             for kw in keywords:
-                label = label + str(labeldict.get( normalize(str(kw)) , len(labeldict))) + " "
+                label = label + str(labeldict.get(normalize_str(str(kw)), len(labeldict))) + " "
 
             yield "[ " + label + "] " + doc.text