commit before refactoring

This commit is contained in:
parent 6ea03b2f65
commit ecc8c0c54a
@@ -41,8 +41,8 @@ filename=topicModelTickets.log

[de_corpus]

#input=M42-Export/Tickets_small.csv
input=M42-Export/de_tickets.csv
input=M42-Export/Tickets_small.csv
#input=M42-Export/de_tickets.csv

path=corpi/

@@ -64,7 +64,7 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI

#ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC

custom_words=geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
custom_words=eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
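Editor's note: a minimal sketch of how a section like [de_corpus] above could be consumed. The file name config.ini and the loading code are assumptions for illustration, not the project's actual init code; only the section and key names come from the diff.

```python
# Hypothetical reader for the settings shown above; the loader itself is an assumption.
from configparser import ConfigParser

config = ConfigParser()
config.read("config.ini")                        # hypothetical file name

input_csv = config.get("de_corpus", "input")     # e.g. M42-Export/Tickets_small.csv
corpus_dir = config.get("de_corpus", "path")     # e.g. corpi/
# custom_words is a comma-separated stopword list; assumed here to sit in the same section
custom_words = config.get("de_corpus", "custom_words").split(",")
```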
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
main.py (31 lines changed)
@@ -2,18 +2,21 @@
import matplotlib
matplotlib.use('Agg')
import time
import init

import corporization
import preprocessing
import topicModeling
import cleaning
from miscellaneous import *

from miscellaneous import *

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
start = time.time()

import init
init.main()

#init.main()
logprint("")

corporization.main()

@@ -25,14 +28,32 @@ logprint("")
preprocessing.main() # ~5h
logprint("")

#topicModeling.main(use_raw=False,algorithm="llda")

"""
topicModeling.main(use_raw=False,algorithm="lsa")
logprint("")

#topicModeling.main(use_raw=True)

topicModeling.main(use_raw=False,algorithm="lda")
logprint("")

topicModeling.main(use_raw=False,algorithm="nmf")
logprint("")

topicModeling.main(use_raw=False,algorithm="llda")
logprint("")
"""

logprint("")

end = time.time()
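Editor's note: the second hunk comments out the per-algorithm topicModeling.main(...) calls. Those runs could also be driven by a single loop; the sketch below reuses names from main.py (topicModeling.main, logprint, with the assumption that logprint is defined in miscellaneous, as the star import suggests) and is an editor's suggestion, not part of the commit.

```python
# Sketch only: loops over the algorithms listed in the commented-out block of
# main.py and keeps the start/end timing pattern used there.
import time
import topicModeling
from miscellaneous import logprint   # assumption: logprint lives in miscellaneous

start = time.time()
for algorithm in ("lsa", "lda", "nmf", "llda"):
    topicModeling.main(use_raw=False, algorithm=algorithm)
    logprint("")
end = time.time()
logprint("pipeline took {:.1f}s".format(end - start))
```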
miscellaneous.py (104 lines changed)
@@ -154,18 +154,6 @@ def printRandomDoc(textacyCorpus):
    print()


def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text

def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata

def saveplaincorpustext(corpus,path):
    textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )


def save_corpus(corpus, corpus_path, corpus_name):
    """

@@ -175,42 +163,22 @@ def save_corpus(corpus, corpus_path, corpus_name):
    :param corpus_name: str (should content the language like "_de_")
    """

    """
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    #todo save vocab?
    """

    # save parser
    parser = corpus.spacy_lang
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    ##

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    #save plain content
    # save plain content + meta
    plainpath = corpus_path + corpus_name + "_content.json"
    textacy.fileio.write_json_lines(({"index" : doc.corpus_index, "content" : doc.text} for doc in corpus), plainpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    #meta_gen = (doc.metadata.update({"index": doc.corpus_index}) for doc in corpus)
    meta_gen = gen_meta(corpus)
    textacy.fileio.write_json_lines(meta_gen, metapath)
    textacy.fileio.write_json_lines(gen_dicts(corpus), plainpath)


def gen_meta(corpus):

def gen_dicts(corpus):
    for doc in corpus:
        meta = doc.metadata
        meta.update({"index": doc.corpus_index})
        yield meta
        dict = {"index" : doc.corpus_index, "content" : doc.text}
        dict.update(doc.metadata)
        yield dict
@@ -233,7 +201,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
    # load parser
    parser = spacy.load(lang)

    stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)
@@ -244,46 +211,35 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
    #load corpus
    corpus = textacy.Corpus(parser)

    contentpath = corpus_path + corpus_name + "_content.bin"
    plainpath = corpus_path + corpus_name + "_content.json"
    metapath = corpus_path + corpus_name + "_meta.json"
    """
    try:
        spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
        metadata_stream = textacy.fileio.read_json_lines(metapath)

        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            corpus.add_doc(
                textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    except:
    """
    # neu init!!
    #corpus = textacy.Corpus(parser)

    plain_stream = textacy.fileio.read_json_lines(plainpath) # yields {int : str}
    metadata_stream = textacy.fileio.read_json_lines(metapath)

    for plain, metadata in zip(plain_stream, metadata_stream):
        if plain["index"] == metadata["index"]:
            corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata))
        else:
            raise IndexError

    for plain in plain_stream:
        meta = {}
        for key,value in plain.items():
            if key != "content" and key != "index":
                meta[key] = value
        corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))

    return corpus, corpus.spacy_lang


"""
def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text

def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata

def saveplaincorpustext(corpus,path):
    textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )

def save_corpusV2(corpus, corpus_path, corpus_name):
    """
    saves a textacy-corpus including spacy-parser
    :param corpus: textacy-Corpus
    :param corpus_path: str
    :param corpus_name: str (should content the language like "_de_")
    """

    # save parser
    parser = corpus.spacy_lang
@@ -302,13 +258,7 @@ def save_corpusV2(corpus, corpus_path, corpus_name):
            file.write(json.dumps(doc.metadata))

def load_corpusV2(corpus_path, corpus_name, lang="de"):
    """
    Load textacy-Corpus including spacy-parser out from file
    :param corpus_path: str
    :param corpus_name: str (should content the language like "_de_")
    :param lang: str (language code) ir spacy.Language
    :return: texracy.Corpus, spacy.language
    """

    # ckeck for language
    if "de_" in corpus_name:
@@ -359,5 +309,5 @@ def yield_fromdir(path,spacy_vocab=None,type=".pkl"):
    else:
        for filename in filelist:
            yield load_obj(path+filename)

"""
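Editor's note: a hedged usage sketch of the save_corpus/load_corpus pair kept by this commit. The corpus name, path and sample ticket are placeholders; the corpus is built with the same old textacy/spaCy calls (spacy.load, textacy.Corpus, textacy.Doc, corpus.add_doc) that the file itself uses, so this reflects the source's own API usage rather than a verified recipe.

```python
# Illustration only: placeholder path, name and ticket text.
import spacy
import textacy
from miscellaneous import save_corpus, load_corpus

parser = spacy.load("de")
corpus = textacy.Corpus(parser)
corpus.add_doc(textacy.Doc("Drucker im Gebaeude 5 funktioniert nicht.",
                           lang=parser, metadata={"TicketNumber": "INC-0001"}))

save_corpus(corpus, corpus_path="corpi/", corpus_name="test_de")
corpus, parser = load_corpus(corpus_path="corpi/", corpus_name="test_de")
```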
topicModeling.py (104 lines changed)
@@ -9,7 +9,7 @@ import sys
import json
import os.path
import subprocess
from textacy import Vectorizer
from textacy import Vectorizer, viz

from miscellaneous import *
import textacy
@@ -163,7 +163,8 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
    jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/"

    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
    dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
    #dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
    dict_path = FILEPATH +"results/labeldict.txt".format(jgibbsLLDA_root)

    # printlog(str("LABELDICT: {0}".format(labeldict)))
    #logprint(str("LABELDICT-length: {0}".format(len(labeldict))))
@@ -243,6 +244,30 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F

    textacy.fileio.write_file_lines(result, path2save_results)
    #####################################################################################################################

    #todo llda termite plot
    """
    topic_inds=[] #<class 'list'>: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

    # get topic and term labels
    # <class 'tuple'>: ('topic 0', 'topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5', 'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10', 'topic 11', 'topic 12', 'topic 13', 'topic 14')
    topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)

    # <class 'tuple'>: ('hardware', 'raum', 'adresse', 'gebaeude', 'tu', 'uni', 'ticket', 'email', 'account', 'nummer', 'mail', 'outlook', 'karte', 'drucker', 'server', 'service', 'antwort', 'verbindung', 'herzliche', 'einrichten', 'vergessen', 'wenden', 'ews', 'anhang', 'form', 'konto', 'nachricht', 'unterstuetzung', 'passwort', 'unicard', 'semester', 'system', 'aenderung', 'rueckmeldung', 'meldung', 'zugreifen', 'login', 'adressat', 'sender', 'kurs', 'made', 'mittwoch', 'note', 'our', 'korrespondenz', 'unbeschadet', 'boss', 'unterrichten', 'telefax', 'zugang', 'probleme', 'zugriff', 'mitarbeiterin', 'internet', 'daten', 'anmeldung', 'aendern', 'unterschrift', 'loeschen', 'anmelden', 'datei', 'message', 'laptop', 'benoetigt', 'link', 'montag', 'programm', 'ordner', 'personal', 'rechner', 'veranstaltung', 'august', 'lizenz', 'anschluss', 'mitarbeiter', 'erwuenscht', 'umzug', 'pc', 'uniaccount', 'amt', 'fax', 'it', 'institut', 'nutzer', 'bild', 'type', 'prof', 'verantwortlicher', 'bemerkung', 'antragsteller', 'element', 'hahn', 'eintrag', 'telefonbuch', 'ansprechpartner', 'universitaet', 'physik', 'abteilung', 'fakultaet', 'software', 'dezernat', 'einrichtung', 'telefon', 'lehrstuhl', 'buero')
    term_labels = tuple(id2term[term_ind] for term_ind in term_inds)

    # get topic-term weights to size dots
    #[[ 0.02721858 -0.03898025 0.00047936 ..., 0.05862538 -0.07742336 0.04761928]
    # [ 0.14977875 -0.24192522 -0.00620335 ..., -0.0497216 0.08269951 -0.05715901]
    # [ 0.04977951 0.02296709 0.01214562 ..., 0.11444371 -0.15212482 0.21481788]
    # ...,
    # [
    term_topic_weights = np.array([self.model.components_[topic_ind][term_inds]
                                   for topic_ind in topic_inds]).T

    viz.draw_termite_plot(
        term_topic_weights, topic_labels, term_labels, save=path2save_results)
    """
    logprint("")

    end = time.time()
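Editor's note: the commented-out block above is the "#todo llda termite plot". A self-contained illustration of the same call, with invented labels and random weights standing in for the model-derived values; only viz.draw_termite_plot and its argument order are taken from the block above.

```python
# Illustration only: random weights and made-up labels; the real block above
# would derive these from the fitted model.
import numpy as np
from textacy import viz

topic_labels = tuple("topic {}".format(i) for i in range(3))
term_labels = ("drucker", "account", "mail", "raum", "server")
# rows = terms, columns = topics, as in the commented-out code above
term_topic_weights = np.random.rand(len(term_labels), len(topic_labels))

viz.draw_termite_plot(term_topic_weights, topic_labels, term_labels,
                      save="termite_llda.png")
```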
@@ -275,9 +300,9 @@ def main(use_raw=False, algorithm="llda"):
    # idee https://en.wikipedia.org/wiki/Noisy_text_analytics
    # idee https://gate.ac.uk/family/

    # todo gescheites tf(-idf) maß finden
    # todo topics zusammenfassen
    # frage wieviele tickets pro topic?
    # todo llda topics zusammenfassen
    # idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics
    # frage lda wieviele tickets pro topic?

    """
    ngrams = 1
@@ -300,25 +325,25 @@ def main(use_raw=False, algorithm="llda"):
    if algorithm == "llda":
        top_topic_words = 5
        add_default_topic = False
        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                   add_default_topic=add_default_topic)

        top_topic_words = 5
        add_default_topic = True
        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                   add_default_topic=add_default_topic)

        top_topic_words = 10
        add_default_topic = False
        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                   add_default_topic=add_default_topic)

        top_topic_words = 10
        add_default_topic = True
        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                   add_default_topic=add_default_topic)
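Editor's note: the four nearly identical blocks above differ only in top_topic_words and add_default_topic. A minimal sketch of the same runs as a loop; jgibbsLLDA, de_corpus and resultspath are the names used above, while the loop form itself is an editor's suggestion, not code from the commit.

```python
# Sketch only: covers the same four (top_topic_words, add_default_topic)
# combinations as the repeated blocks above.
from itertools import product

for top_topic_words, add_default_topic in product((5, 10), (False, True)):
    path2save_results = resultspath + "_{}_{}.txt".format(
        "top" + str(top_topic_words), "wdef" if add_default_topic else "")
    jgibbsLLDA(de_corpus, path2save_results=path2save_results,
               top_topic_words=top_topic_words,
               add_default_topic=add_default_topic)
```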
@@ -339,15 +364,70 @@ def main(use_raw=False, algorithm="llda"):

        labeldict = {k: v for v, k in enumerate(labelist)}

    n_topics = 15

    textacyTopicModeling(ngrams = 1,
                         min_df = 1,
                         max_df = 0.8,
                         max_df = 0.9,
                         topicModel = algorithm,
                         n_topics =n_topics,
                         n_topics =15,
                         corpus=de_corpus)

    textacyTopicModeling(ngrams=1,
                         min_df=1,
                         max_df=0.9,
                         topicModel=algorithm,
                         n_topics=20,
                         corpus=de_corpus)

    textacyTopicModeling(ngrams=1,
                         min_df=1,
                         max_df=0.9,
                         topicModel=algorithm,
                         n_topics=25,
                         corpus=de_corpus)

    textacyTopicModeling(ngrams=1,
                         min_df=1,
                         max_df=0.9,
                         topicModel=algorithm,
                         n_topics=30,
                         corpus=de_corpus)

    textacyTopicModeling(ngrams=(1, 2),
                         min_df=1,
                         max_df=0.9,
                         topicModel=algorithm,
                         n_topics=15,
                         corpus=de_corpus)

    textacyTopicModeling(ngrams = (1,2),
                         min_df = 1,
                         max_df = 0.9,
                         topicModel = algorithm,
                         n_topics =20,
                         corpus=de_corpus)

    textacyTopicModeling(ngrams = (1,2),
                         min_df = 1,
                         max_df = 0.9,
                         topicModel = algorithm,
                         n_topics =25,
                         corpus=de_corpus)

    textacyTopicModeling(ngrams = (1,2),
                         min_df = 1,
                         max_df = 0.9,
                         topicModel = algorithm,
                         n_topics =30,
                         corpus=de_corpus)

    """
    textacyTopicModeling(ngrams = (1,2),
                         min_df = 1,
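Editor's note: the hunk above repeats the same textacyTopicModeling call for n_topics in {15, 20, 25, 30} and ngrams in {1, (1, 2)}. A compact equivalent as nested loops; textacyTopicModeling, algorithm and de_corpus are the names from the file, and the loop form is an editor's sketch rather than code from the commit.

```python
# Sketch only: same parameter grid as the repeated calls above.
for ngrams in (1, (1, 2)):
    for n_topics in (15, 20, 25, 30):
        textacyTopicModeling(ngrams=ngrams,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=n_topics,
                             corpus=de_corpus)
```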