commit before refactoring

This commit is contained in:
parent 6ea03b2f65
commit ecc8c0c54a
@@ -41,8 +41,8 @@ filename=topicModelTickets.log
 [de_corpus]
-#input=M42-Export/Tickets_small.csv
-input=M42-Export/de_tickets.csv
+input=M42-Export/Tickets_small.csv
+#input=M42-Export/de_tickets.csv

 path=corpi/

@@ -64,7 +64,7 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI

 #ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC

-custom_words=geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
+custom_words=eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok

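A minimal sketch of how the [de_corpus] section above could be read, assuming the pipeline loads this INI-style file with Python's configparser; the helper name and the "config.ini" path are assumptions, only the section and key names come from the diff:

import configparser

def load_de_corpus_settings(path="config.ini"):
    # read the INI file and return the active input CSV and the corpus directory
    config = configparser.ConfigParser()
    config.read(path)
    section = config["de_corpus"]
    return section["input"], section["path"]

# with the new values above: input_csv == "M42-Export/Tickets_small.csv", corpus_dir == "corpi/"
input_csv, corpus_dir = load_de_corpus_settings()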
Binary files not shown.
main.py (31)
@@ -2,18 +2,21 @@
 import matplotlib
 matplotlib.use('Agg')
 import time
+import init

 import corporization
 import preprocessing
 import topicModeling
 import cleaning
-from miscellaneous import *

+from miscellaneous import *

 # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
 start = time.time()

-import init
-init.main()
+#init.main()
 logprint("")

 corporization.main()

@@ -25,14 +28,32 @@ logprint("")
 preprocessing.main() # ~5h
 logprint("")

-#topicModeling.main(use_raw=False,algorithm="llda")
+"""
+topicModeling.main(use_raw=False,algorithm="lsa")
 logprint("")

-#topicModeling.main(use_raw=True)

 topicModeling.main(use_raw=False,algorithm="lda")
 logprint("")


+topicModeling.main(use_raw=False,algorithm="nmf")
+logprint("")


+topicModeling.main(use_raw=False,algorithm="llda")
+logprint("")
+"""


 logprint("")

 end = time.time()

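After this change only corporization.main() and preprocessing.main() still run as pipeline steps: init.main() is commented out and every topicModeling.main() call now sits inside a module-level triple-quoted string. A hedged sketch of an equivalent, easier-to-toggle layout; the RUN_TOPICS flag and the loop are illustrative, not part of this commit:

RUN_TOPICS = False  # assumed flag; the commit disables these calls via a string literal instead

if RUN_TOPICS:
    # same algorithms as in the disabled block above
    for algo in ("lsa", "lda", "nmf", "llda"):
        topicModeling.main(use_raw=False, algorithm=algo)
        logprint("")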
miscellaneous.py (104)
@@ -154,18 +154,6 @@ def printRandomDoc(textacyCorpus):
     print()


-def corpus2Text(corpus):
-    for doc in corpus:
-        yield doc.text
-
-def corpus2Meta(corpus):
-    for doc in corpus:
-        yield doc.metadata
-
-def saveplaincorpustext(corpus,path):
-    textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )
-
-
 def save_corpus(corpus, corpus_path, corpus_name):
     """

@@ -175,42 +163,22 @@ def save_corpus(corpus, corpus_path, corpus_name):
     :param corpus_name: str (should content the language like "_de_")
     """

-    """
-    # save stringstore
-    stringstore_path = corpus_path + corpus_name + '_strings.json'
-    with open(stringstore_path, "w") as file:
-        parser.vocab.strings.dump(file)
-
-    #todo save vocab?
-    """
-
     # save parser
     parser = corpus.spacy_lang
     parserpath = corpus_path + str(parser.lang) + '_parser'
     parser.save_to_directory(parserpath)

-    ##
+    # save plain content + meta

-    # save content
-    contentpath = corpus_path + corpus_name + "_content.bin"
-    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
-
-    #save plain content
     plainpath = corpus_path + corpus_name + "_content.json"
-    textacy.fileio.write_json_lines(({"index" : doc.corpus_index, "content" : doc.text} for doc in corpus), plainpath)
+    textacy.fileio.write_json_lines(gen_dicts(corpus), plainpath)

-    # save meta
-    metapath = corpus_path + corpus_name + "_meta.json"
-    #meta_gen = (doc.metadata.update({"index": doc.corpus_index}) for doc in corpus)
-    meta_gen = gen_meta(corpus)
-    textacy.fileio.write_json_lines(meta_gen, metapath)
-
-
-def gen_meta(corpus):
+
+def gen_dicts(corpus):
     for doc in corpus:
-        meta = doc.metadata
-        meta.update({"index": doc.corpus_index})
-        yield meta
+        dict = {"index" : doc.corpus_index, "content" : doc.text}
+        dict.update(doc.metadata)
+        yield dict

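save_corpus now writes a single JSON-lines file: gen_dicts emits one dict per document carrying the corpus index, the plain text and all metadata fields, where previously content and metadata went to separate *_content.json and *_meta.json files. A hedged example of one such line with invented field values; only the "index" and "content" keys and the metadata field names from the config are taken from this commit:

# hypothetical record produced by gen_dicts (values invented):
# {"index": 0, "content": "Drucker im Gebaeude A druckt nicht", "TicketNumber": "INC-4711", "Subject": "Druckerproblem", "categoryName": "drucker"}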
@@ -233,7 +201,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
     # load parser
     parser = spacy.load(lang)

-
     stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
     with open(stringstorepath) as file:
         parser.vocab.strings.load(file)

@@ -244,46 +211,35 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
     #load corpus
     corpus = textacy.Corpus(parser)

-    contentpath = corpus_path + corpus_name + "_content.bin"
     plainpath = corpus_path + corpus_name + "_content.json"
-    metapath = corpus_path + corpus_name + "_meta.json"
-    """
-    try:
-        spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
-        metadata_stream = textacy.fileio.read_json_lines(metapath)
-
-        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
-            corpus.add_doc(
-                textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
-    except:
-    """
-    # neu init!!
-    #corpus = textacy.Corpus(parser)

     plain_stream = textacy.fileio.read_json_lines(plainpath) # yields {int : str}
-    metadata_stream = textacy.fileio.read_json_lines(metapath)
-
-    for plain, metadata in zip(plain_stream, metadata_stream):
-        if plain["index"] == metadata["index"]:
-            corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata))
-        else:
-            raise IndexError
+    for plain in plain_stream:
+        meta = {}
+        for key,value in plain.items():
+            if key != "content" and key != "index":
+                meta[key] = value
+        corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))

     return corpus, corpus.spacy_lang


+"""
+def corpus2Text(corpus):
+    for doc in corpus:
+        yield doc.text
+
+def corpus2Meta(corpus):
+    for doc in corpus:
+        yield doc.metadata
+
+def saveplaincorpustext(corpus,path):
+    textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )
+
 def save_corpusV2(corpus, corpus_path, corpus_name):
-    """
-    saves a textacy-corpus including spacy-parser
-    :param corpus: textacy-Corpus
-    :param corpus_path: str
-    :param corpus_name: str (should content the language like "_de_")
-    """
-
     # save parser
     parser = corpus.spacy_lang

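In the rewritten load_corpus the metadata is recovered per JSON line by copying every key except "content" and "index"; the inner loop is equivalent to the dict comprehension below (a restatement for clarity, not part of the commit):

meta = {key: value for key, value in plain.items() if key not in ("content", "index")}
corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))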
@@ -302,13 +258,7 @@ def save_corpusV2(corpus, corpus_path, corpus_name):
         file.write(json.dumps(doc.metadata))

 def load_corpusV2(corpus_path, corpus_name, lang="de"):
-    """
-    Load textacy-Corpus including spacy-parser out from file
-    :param corpus_path: str
-    :param corpus_name: str (should content the language like "_de_")
-    :param lang: str (language code) ir spacy.Language
-    :return: texracy.Corpus, spacy.language
-    """
-
     # ckeck for language
     if "de_" in corpus_name:

@@ -359,5 +309,5 @@ def yield_fromdir(path,spacy_vocab=None,type=".pkl"):
     else:
         for filename in filelist:
             yield load_obj(path+filename)
+"""

topicModeling.py (104)
@@ -9,7 +9,7 @@ import sys
 import json
 import os.path
 import subprocess
-from textacy import Vectorizer
+from textacy import Vectorizer, viz

 from miscellaneous import *
 import textacy

@@ -163,7 +163,8 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
     jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/"

     LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
-    dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
+    #dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
+    dict_path = FILEPATH +"results/labeldict.txt".format(jgibbsLLDA_root)

     # printlog(str("LABELDICT: {0}".format(labeldict)))
     #logprint(str("LABELDICT-length: {0}".format(len(labeldict))))

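The label dictionary is now read from FILEPATH + "results/labeldict.txt" instead of the java_LabledLDA model directory. Since "results/labeldict.txt" contains no placeholder, the trailing .format(jgibbsLLDA_root) is a no-op and the new line is equivalent to the simpler form below (a restatement, not part of the commit):

dict_path = FILEPATH + "results/labeldict.txt"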
@@ -243,6 +244,30 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F

     textacy.fileio.write_file_lines(result, path2save_results)
     #####################################################################################################################

+    #todo llda termite plot
+    """
+    topic_inds=[] #<class 'list'>: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
+
+    # get topic and term labels
+    # <class 'tuple'>: ('topic 0', 'topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5', 'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10', 'topic 11', 'topic 12', 'topic 13', 'topic 14')
+    topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)
+
+    # <class 'tuple'>: ('hardware', 'raum', 'adresse', 'gebaeude', 'tu', 'uni', 'ticket', 'email', 'account', 'nummer', 'mail', 'outlook', 'karte', 'drucker', 'server', 'service', 'antwort', 'verbindung', 'herzliche', 'einrichten', 'vergessen', 'wenden', 'ews', 'anhang', 'form', 'konto', 'nachricht', 'unterstuetzung', 'passwort', 'unicard', 'semester', 'system', 'aenderung', 'rueckmeldung', 'meldung', 'zugreifen', 'login', 'adressat', 'sender', 'kurs', 'made', 'mittwoch', 'note', 'our', 'korrespondenz', 'unbeschadet', 'boss', 'unterrichten', 'telefax', 'zugang', 'probleme', 'zugriff', 'mitarbeiterin', 'internet', 'daten', 'anmeldung', 'aendern', 'unterschrift', 'loeschen', 'anmelden', 'datei', 'message', 'laptop', 'benoetigt', 'link', 'montag', 'programm', 'ordner', 'personal', 'rechner', 'veranstaltung', 'august', 'lizenz', 'anschluss', 'mitarbeiter', 'erwuenscht', 'umzug', 'pc', 'uniaccount', 'amt', 'fax', 'it', 'institut', 'nutzer', 'bild', 'type', 'prof', 'verantwortlicher', 'bemerkung', 'antragsteller', 'element', 'hahn', 'eintrag', 'telefonbuch', 'ansprechpartner', 'universitaet', 'physik', 'abteilung', 'fakultaet', 'software', 'dezernat', 'einrichtung', 'telefon', 'lehrstuhl', 'buero')
+    term_labels = tuple(id2term[term_ind] for term_ind in term_inds)
+
+    # get topic-term weights to size dots
+    #[[ 0.02721858 -0.03898025 0.00047936 ..., 0.05862538 -0.07742336 0.04761928]
+    # [ 0.14977875 -0.24192522 -0.00620335 ..., -0.0497216 0.08269951 -0.05715901]
+    # [ 0.04977951 0.02296709 0.01214562 ..., 0.11444371 -0.15212482 0.21481788]
+    # ...,
+    # [
+    term_topic_weights = np.array([self.model.components_[topic_ind][term_inds]
+                                   for topic_ind in topic_inds]).T
+
+    viz.draw_termite_plot(
+        term_topic_weights, topic_labels, term_labels, save=path2save_results)
+    """
     logprint("")

     end = time.time()

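The block added above is a commented-out draft of an LLDA termite plot (hence the new viz import at the top of the file); topic_inds, term_inds, id2term and self.model are not defined in this function, which is why it is left as a todo. A hedged sketch of how the missing pieces might be filled in, assuming a fitted sklearn-style model exposing components_ and an id2term mapping from the vectorizer; every name below is an assumption:

import numpy as np

n_terms_shown = 25  # assumed display size
topic_inds = list(range(model.components_.shape[0]))   # one index per topic
# one possible choice: show the terms with the largest total weight across topics
term_inds = np.argsort(model.components_.sum(axis=0))[::-1][:n_terms_shown]
topic_labels = tuple('topic {}'.format(i) for i in topic_inds)
term_labels = tuple(id2term[i] for i in term_inds)
term_topic_weights = np.array([model.components_[i][term_inds] for i in topic_inds]).T
viz.draw_termite_plot(term_topic_weights, topic_labels, term_labels, save=path2save_results)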
@@ -275,9 +300,9 @@ def main(use_raw=False, algorithm="llda"):
     # idee https://en.wikipedia.org/wiki/Noisy_text_analytics
     # idee https://gate.ac.uk/family/

-    # todo gescheites tf(-idf) maß finden
-    # todo topics zusammenfassen
-    # frage wieviele tickets pro topic?
+    # todo llda topics zusammenfassen
+    # idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics
+    # frage lda wieviele tickets pro topic?

     """
     ngrams = 1

@@ -300,25 +325,25 @@ def main(use_raw=False, algorithm="llda"):
     if algorithm == "llda":
         top_topic_words = 5
         add_default_topic = False
-        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
         jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                    add_default_topic=add_default_topic)

         top_topic_words = 5
         add_default_topic = True
-        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
         jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                    add_default_topic=add_default_topic)

         top_topic_words = 10
         add_default_topic = False
-        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
         jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                    add_default_topic=add_default_topic)

         top_topic_words = 10
         add_default_topic = True
-        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdefault" if add_default_topic else "")
+        path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
         jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
                    add_default_topic=add_default_topic)

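The result-file suffix changes from "wdefault" to "wdef". With the settings above this yields names like the examples below; resultspath itself is set elsewhere and not shown in this diff:

# add_default_topic = True  ->  <resultspath>_top5_wdef.txt
# add_default_topic = False ->  <resultspath>_top5_.txt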
@@ -339,15 +364,70 @@ def main(use_raw=False, algorithm="llda"):

         labeldict = {k: v for v, k in enumerate(labelist)}

-        n_topics = 15
-
         textacyTopicModeling(ngrams = 1,
                              min_df = 1,
-                             max_df = 0.8,
+                             max_df = 0.9,
                              topicModel = algorithm,
-                             n_topics =n_topics,
+                             n_topics =15,
                              corpus=de_corpus)

+        textacyTopicModeling(ngrams=1,
+                             min_df=1,
+                             max_df=0.9,
+                             topicModel=algorithm,
+                             n_topics=20,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams=1,
+                             min_df=1,
+                             max_df=0.9,
+                             topicModel=algorithm,
+                             n_topics=25,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams=1,
+                             min_df=1,
+                             max_df=0.9,
+                             topicModel=algorithm,
+                             n_topics=30,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams=(1, 2),
+                             min_df=1,
+                             max_df=0.9,
+                             topicModel=algorithm,
+                             n_topics=15,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams = (1,2),
+                             min_df = 1,
+                             max_df = 0.9,
+                             topicModel = algorithm,
+                             n_topics =20,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams = (1,2),
+                             min_df = 1,
+                             max_df = 0.9,
+                             topicModel = algorithm,
+                             n_topics =25,
+                             corpus=de_corpus)
+
+        textacyTopicModeling(ngrams = (1,2),
+                             min_df = 1,
+                             max_df = 0.9,
+                             topicModel = algorithm,
+                             n_topics =30,
+                             corpus=de_corpus)
+
         """
         textacyTopicModeling(ngrams = (1,2),
                              min_df = 1,

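The single n_topics = 15 run is replaced by a sweep over 15, 20, 25 and 30 topics, for unigrams and for (1,2)-grams, all with max_df = 0.9. A hedged restatement of the same sweep as two loops; not part of the commit, just a compact equivalent of the calls above:

for ngrams in (1, (1, 2)):
    for n_topics in (15, 20, 25, 30):
        textacyTopicModeling(ngrams=ngrams,
                             min_df=1,
                             max_df=0.9,
                             topicModel=algorithm,
                             n_topics=n_topics,
                             corpus=de_corpus)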