# -*- coding: utf-8 -*-
"""Entry script for the ticket topic-modeling pipeline.

Runs the stages in a fixed order -- corporization, cleaning, preprocessing,
then topic modeling with "llda" followed by "lda" -- and logs the start
time, the finish time and the total elapsed minutes via ``logprint``
(brought into scope by the wildcard import from ``miscellaneous``).
"""
import matplotlib

# The backend is selected before the project modules below are imported.
# NOTE(review): presumably because they import pyplot and the script is run
# headless (see the ssh/nohup command below) -- confirm.
matplotlib.use('Agg')

import time
import init
from datetime import datetime

import corporization
import preprocessing
import topicModeling
import cleaning

from miscellaneous import *

# Remote invocation used during development:
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/printout_main.log &"

start = time.time()

# idea: http://bigartm.org/
# idea: http://wiki.languagetool.org/tips-and-tricks
# idea: https://en.wikipedia.org/wiki/Noisy_text_analytics
# idea: https://gate.ac.uk/family/

# idea: drop frequent n-grams (e.g. "damen und herren")
# idea: merge/summarize the LLDA topics
# idea: train LDA so that the term <-> topic assignment does not become too
#       weak while still yielding as many topics as possible
# question: which employees handled which topics?
# idea: replace topics with employee numbers
# idea: check each word against a semantic network first; ignore it if it is
#       too far away
# TODO: test the models

logprint("main.py started at {}".format(datetime.now()))

# Initialisation step is currently disabled:
# init.main()
logprint("")

# Each stage consumes the corpus returned by the previous one; an empty log
# line separates the stages in the log output.
raw_corpus = corporization.main()
logprint("")

cleaned_corpus = cleaning.main(raw_corpus)
logprint("")

pre_corpus = preprocessing.main(cleaned_corpus)
logprint("")

# Disabled debug helper: compare one ticket token by token across the raw,
# cleaned and preprocessed corpora.
#
# ticket_number = "INC40484"
# raw=""
# pre=""
# clean=""
# for r in raw_corpus.get(lambda doc: doc.metadata["TicketNumber"] == ticket_number):
#     raw = r
# for c in cleaned_corpus.get(lambda doc: doc.metadata["TicketNumber"] == ticket_number):
#     clean = c
# for p in pre_corpus.get(lambda doc: doc.metadata["TicketNumber"] == ticket_number):
#     pre = p
#
# for tok1,tok2,tok3 in zip(raw,clean,pre):
#     logprint(tok1.text,tok1.pos_)
#     logprint(tok2.text,tok2.pos_)
#     logprint(tok3.text,tok3.pos_)

# Disabled: print a few random documents from the cleaned corpus.
# for i in range(5):
#     printRandomDoc(cleaned_corpus)

# Disabled alternative algorithms:
# topicModeling.main(algorithm="lsa")
# logprint("")
#
# topicModeling.main(algorithm="nmf")
# logprint("")

# Run topic modeling once per algorithm, "llda" first and then "lda", each
# followed by an empty separator line in the log.
for algorithm in ("llda", "lda"):
    topicModeling.main(
        pre_corpus=pre_corpus,
        cleaned_corpus=cleaned_corpus,
        algorithm=algorithm,
    )
    logprint("")

end = time.time()
logprint("main.py finished at {}".format(datetime.now()))

elapsed_minutes = (end - start) / 60
logprint("Total Time Elapsed: {0} min".format(elapsed_minutes))