2017-10-16 14:01:38 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
2017-10-30 12:56:52 +01:00
|
|
|
import matplotlib
|
|
|
|
# Select matplotlib's 'Agg' backend before the project modules below are
# imported. NOTE(review): the script is launched through ssh/nohup (see the
# command comment further down), so presumably no display is available and a
# file-only backend is required -- confirm against the plotting code.
matplotlib.use('Agg')
|
2017-10-17 10:13:49 +02:00
|
|
|
import time
|
2017-11-03 11:49:26 +01:00
|
|
|
import init
|
2017-11-27 12:49:05 +01:00
|
|
|
from datetime import datetime
|
2017-10-16 14:01:38 +02:00
|
|
|
import corporization
|
|
|
|
import preprocessing
|
2017-10-18 17:37:20 +02:00
|
|
|
import topicModeling
|
2017-10-25 09:46:44 +02:00
|
|
|
import cleaning
|
2017-10-16 14:01:38 +02:00
|
|
|
|
2017-11-03 11:49:26 +01:00
|
|
|
from miscellaneous import *
|
2017-10-16 14:01:38 +02:00
|
|
|
|
2017-11-06 12:54:59 +01:00
|
|
|
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/printout_main.log &"
|
2017-10-17 10:13:49 +02:00
|
|
|
# Wall-clock timestamp taken before the pipeline runs; it is subtracted from
# `end` at the bottom of the script to log the total runtime in minutes.
start = time.time()
|
2017-10-16 14:01:38 +02:00
|
|
|
|
2017-11-17 11:46:57 +01:00
|
|
|
# idee http://bigartm.org/
|
|
|
|
# idee http://wiki.languagetool.org/tips-and-tricks
|
|
|
|
# idee https://en.wikipedia.org/wiki/Noisy_text_analytics
|
|
|
|
# idee https://gate.ac.uk/family/
|
2017-11-03 11:49:26 +01:00
|
|
|
|
|
|
|
|
2017-11-17 11:46:57 +01:00
|
|
|
|
2017-11-27 12:49:05 +01:00
|
|
|
|
|
|
|
# idee remove frequent n-grams (e.g. the salutation "damen und herren")
|
2017-11-21 10:14:37 +01:00
|
|
|
# idee merge / summarize the llda topics
|
2017-11-17 11:46:57 +01:00
|
|
|
# idee train lda so that the term <-> topic association does not become too weak while still yielding as many topics as possible
|
|
|
|
# frage which employees handled which topics? idee replace topics with employee numbers
|
|
|
|
# idee check each word against a semantic network first: if it is too far away, ignore it
|
|
|
|
|
2017-11-21 10:14:37 +01:00
|
|
|
# todo test the models
|
2017-11-17 11:46:57 +01:00
|
|
|
|
|
|
|
|
2017-11-29 16:31:30 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
2017-12-08 11:06:07 +01:00
|
|
|
# ---------------------------------------------------------------------------
# Corpus stages.
#
# The corporization, cleaning and preprocessing modules are run in sequence;
# each main() receives the value returned by the previous stage. The three
# results stay bound at module level because the topic-modeling calls further
# down consume `cleaned_corpus` and `pre_corpus`.
#
# logprint comes from the wildcard import of `miscellaneous`; the
# logprint("") calls presumably just emit an empty separator line between
# stages -- TODO confirm against miscellaneous.logprint.
# ---------------------------------------------------------------------------
logprint("main.py started at {}".format(datetime.now()))

# The init step is switched off in this revision; the call is kept so it can
# be re-enabled.
# init.main()
logprint("")

raw_corpus = corporization.main()
logprint("")

cleaned_corpus = cleaning.main(raw_corpus)
logprint("")

pre_corpus = preprocessing.main(cleaned_corpus)
logprint("")

# ---------------------------------------------------------------------------
# Disabled debugging aid (previously parked in a bare string literal).
#
# It looks up the document whose "TicketNumber" metadata equals
# `ticket_number` in each of the three corpora and logs the `text` and `pos_`
# attributes of the raw / cleaned / preprocessed tokens side by side.
# NOTE(review): the indentation below is reconstructed; verify before
# re-enabling.
#
# ticket_number = "INC40484"
# raw = ""
# pre = ""
# clean = ""
# for r in raw_corpus.get(lambda doc: doc.metadata["TicketNumber"] == ticket_number):
#     raw = r
# for c in cleaned_corpus.get(lambda doc: doc.metadata["TicketNumber"] == ticket_number):
#     clean = c
# for p in pre_corpus.get(lambda doc: doc.metadata["TicketNumber"] == ticket_number):
#     pre = p
#
# for tok1, tok2, tok3 in zip(raw, clean, pre):
#     logprint(tok1.text, tok1.pos_)
#     logprint(tok2.text, tok2.pos_)
#     logprint(tok3.text, tok3.pos_)
# ---------------------------------------------------------------------------

# Disabled spot check: print five random documents of the cleaned corpus.
# for i in range(5):
#     printRandomDoc(cleaned_corpus)
|
2017-11-29 16:31:30 +01:00
|
|
|
|
2017-11-17 11:46:57 +01:00
|
|
|
|
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
|
2017-10-25 09:46:44 +02:00
|
|
|
|
2017-10-16 14:01:38 +02:00
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Topic modeling.
#
# The "lsa" and "nmf" runs are disabled (previously parked in a bare string
# literal); only the separator logprint("") calls between them were left
# uncommented inside that string.
#
# topicModeling.main(algorithm="lsa")
# logprint("")
# topicModeling.main(algorithm="nmf")
# logprint("")
# ---------------------------------------------------------------------------

# Active runs: both receive the preprocessed and the cleaned corpus produced
# by the stages above and differ only in the `algorithm` keyword.
topicModeling.main(pre_corpus=pre_corpus, cleaned_corpus=cleaned_corpus, algorithm="llda")
logprint("")

topicModeling.main(pre_corpus=pre_corpus, cleaned_corpus=cleaned_corpus, algorithm="lda")
logprint("")

# Runtime summary: `start` was taken with time.time() near the top of the
# script, so the same clock is used here; the difference in seconds is
# converted to minutes for the log line.
end = time.time()
logprint("main.py finished at {}".format(datetime.now()))
logprint("Total Time Elapsed: {0} min".format((end - start) / 60))
|
2017-10-18 17:37:20 +02:00
|
|
|
|