preprocessing abgeschlossen ("preprocessing completed")
This commit is contained in:
parent 17e45c30af
commit 16d3e1cb70
@@ -1,92 +1,4 @@
"TicketNumber";"Subject";"CreatedDate";"categoryName";"Impact";"Urgency";"BenutzerID";"VerantwortlicherID";"EigentuemerID";"Description";"Solution"
|
|
||||||
"INC20357";"schulungstest";"21.07.2015 08:19:34";"ZHB";"2 - Mittel (Abt./Bereich)";"B - Normal";"aa8315f5-52c3-e411-80c7-0050569c58f5";"";"aa8315f5-52c3-e411-80c7-0050569c58f5";"kevin arbeite gefälligst :)";""
|
|
||||||
"INC40481";"Telephone Contract";"13.08.2015 14:18:57";"Neuanschluss";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Neuanschluss
|
|
||||||
Antragsteller:
|
|
||||||
Melanie Hinrichs
|
|
||||||
melanie.hinrichs@tu-dortmund.de
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Terminvorschlag unbestimmt
|
|
||||||
Einrichtung Dezernat 3
|
|
||||||
Abteilung Abteilung 2
|
|
||||||
PSP Element L-11-10000-100-302300
|
|
||||||
UniAccount myvowest(Westerdorf, Yvonne)
|
|
||||||
Gebäude Pavillon 8
|
|
||||||
Raum ID 031 (63292)
|
|
||||||
Telefondose keine vorhanden
|
|
||||||
Telefonnr. -
|
|
||||||
Eintrag Telefonbuch
|
|
||||||
E-Mail melanie.hinrichs@tu-dortmund.de
|
|
||||||
Voicemail Nicht erwünscht
|
|
||||||
Ansprechpartner Melanie Hinrichs
|
|
||||||
Tel. Ansprechpartner 5848
|
|
||||||
Verantwortlicher Nutzer -
|
|
||||||
Type Amt
|
|
||||||
Bemerkung:
|
|
||||||
Es wird ein Telefon benötigt,ein Telefon mit 6 Speicherpl.f.die Gruppenfunktion ist ausreichend. Die Möbel werden am 10.06.2015 aufgestellt.Weder Netzwerkdose noch Telefondose vorhanden. Dez.6 hat Vorbereitungen getroffen.";"Frau Hinrichs überdenkt die Situation und macht dann neue Anträge.
|
|
||||||
Dieses Ticket wird geschlossen"
|
|
||||||
"INC40483";"Telephone Contract";"13.08.2015 14:22:06";"Neuanschluss";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Neuanschluss
|
|
||||||
Antragsteller:
|
|
||||||
Anja Kulmsee
|
|
||||||
anja.kulmsee@tu-dortmund.de
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Terminvorschlag 03.08.2015
|
|
||||||
Einrichtung Fk06 Dekanat
|
|
||||||
Abteilung Bereich Studium und Lehre
|
|
||||||
PSP Element L-11-10000-100-060011
|
|
||||||
UniAccount manjkulm(Kulmsee, Anja)
|
|
||||||
Gebäude CT Geschossbau 2
|
|
||||||
Raum ID G2-3.22 (64882)
|
|
||||||
Telefondose
|
|
||||||
Telefonnr. -
|
|
||||||
Eintrag Telefonbuch
|
|
||||||
E-Mail anja.kulmsee@tu-dortmund.de
|
|
||||||
Voicemail Nicht erwünscht
|
|
||||||
Ansprechpartner Anja Kulmsee
|
|
||||||
Tel. Ansprechpartner 6179, 7370, 7179
|
|
||||||
Verantwortlicher Nutzer -
|
|
||||||
Type Amt
|
|
||||||
Bemerkung:
|
|
||||||
Der Anschluß ist für ein Faxgerät. Wenn möglich hätte ich gern die Rufnummer 3033.";"Faxnummer 3166 wurde unter die Telefonnummer 7179 im elektronischen Telefonbuch eingetragen"
|
|
||||||
"INC40484";"Defekte Netzwerkdose / Frage zu VPN";"13.08.2015 14:25:50";"LAN";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Sehr geehrtes ITMC Service Team,
|
|
||||||
|
|
||||||
seit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen an das Intranet der BMP mit der Dosennummer G1 303/04/12.05 (G1 4 26-1) in Raum G1-426 nicht mehr zu funktionieren.
|
|
||||||
Ich würde Sie daher bitten diese Mail an den zuständigen Kollegen weiterzuleiten, um die Leitung vielleicht einmal zu Prüfen.
|
|
||||||
|
|
||||||
Des Weiteren hätte ich noch eine Frage bezüglich der Möglichkeit zur Nutzung einer VPN Verbindung aus unserem Intranet heraus zu einem fremden Netzwerk. Dies ist zwar über das WLAN-Netz möglich, jedoch nicht aus unserem Netzwerk heraus. Vielleicht können Sie mir mitteilen an welchen Kollegen ich mich bezüglich dieses Problem wenden kann.
|
|
||||||
|
|
||||||
Bei Rückfragen stehe ich gerne zur Verfügung!
|
|
||||||
|
|
||||||
Beste Grüße,
|
|
||||||
|
|
||||||
Nicolas Rauner
|
|
||||||
|
|
||||||
LS Biomaterialien und Polymerwissenschaften
|
|
||||||
Fakultät Bio- und Chemieingenieurwesen
|
|
||||||
TU Dortmund
|
|
||||||
D-44227 Dortmund
|
|
||||||
|
|
||||||
Tel: + 49-(0)231 / 755 - 3015
|
|
||||||
Fax: + 49-(0)231 / 755 - 2480
|
|
||||||
|
|
||||||
www.ls-bmp.de <http://www.ls-bmp.de/>";"Hallo Herr Rauner,
|
|
||||||
die Netzwerkdose weist z. Z. keine Verbindungsprobleme auf. Falls doch welche bestehen, melden Sie sich bitte bei uns.
|
|
||||||
|
|
||||||
Mit freunldichen Grüßen
|
|
||||||
Aicha Oikrim"
|
|
||||||
"INC40487";"(SSO) Login via Browser mit Zertifikat";"13.08.2015 14:54:57";"Betrieb";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Lieber Support,
|
"INC40487";"(SSO) Login via Browser mit Zertifikat";"13.08.2015 14:54:57";"Betrieb";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Lieber Support,
|
||||||
ich habe gerade versucht mich mit meiner Unicard im Firefox-Browser für das
|
ich habe gerade versucht mich mit meiner Unicard im Firefox-Browser für das
|
||||||
Service-Portal zu authentifizieren. Das hat vor einigen Wochen noch tadelos
|
Service-Portal zu authentifizieren. Das hat vor einigen Wochen noch tadelos
|
||||||
|
|
Can't render this file because it contains an unexpected character in line 11 and column 4.
config.ini
@@ -1,86 +1,91 @@
 [thesaurus]
-input = deWordNet.xml
+input=deWordNet.xml
-pickle_file = thesaurus_dict.pkl
+pickle_file=thesaurus_dict.pkl


 [spellchecking]
-input = deu_news_2015_1M-sentences.txt
+input=deu_news_2015_1M-sentences.txt
-pickle_file = words_dict.pkl
+pickle_file=words_dict.pkl


 [lemmatization]
-input = lemmas.txt
+input=lemmas.txt
-pickle_file = lemma_dict.pkl
+pickle_file=lemma_dict.pkl


 [nouns]
-input1 = nomen.txt
+input1=nomen.txt
-input2 = nomen2.txt
+input2=nomen2.txt
-pickle_file = nouns_list.pkl
+pickle_file=nouns_list.pkl


 [firstnames]
-input = firstnames.txt
+input=firstnames.txt
-pickle_file = firstnames_list.pkl
+pickle_file=firstnames_list.pkl


 [de_stopwords]
-input1 = de_stopwords_1.txt
+input1=de_stopwords_1.txt
-input2 = de_stopwords_2.txt
+input2=de_stopwords_2.txt
-input3 = de_stopwords_3.txt
+input3=de_stopwords_3.txt
-pickle_file = stopwords_list.pkl
+pickle_file=de_stopwords_list.pkl

+[en_stopwords]
+pickle_file=en_stopwords_list.pkl


 [logging]
-level = INFO
+level=INFO
-filename = topicModelTickets.log
+filename=topicModelTickets.log


 [de_corpus]
-#input = M42-Export/Tickets_med.csv
+#input=M42-Export/Tickets_med.csv
-#input = M42-Export/Tickets_small.csv
+#input=M42-Export/Tickets_small.csv
-#input = M42-Export/Tickets_mini.csv
+#input=M42-Export/Tickets_mini.csv
-input = M42-Export/de_tickets.csv
+input=M42-Export/de_tickets.csv

-path = corpi/
+path=corpi/

 [en_corpus]
-input = M42-Export/en_tickets.csv
+input=M42-Export/en_tickets.csv

-path = corpi/
+path=corpi/


 [tickets]
-content_collumn_name = Description
+content_collumn_name=Description
-metaliste = TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID,VerantwortlicherID,EigentuemerID,Solution
+metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID,VerantwortlicherID,EigentuemerID,Solution


 [preprocessing]

-ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC
+ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC

-custom_words = grüßen,fragen,damen,probleme,herren,dank
+custom_words=geehrt,dame,herr,hilfe,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,institut,universitaet,name,gruss,id,erfolg,mail,folge,nummer,team,fakultaet,email,absender,tu,versenden,vorname,message,service,strasse,prozess,portal,raum,personal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,kollege,pruefen,hoffen

-#lemmatize = True
+#lemmatize=True

-[topic modeling]
+[topicmodeling]

-ngrams = (1,2)
+ngrams=(1,2)
-min_df = 0
+min_df=0
-max_df = 1.0
+max_df=1.0
-no_below = 20
+no_below=20
-no_above = 0.5
+no_above=0.5

-topicModel = lda
+topicModel=lda

-top_topic_words = 5
+top_topic_words=5

-top_document_labels_per_topic = 2
+top_document_labels_per_topic=2
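The scripts below read these settings through Python's configparser; the pattern used throughout the repository (it appears again in preprocessing.py and topicModeling.py further down) is roughly:

    import configparser as ConfigParser

    config = ConfigParser.ConfigParser()
    with open("config.ini") as f:
        config.read_file(f)

    # values come back as strings; the comma-separated metaliste is split by the caller
    content_collumn_name = config.get("tickets", "content_collumn_name")
    metaliste = config.get("tickets", "metaliste").split(",")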
@@ -7,6 +7,7 @@ import time
 from datetime import datetime
 import re
 import textacy
+from textacy.preprocess import normalize_whitespace
 from scipy import *
 import os

@@ -93,10 +94,8 @@ metaliste = [
 ]
 """

 content_collumn_name = config.get("tickets","content_collumn_name")
-metaliste = config.get("tickets","metaliste").split(",")
+metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(",")))

 path2de_csv = FILEPATH + config.get("de_corpus","input")
 corpus_de_path = FILEPATH + config.get("de_corpus", "path")

@@ -121,7 +120,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
 raw_corpus = textacy.Corpus(lang)

 ## add files to textacy-corpi,
-printlog("Add texts to {0}_textacy-corpi".format(lang))
+#printlog("Add texts to {0}_textacy-corpi".format(lang))

 raw_corpus.add_texts(
     ticketcsv_to_textStream(path2_csv, content_collumn_name),

@@ -140,6 +139,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
 # save corpus
 raw_name = lang + "_raw_ticket"
 save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)
+printlog("Done")


 def main():

@@ -148,7 +148,7 @@ def main():
 ticketcsv2Corpus(path2de_csv,corpus_de_path,content_collumn_name,metaliste,lang="de")

-ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en")
+#ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en")


 end = time.time()
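The switch to list(map(normalize_whitespace, ...)) guards against stray spaces around the commas in metaliste. A small sketch of the effect (textacy.preprocess.normalize_whitespace collapses whitespace runs and strips the ends; the spaced-out value here is invented for illustration):

    from textacy.preprocess import normalize_whitespace

    raw = "TicketNumber, Subject ,CreatedDate".split(",")
    # ['TicketNumber', ' Subject ', 'CreatedDate']
    cleaned = list(map(normalize_whitespace, raw))
    # ['TicketNumber', 'Subject', 'CreatedDate']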
init.py
@@ -264,7 +264,9 @@ path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
 stop1 = FILEPATH + config.get("de_stopwords","input1")
 stop2 = FILEPATH + config.get("de_stopwords","input2")
 stop3 = FILEPATH + config.get("de_stopwords","input3")
-path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file")
+path2stopwordlist_de = FILEPATH + config.get("de_stopwords","pickle_file")
+path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file")

@@ -293,8 +295,9 @@ def main():
 printlog("Build and save stoppwortliste")
-de_stop_words = create_stopword_lists(stop1, stop2, stop3)
+de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3)
-save_obj(de_stop_words, path2stopwordlist)
+save_obj(de_stop_words, path2stopwordlist_de)
+save_obj(en_stop_words, path2stopwordlist_en)
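create_stopword_lists() now returns both a German and an English list, and each is pickled separately. save_obj() and load_obj() live in miscellaneous.py and are not part of this diff; a minimal pickle-based stand-in (an assumption about their behaviour, not the repository's actual code) would be:

    import pickle

    def save_obj(obj, path):
        # serialize any picklable object (dict, list, set, ...) to disk
        with open(path, "wb") as f:
            pickle.dump(obj, f)

    def load_obj(path):
        # read it back in a later pipeline stage
        with open(path, "rb") as f:
            return pickle.load(f)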
Binary file not shown. (×6)
main.py
@@ -4,6 +4,7 @@ import time
 import init
 import corporization
 import preprocessing
+import topicModeling
 from miscellaneous import *

@@ -19,5 +20,10 @@ printlog("")
 preprocessing.main()
 printlog("")

+topicModeling.main()
+printlog("")

 end = time.time()
 printlog("Total Time Elapsed: {0} min".format((end - start) / 60))
@@ -131,8 +131,8 @@ def printRandomDoc(textacyCorpus):
 else:
     printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
     randIndex = int((len(textacyCorpus) - 1) * random.random())
-    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
-                                                               textacyCorpus[randIndex].metadata))
+    printlog("Index: {0} \n Text: {1} \n Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
+                                                                 textacyCorpus[randIndex].metadata['categoryName']))

 print()
preprocessing.py
@@ -24,18 +24,30 @@ with open(config_ini) as f:
 config.read_file(f)

+global REGEX_SPECIALCHAR
+global REGEX_TOPLVL

-REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]'
+REGEX_SPECIALCHAR = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
 REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'

+global THESAURUS
+global WORDS
+global LEMMAS
+global NOUNS
+global VORNAMEN
+global DE_STOP_WORDS
+global EN_STOP_WORDS

 THESAURUS = {}
-WORDS = {}
+WORDS= {}
-LEMMAS = {}
+LEMMAS= {}
-NOUNS = []
+NOUNS= {}
-VORNAMEN= []
+VORNAMEN= {}
-de_stop_words=[]
+DE_STOP_WORDS= {}
+EN_STOP_WORDS= {}

 ############# filter tokens
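REGEX_SPECIALCHAR is used further down in preparse() to break raw ticket text apart at special characters before spaCy sees it. A small illustration of splitting on the widened character class (the sample string is made up):

    import re

    REGEX_SPECIALCHAR = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'

    text = 'telefon-neuanschluss (raum 031), dose defekt?'
    print(" ".join(re.compile(REGEX_SPECIALCHAR).split(text)))
    # punctuation turns into spaces; normalize_whitespace() later collapses the extra gaps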
@@ -210,6 +222,10 @@ def stringcleaning(stringstream):
 yield string




 def filterTokens(tokens, funclist):
     # in:tokenlist, funclist
     # out: tokenlist

@@ -218,9 +234,75 @@ def filterTokens(tokens, funclist):
     return tokens


+def processContentstream2(textstream, parser, token_filterlist=None):
+
+    #pre parse
+    textstream = preparse(textstream)
+
+    pipe = parser.pipe(textstream)
+
+    for doc in pipe:
+
+        tokens = [tok for tok in doc]
+
+        # in parse
+        if token_filterlist is not None:
+            tokens = filterTokens(tokens, token_filterlist)
+
+        # post parse
+        tokens = [postparse(tok) for tok in tokens] #todo informationsverlust!
+
+        yield " ".join(tokens)
+
+
+def preparse(stringstream):
+
+    for string in stringstream:
+
+        # fixUnicode
+        string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
+
+        # seperate_words_on_regex:
+        string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))
+
+        #normalize whitespace
+        string = textacy.preprocess.normalize_whitespace(string)
+
+        # replaceRockDots
+        string = re.sub(r'[ß]', "ss", string)
+        string = re.sub(r'[ö]', "oe", string)
+        string = re.sub(r'[ü]', "ue", string)
+        string = re.sub(r'[ä]', "ae", string)
+
+        # cut_after
+        # todo addressen enfernen --> postal.parser idee zu metadaten hinzufügen
+        words = ["gruss", "grusse","gruesse","gruessen","grusses"]
+
+        for gr in words:
+            if gr in string:
+                string = string.rpartition(gr)[0]
+                break
+
+        yield string
+
+
+def postparse(toktext):
+    """
+    :param toktext: spacy.token
+    :return: string
+    """
+    toktext = toktext.lower_
+
+    # remove_words_containing_topLVL
+    toktext = toktext if not re.search(REGEX_TOPLVL, toktext) else ""
+
+    # lemmatize
+    toktext = lemmatizeWord(toktext)
+
+    # synonyme normalisieren
+    toktext = getFirstSynonym(toktext)
+
+    # autocorrect
+    toktext = autocorrectWord(toktext)
+
+    return toktext

 def corpus2Text(corpus):
     for doc in corpus:
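preparse() normalises each raw ticket before spaCy parsing: lower-casing, splitting on special characters, whitespace normalisation, umlaut/ß transliteration, and cutting everything after the first greeting word. A standalone sketch of the last two steps, using plain re only and an invented sample string:

    import re

    def replace_rockdots_and_cut(string):
        # transliterate German umlauts and ß, as preparse() does
        for pattern, repl in [(r'[ß]', "ss"), (r'[ö]', "oe"), (r'[ü]', "ue"), (r'[ä]', "ae")]:
            string = re.sub(pattern, repl, string)
        # drop everything from the closing greeting onwards
        for gr in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]:
            if gr in string:
                string = string.rpartition(gr)[0]
                break
        return string

    print(replace_rockdots_and_cut("die netzwerkdose in gebäude g1 ist defekt. mit freundlichen grüßen, n. rauner"))
    # -> "die netzwerkdose in gebaeude g1 ist defekt. mit freundlichen "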
@@ -303,52 +385,16 @@ path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
 path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")

-path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file")
+path2DEstopwordlist = FILEPATH + config.get("de_stopwords", "pickle_file")
+path2ENstopwordlist = FILEPATH + config.get("en_stopwords", "pickle_file")

 corpus_de_path = FILEPATH + config.get("de_corpus", "path")
 corpus_en_path = FILEPATH + config.get("en_corpus", "path")

-custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
-                "hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb", "helfen",
-                "versuchen", "unbestimmt", "woche", "tadelos", "klappen", "mittlerweile", "bekommen",
-                "erreichbar", "gruss", "auffahren", "vorgang", "hinweis", "institut", "universitaet",
-                "name", "gruss", "id", "erfolg", "mail","folge",
-                "nummer", "team", "fakultaet", "email", "absender", "tu", "versenden", "vorname", "message",
-                "service", "strasse", "prozess", "portal", "raum", "personal", "moeglichkeit", "fremd", "wende",
-                "rueckfrage", "stehen", "verfuegung",
-                "funktionieren", "kollege", "pruefen", "hoffen"
-                ]
-
-filter_tokens = [
-    # removeENT(["PERSON"]),
-    keepNouns(),
-    remove_words_containing_Numbers(),
-    removePOS(["PUNCT", "SPACE", "NUM"]),
-    #removeWords(de_stop_words + custom_words),
-    removeWords(de_stop_words),
-    remove_long_words(),
-    remove_short_words(),
-    remove_first_names()
-]
-#todo filtertokens haut alle raus
-filter_tokens = None
-
-clean_in_meta = {
-    "Solution": [removePOS(["SPACE"])],
-    "Subject": [removePOS(["SPACE", "PUNCT"])],
-    "categoryName": [removePOS(["SPACE", "PUNCT"])]
-}

 def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):

@@ -365,7 +411,7 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print
 ## process and add files to textacy-corpi,
 corpus.add_texts(
-    processContentstream(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser),
+    processContentstream2(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser),
     processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser)
 )

@@ -392,14 +438,39 @@ def main():
 THESAURUS = load_obj(path2thesaurus_dict)
 WORDS = load_obj(path2wordsdict)
 LEMMAS = load_obj(path2lemmadict)
-DE_STOP_WORDS = load_obj(path2stopwordlist)
+DE_STOP_WORDS = load_obj(path2DEstopwordlist)
+EN_STOP_WORDS = load_obj(path2ENstopwordlist)
 NOUNS = load_obj(path2nouns_list)
 VORNAMEN = load_obj(path2firstnameslist)

+filter_tokens = [
+    # removeENT(["PERSON"]),
+    keepNouns(NOUNS),
+    remove_words_containing_Numbers(),
+    removePOS(["PUNCT", "SPACE", "NUM"]),
+    # removeWords(de_stop_words + custom_words),
+    removeWords(DE_STOP_WORDS),
+    remove_long_words(),
+    remove_short_words(),
+    remove_first_names()
+]
+
+clean_in_meta = {
+    "Solution": [removePOS(["SPACE"])],
+    "Subject": [removePOS(["SPACE", "PUNCT"])],
+    "categoryName": [removePOS(["SPACE", "PUNCT"])]
+}

 preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" )

-preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" )
+#preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" )


 end = time.time()
 printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
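filter_tokens is a list of token predicates built by factory functions (keepNouns, removePOS, removeWords, ...) defined elsewhere in preprocessing.py; filterTokens() applies them in sequence, and its body sits mostly outside this hunk. A minimal sketch of the mechanism, with a hypothetical predicate rather than the repository's real ones:

    def remove_short_words(min_len=2):
        # factory: returns a predicate deciding which tokens survive
        return lambda tok: len(tok.text) > min_len

    def filterTokens(tokens, funclist):
        # apply every predicate in turn; only tokens accepted by all of them remain
        for f in funclist:
            tokens = [tok for tok in tokens if f(tok)]
        return tokens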
testra.py
@@ -8,6 +8,8 @@ import json
 #import textacy
 from functools import reduce

+import textacy

 start = time.time()

 import enchant

@@ -54,8 +56,12 @@ corpi.add_texts(
 print(corpi)
 """

+jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/"
+
+LLDA_filepath = "{0}labeldict.txt".format(jgibbsLLDA_root)
+laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_hilfe': 18, 'sap': 7, 'pos': 23, 'verwaltung': 4, 'lan': 1}
+with open(LLDA_filepath, 'w') as file:
+    file.write(json.dumps(laveldict))
 """
 def load_corpus(corpus_path, corpus_name, lang="de"):
     from pathlib import Path

@@ -85,20 +91,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
 textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
 return corpus
 """
-import os
-
-a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt"
-b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt"
-d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt"
-
-c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt"
-
-scriptpath = os.path.dirname(os.path.realpath(__file__))
 """
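testra.py dumps a label dictionary to labeldict.txt with json.dumps; the LLDA step in topicModeling.py writes the same kind of file. Reading it back is the plain inverse (sketch):

    import json

    with open("labeldict.txt") as f:
        labeldict = json.load(f)
    # e.g. {'fiona': 10, 'verwaltung': 4, 'lan': 1, ...}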
topicModeling.py
@@ -1,82 +1,39 @@
# -*- coding: utf-8 -*-

from datetime import datetime
print(datetime.now())
import time
import enchant
start = time.time()
from datetime import datetime
import time
import logging
from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import json
import pickle
import configparser as ConfigParser
from miscellaneous import *
import time
from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import logging
import csv
import functools
import os.path
import re
import subprocess
import time
from textacy import Vectorizer
import xml.etree.ElementTree as ET
import sys
from miscellaneous import *
import spacy
import textacy
from scipy import *
from textacy import Vectorizer
import warnings
import os
import configparser as ConfigParser
import sys
import hunspell
from postal.parser import parse_address
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"
# load config
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)

def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
def printvecotorization(de_corpus,ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
    printlog(str("ngrams: {0}".format(ngrams)))
    printlog(str("min_df: {0}".format(min_df)))
    printlog(str("max_df: {0}".format(max_df)))
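printvecotorization() now receives the corpus as an explicit first argument instead of picking up a module-level de_corpus. A call consistent with the commented-out parameter sweep further down would look roughly like this:

    de_corpus, parser = load_corpus(corpus_name="de_pre_ticket", corpus_path=corpus_de_path)

    printvecotorization(de_corpus, ngrams=(1, 2), min_df=1, max_df=0.8, weighting='tf')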
@@ -94,47 +51,7 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en
 printlog("doc_term_matrix: {0}".format(doc_term_matrix))
 printlog("id2term: {0}".format(id2term))

-corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
-corpus_name = "de_corpus"
-
-# load corpi
-de_corpus = load_corpus(corpus_name=corpus_name,corpus_path=corpus_path)
-
-# todo gescheites tf(-idf) maß finden
-ngrams = 1
-min_df = 1
-max_df = 1.0
-weighting = 'tf'
-# weighting ='tfidf'
-named_entities = False
-
-"""
-printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
-printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
-printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting)
-
-printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting)
-printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
-printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
-"""
-
-# build citionary of ticketcategories
-labelist = []
-
-for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
-    labelist.append(texdoc.metadata["categoryName"])
-
-LABELDICT = {k: v for v, k in enumerate(labelist)}
-
-printlog(str("LABELDICT: {0}".format(LABELDICT)))
-
-def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(LABELDICT), named_entities=False,
-                         corpus=de_corpus):
+def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda',named_entities=False):
     printlog(
         "############################################ Topic Modeling {0} #############################################".format(
             topicModel))
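The LABELDICT comprehension inverts enumerate(): category names become keys and their positions become integer IDs, which is what label2ID() in the new jgibbsLLDA() below feeds into the labeled-LDA input file. For example:

    labelist = ['lan', 'sap', 'verwaltung']
    LABELDICT = {k: v for v, k in enumerate(labelist)}
    # -> {'lan': 0, 'sap': 1, 'verwaltung': 2}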
@@ -198,44 +115,156 @@ def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(
 printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))


-# no_below = 20
-# no_above = 0.5
+def jgibbsLLDA(de_corpus, top_topic_words):
+    ##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################
+
+    start = time.time()
+
+    def label2ID(label, labeldict):
+        return labeldict.get(label, len(labeldict))
+
+    def generate_labled_lines(textacyCorpus,labeldict):
+        for doc in textacyCorpus:
+            # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
+            yield "[" + str(label2ID(doc.metadata["categoryName"],labeldict)) + "] " + doc.text
+
+    # build citionary of ticketcategories
+    labelist = []
+
+    for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
+        labelist.append(texdoc.metadata["categoryName"])
+
+    labeldict = {k: v for v, k in enumerate(labelist)}
+
+    n_topics = len(labeldict) + 1  # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
+
+    jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
+
+    LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
+    dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
+
-# n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
+    #printlog(str("LABELDICT: {0}".format(labeldict)))
+    printlog(str("LABELDICT-length: {0}".format(len(labeldict))))
+    with open(dict_path, 'w') as file:
+        file.write(json.dumps(labeldict))
+
+    #for line in generate_labled_lines(de_corpus,labeldict):
+    #    print(line)
+
+    # create file
+    textacy.fileio.write_file_lines(generate_labled_lines(de_corpus,labeldict), filepath=LLDA_filepath)
+
+    # wait for file to exist
+    while not os.path.exists(LLDA_filepath):
+        time.sleep(1)
+    """
+    printlog("")
+    printlog("start LLDA:")
+    # run JGibsslda file
+    FNULL = open(os.devnull, 'w')  # supress output
+    subprocess.call(["java",
+                     "-cp",
+                     "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
+                         jgibbsLLDA_root),
+                     "jgibblda.LDA",
+                     "-est",
+                     "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
+                     "-dfile", "tickets.gz",
+                     "-twords", str(top_topic_words),
+                     "-ntopics", str(n_topics)], stdout=FNULL)
+
+    # ANMERKUNG: Dateien sind versteckt. zu finden in models/
+
+    # twords
+    subprocess.call(["gzip",
+                     "-dc",
+                     "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
+    #####################################################################################################################
+    printlog("")
+    """
+    end = time.time()
+    printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))
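generate_labled_lines() emits one line per ticket in the input format JGibbsLabledLDA expects: the ticket's numeric category ID in square brackets, followed by the preprocessed text. With the labeldict example above, a ticket filed under 'sap' would come out roughly as (the token text here is invented):

    [1] sap anmeldung transaktion fehler

n_topics is len(labeldict) + 1 because label2ID() maps any unknown category to len(labeldict), i.e. to the one extra default topic.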
-"""
-topicModeling(ngrams = 1,
+def main():
+
+    printlog("Topic Modeling: {0}".format(datetime.now()))
+
+    corpus_de_path = FILEPATH + config.get("de_corpus", "path")
+    corpus_en_path = FILEPATH + config.get("en_corpus", "path")
+
+    preCorpus_name = "de" + "_pre_ticket"
+
+    #load raw corpus and create new one
+    de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
+    printlog("Corpus loaded: {0}".format(de_corpus.lang))
+
+    #idee http://bigartm.org/
+    #idee http://wiki.languagetool.org/tips-and-tricks
+
+    # todo gescheites tf(-idf) maß finden
+    ngrams = 1
+    min_df = 1
+    max_df = 1.0
+    weighting = 'tf'
+    # weighting ='tfidf'
+    named_entities = False
+
+    """
+    printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
+    printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
+    printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting)
+
+    printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting)
+    printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
+    printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
+    """
+
+    jgibbsLLDA(de_corpus,15)
+
+    # no_below = 20
+    # no_above = 0.5
+
+    # n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
+
+    """
+    topicModeling(ngrams = 1,
               min_df = 1,
               max_df = 1.0,
               topicModel = 'lda',
               n_topics = len(LABELDICT),
               corpi=de_corpus)

 topicModeling(ngrams = 1,
               min_df = 0.1,
               max_df = 0.6,
               topicModel = 'lda',
               n_topics = len(LABELDICT),
               corpi=de_corpus)

 topicModeling(ngrams = (1,2),
               min_df = 1,
               max_df = 1.0,
               topicModel = 'lda',
               n_topics = len(LABELDICT),
               corpi=de_corpus)

 topicModeling(ngrams = (1,2),
               min_df = 0.1,
               max_df = 0.6,
               topicModel = 'lda',
               n_topics = len(LABELDICT),
               corpi=de_corpus)

 topicModeling(ngrams = (1,2),
               min_df = 0.2,
               max_df = 0.8,
               topicModel = 'lda',
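In the sweep above, min_df and max_df bound the document frequency of the terms that enter the term-document matrix; by the usual convention (scikit-learn/textacy style, an assumption here since the commit does not spell it out) an integer is an absolute document count and a float is a fraction of the corpus. A quick reading of one of the settings:

    # assuming a corpus of 2000 preprocessed tickets (number invented)
    n_docs = 2000
    min_df = 0.1   # keep only terms appearing in at least 10% of tickets
    max_df = 0.6   # drop terms appearing in more than 60% of tickets
    print(int(min_df * n_docs), int(max_df * n_docs))   # -> 200 1200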
@@ -248,82 +277,12 @@ topicModeling(ngrams = (1,2),
 """

-##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################
-
-top_topic_words = 15
-
-print("\n\n")
-start = time.time()
-
-n_topics = len(LABELDICT)  # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
-
-# build citionary of ticketcategories
-labelist = []
-
-for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
-    labelist.append(texdoc.metadata["categoryName"])
-
-LABELDICT = {k: v for v, k in enumerate(labelist)}
-print(LABELDICT)
-
-def label2ID(label, labeldict=LABELDICT):
-    return labeldict.get(label, len(labeldict))
-
-def generate_labled_lines(textacyCorpus):
-    for doc in textacyCorpus:
-        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
-        yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text
-
-jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
-LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
-
-# create file
-textacy.fileio.write_file_lines(generate_labled_lines(de_corpus), filepath=LLDA_filepath)
-
-# todfo ticket drucken
-# wait for file to exist
-while not os.path.exists(LLDA_filepath):
-    time.sleep(1)
-
-print("\n\n")
-printlog("start LLDA:")
-# run JGibsslda file
-FNULL = open(os.devnull, 'w')  # supress output
-subprocess.call(["java",
-                 "-cp",
-                 "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root),
-                 "jgibblda.LDA",
-                 "-est",
-                 "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
-                 "-dfile", "tickets.gz",
-                 "-twords", str(top_topic_words),
-                 "-ntopics", str(n_topics)], stdout=FNULL)
-
-# ANMERKUNG: Dateien sind versteckt. zu finden in models/
-
-# twords
-subprocess.call(["gzip",
-                 "-dc",
-                 "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
-#####################################################################################################################
-print()
-print()
-
-end = time.time()
-printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))
+if __name__ == "__main__":
+    main()