From 16d3e1cb701a88fdd563dfba8517f03cc550f17a Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Wed, 18 Oct 2017 17:37:20 +0200 Subject: [PATCH] preprocessing abgeschlossen --- M42-Export/Tickets_small.csv | 88 ----- config.ini | 83 ++--- corporization.py | 10 +- init.py | 9 +- java_LabledLDA/models/tickets/.others.gz | Bin 80 -> 89 bytes java_LabledLDA/models/tickets/.tassign.gz | Bin 200 -> 4151 bytes java_LabledLDA/models/tickets/.theta.gz | Bin 228 -> 8068 bytes java_LabledLDA/models/tickets/.twords.gz | Bin 739 -> 19902 bytes java_LabledLDA/models/tickets/.wordmap.gz | Bin 375 -> 4376 bytes java_LabledLDA/models/tickets/tickets.gz | Bin 480 -> 6782 bytes main.py | 6 + miscellaneous.py | 4 +- preprocessing.py | 169 ++++++--- testra.py | 22 +- topicModeling.py | 397 ++++++++++------------ 15 files changed, 368 insertions(+), 420 deletions(-) diff --git a/M42-Export/Tickets_small.csv b/M42-Export/Tickets_small.csv index 7936a66..520a4a8 100644 --- a/M42-Export/Tickets_small.csv +++ b/M42-Export/Tickets_small.csv @@ -1,92 +1,4 @@ "TicketNumber";"Subject";"CreatedDate";"categoryName";"Impact";"Urgency";"BenutzerID";"VerantwortlicherID";"EigentuemerID";"Description";"Solution" -"INC20357";"schulungstest";"21.07.2015 08:19:34";"ZHB";"2 - Mittel (Abt./Bereich)";"B - Normal";"aa8315f5-52c3-e411-80c7-0050569c58f5";"";"aa8315f5-52c3-e411-80c7-0050569c58f5";"kevin arbeite gefälligst :)";"" -"INC40481";"Telephone Contract";"13.08.2015 14:18:57";"Neuanschluss";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Neuanschluss -Antragsteller: -Melanie Hinrichs -melanie.hinrichs@tu-dortmund.de -  -  -  -Terminvorschlag unbestimmt -"TicketNumber";"Subject";"CreatedDate";"categoryName";"Impact";"Urgency";"BenutzerID";"VerantwortlicherID";"EigentuemerID";"Description";"Solution" -"INC20357";"schulungstest";"21.07.2015 08:19:34";"ZHB";"2 - Mittel (Abt./Bereich)";"B - Normal";"aa8315f5-52c3-e411-80c7-0050569c58f5";"";"aa8315f5-52c3-e411-80c7-0050569c58f5";"kevin arbeite gefälligst :)";"" -"INC40481";"Telephone Contract";"13.08.2015 14:18:57";"Neuanschluss";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Neuanschluss -Antragsteller: -Melanie Hinrichs -melanie.hinrichs@tu-dortmund.de -  -  -  -Terminvorschlag unbestimmt -Einrichtung Dezernat 3 -Abteilung Abteilung 2 -PSP Element L-11-10000-100-302300 -UniAccount myvowest(Westerdorf, Yvonne) -Gebäude Pavillon 8 -Raum ID 031 (63292) -Telefondose keine vorhanden -Telefonnr. - -Eintrag Telefonbuch -E-Mail melanie.hinrichs@tu-dortmund.de -Voicemail Nicht erwünscht -Ansprechpartner Melanie Hinrichs -Tel. Ansprechpartner 5848 -Verantwortlicher Nutzer - -Type Amt -Bemerkung: -Es wird ein Telefon benötigt,ein Telefon mit 6 Speicherpl.f.die Gruppenfunktion ist ausreichend. Die Möbel werden am 10.06.2015 aufgestellt.Weder Netzwerkdose noch Telefondose vorhanden. Dez.6 hat Vorbereitungen getroffen.";"Frau Hinrichs überdenkt die Situation und macht dann neue Anträge. -Dieses Ticket wird geschlossen" -"INC40483";"Telephone Contract";"13.08.2015 14:22:06";"Neuanschluss";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Neuanschluss -Antragsteller: -Anja Kulmsee -anja.kulmsee@tu-dortmund.de -  -  -  -Terminvorschlag 03.08.2015 -Einrichtung Fk06 Dekanat -Abteilung Bereich Studium und Lehre -PSP Element L-11-10000-100-060011 -UniAccount manjkulm(Kulmsee, Anja) -Gebäude CT Geschossbau 2 -Raum ID G2-3.22 (64882) -Telefondose -Telefonnr. - -Eintrag Telefonbuch -E-Mail anja.kulmsee@tu-dortmund.de -Voicemail Nicht erwünscht -Ansprechpartner Anja Kulmsee -Tel. Ansprechpartner 6179, 7370, 7179 -Verantwortlicher Nutzer - -Type Amt -Bemerkung: -Der Anschluß ist für ein Faxgerät. Wenn möglich hätte ich gern die Rufnummer 3033.";"Faxnummer 3166 wurde unter die Telefonnummer 7179 im elektronischen Telefonbuch eingetragen" -"INC40484";"Defekte Netzwerkdose / Frage zu VPN";"13.08.2015 14:25:50";"LAN";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Sehr geehrtes ITMC Service Team, - -seit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen an das Intranet der BMP mit der Dosennummer G1 303/04/12.05 (G1 4 26-1) in Raum G1-426 nicht mehr zu funktionieren. -Ich würde Sie daher bitten diese Mail an den zuständigen Kollegen weiterzuleiten, um die Leitung vielleicht einmal zu Prüfen. - -Des Weiteren hätte ich noch eine Frage bezüglich der Möglichkeit zur Nutzung einer VPN Verbindung aus unserem Intranet heraus zu einem fremden Netzwerk. Dies ist zwar über das WLAN-Netz möglich, jedoch nicht aus unserem Netzwerk heraus. Vielleicht können Sie mir mitteilen an welchen Kollegen ich mich bezüglich dieses Problem wenden kann. - -Bei Rückfragen stehe ich gerne zur Verfügung! - -Beste Grüße, - -Nicolas Rauner - -LS Biomaterialien und Polymerwissenschaften -Fakultät Bio- und Chemieingenieurwesen -TU Dortmund -D-44227 Dortmund - -Tel: + 49-(0)231 / 755 - 3015 -Fax: + 49-(0)231 / 755 - 2480 - -www.ls-bmp.de ";"Hallo Herr Rauner, -die Netzwerkdose weist z. Z. keine Verbindungsprobleme auf. Falls doch welche bestehen, melden Sie sich bitte bei uns. - -Mit freunldichen Grüßen -Aicha Oikrim" "INC40487";"(SSO) Login via Browser mit Zertifikat";"13.08.2015 14:54:57";"Betrieb";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Lieber Support, ich habe gerade versucht mich mit meiner Unicard im Firefox-Browser für das Service-Portal zu authentifizieren. Das hat vor einigen Wochen noch tadelos diff --git a/config.ini b/config.ini index 49394be..5e99a06 100644 --- a/config.ini +++ b/config.ini @@ -1,86 +1,91 @@ [thesaurus] -input = deWordNet.xml -pickle_file = thesaurus_dict.pkl +input=deWordNet.xml +pickle_file=thesaurus_dict.pkl [spellchecking] -input = deu_news_2015_1M-sentences.txt -pickle_file = words_dict.pkl +input=deu_news_2015_1M-sentences.txt +pickle_file=words_dict.pkl [lemmatization] -input = lemmas.txt -pickle_file = lemma_dict.pkl +input=lemmas.txt +pickle_file=lemma_dict.pkl [nouns] -input1 = nomen.txt -input2 = nomen2.txt -pickle_file = nouns_list.pkl +input1=nomen.txt +input2=nomen2.txt +pickle_file=nouns_list.pkl [firstnames] -input = firstnames.txt -pickle_file = firstnames_list.pkl +input=firstnames.txt +pickle_file=firstnames_list.pkl [de_stopwords] -input1 = de_stopwords_1.txt -input2 = de_stopwords_2.txt -input3 = de_stopwords_3.txt -pickle_file = stopwords_list.pkl +input1=de_stopwords_1.txt +input2=de_stopwords_2.txt +input3=de_stopwords_3.txt +pickle_file=de_stopwords_list.pkl + +[en_stopwords] + +pickle_file=en_stopwords_list.pkl [logging] -level = INFO -filename = topicModelTickets.log +level=INFO +filename=topicModelTickets.log [de_corpus] -#input = M42-Export/Tickets_med.csv -#input = M42-Export/Tickets_small.csv -#input = M42-Export/Tickets_mini.csv -input = M42-Export/de_tickets.csv +#input=M42-Export/Tickets_med.csv +#input=M42-Export/Tickets_small.csv +#input=M42-Export/Tickets_mini.csv +input=M42-Export/de_tickets.csv -path = corpi/ +path=corpi/ [en_corpus] -input = M42-Export/en_tickets.csv +input=M42-Export/en_tickets.csv -path = corpi/ +path=corpi/ [tickets] -content_collumn_name = Description -metaliste = TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID,VerantwortlicherID,EigentuemerID,Solution +content_collumn_name=Description +metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID,VerantwortlicherID,EigentuemerID,Solution [preprocessing] -ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC +ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC -custom_words = grüßen,fragen,damen,probleme,herren,dank - -#lemmatize = True +custom_words=geehrt,dame,herr,hilfe,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,institut,universitaet,name,gruss,id,erfolg,mail,folge,nummer,team,fakultaet,email,absender,tu,versenden,vorname,message,service,strasse,prozess,portal,raum,personal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,kollege,pruefen,hoffen -[topic modeling] +#lemmatize=True -ngrams = (1,2) -min_df = 0 -max_df = 1.0 -no_below = 20 -no_above = 0.5 +[topicmodeling] -topicModel = lda +ngrams=(1,2) -top_topic_words = 5 +min_df=0 +max_df=1.0 +no_below=20 +no_above=0.5 -top_document_labels_per_topic = 2 +topicModel=lda + +top_topic_words=5 + +top_document_labels_per_topic=2 diff --git a/corporization.py b/corporization.py index 50cba8e..64e4c47 100644 --- a/corporization.py +++ b/corporization.py @@ -7,6 +7,7 @@ import time from datetime import datetime import re import textacy +from textacy.preprocess import normalize_whitespace from scipy import * import os @@ -93,10 +94,8 @@ metaliste = [ ] """ - content_collumn_name = config.get("tickets","content_collumn_name") -metaliste = config.get("tickets","metaliste").split(",") - +metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(","))) path2de_csv = FILEPATH + config.get("de_corpus","input") corpus_de_path = FILEPATH + config.get("de_corpus", "path") @@ -121,7 +120,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la raw_corpus = textacy.Corpus(lang) ## add files to textacy-corpi, - printlog("Add texts to {0}_textacy-corpi".format(lang)) + #printlog("Add texts to {0}_textacy-corpi".format(lang)) raw_corpus.add_texts( ticketcsv_to_textStream(path2_csv, content_collumn_name), @@ -140,6 +139,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la # save corpus raw_name = lang + "_raw_ticket" save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name) + printlog("Done") def main(): @@ -148,7 +148,7 @@ def main(): ticketcsv2Corpus(path2de_csv,corpus_de_path,content_collumn_name,metaliste,lang="de") - ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en") + #ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en") end = time.time() diff --git a/init.py b/init.py index 596190f..71c28b2 100644 --- a/init.py +++ b/init.py @@ -264,7 +264,9 @@ path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file") stop1 = FILEPATH + config.get("de_stopwords","input1") stop2 = FILEPATH + config.get("de_stopwords","input2") stop3 = FILEPATH + config.get("de_stopwords","input3") -path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file") +path2stopwordlist_de = FILEPATH + config.get("de_stopwords","pickle_file") + +path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file") @@ -293,8 +295,9 @@ def main(): printlog("Build and save stoppwortliste") - de_stop_words = create_stopword_lists(stop1, stop2, stop3) - save_obj(de_stop_words, path2stopwordlist) + de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3) + save_obj(de_stop_words, path2stopwordlist_de) + save_obj(en_stop_words, path2stopwordlist_en) diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz index 6502fb1287e175c9641c423f72c79f8e028335c1..48c3c1b0bebe2f656322266a3437a2e3e02d9404 100644 GIT binary patch literal 89 zcmb2|=3syT?=zYwJhqr#G2UcgWMJAd*)7I%rE2Fkwko!^m`c^3tTRs@-F9qROr_>e phqi@FHN)d7>jJ$tu5IlU4EIa(%n}T^a^OG%H$$}9;c5Or69J;VA>RN1 literal 80 zcmb2|=3syT?=zYwJhp7TVC=@I7UQ{6wR0O=60C*!FQUCw| diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz index c7df8fae1c85c0544eac7df0f7c5dd6541ccb53e..f815b2d1901c638c278ac88335ebb97da5c23ea6 100644 GIT binary patch literal 4151 zcmV-75XkQziwFP!000000L@!jk|nE-{Et=S5`sYpYX1vsxx0u%WMy6Jepa{beWd&V zLJ$lZfV_X*UnTx({I&S&@n`ur;?Lu6#@_>DZ>`5?&GOWMEjCG&nh2Yj>a&ml9DS4`gcn)M+K1bMOY z;+KK&mxD}@6=dBY1q9Uwsx1=KU8uHDZK2viwS{WszDWydC2gcnH_C0KPxIyW-9owN zhmGVvsuNBweI;&;gd1sYv(x;jYMy5kaoXDyech=^ z%ABs8^&{t1j9+5>607WeY~U4%PHLnhXcRcPCMm~Zu=Zf>!Ev}r2S;OcQVWV-{83_^ zg8CMdI&3G0~!8##!51>g%%mJYaBxUxq zok?M$WoDk*nQCB4@=D2qCCN*Y*QPvgaxRv%>u3(8?96> z?SLACjua{))dU^zQQBjrs;;-_*vh&t{SvwQ6oKAr(h8(GgT5hH~>@fBx>wuH)YD(q=!VpvquY$Ih%;VVoNW12OTjWcsj zWx7DAyPYhSEwFlLJGHAF7-{3~V5*+{Vj1koEFYSE+=7xhQ+77|?*7S`d9@FLdfzis z-)ZWim9taEE?!SEK|*b)5KC7>5=BNjNRb02P*Mqm{R#zCXg8Qx04#*3q=r@^6u?t0 z^bTqb|ZT0m)q z;ez4>bih^7ofXL;aKRKT0+c172rA&39JU2K+c^61SrN^KO|D3`u$NL;_F3|v5q#t21F z1x?TeCKv)2Oo5~{O1WAqhrQEax28F1nd$-)41o)#V4W4ZCb(1b)jGFR%bDw4pqcAK zTT`@@DE4LevWz8QX_sJd9Dv}HIgIm{pkBoONkK=3@LtpU&NJ*i{VjYK=e~+^u)tE z9v>j99uaw!O(ykScxL{GfiX8_V=o#bw9cQ5q>R7HXQNC}e$3?ePW!e!t~Q4}GL7Zb zEKd&o6_qMdix8TQPy`hK-zFg1U|xc%g`Oh}fgiPda!di`_nY$JJXaaZiY|nZN>4VR zM=uE#fUo?(Q`T6)30tpUY24^dhUpTde(q9*O=_AWfy^QBqux_Nw)CBMs{CH=Kc;BG zky(Cc$v*UwEJ~5m@jzZbQWp$bZ440udMqBm&CCm9&?9pQTrdR-0G~GC!d^oz6l3RJ z|3HVDJW}-qY=Tqp9sV7m9s2oCg*AsN$F!PYyfR%dkIYFLvU3joD8sVRL-qcBXf^UcGa`de;zjO>OT2I|s9CzB^Nh7MYqF zzMc-Q7^LD-O1t=Z5iBppCSX8tM4O1s%*IQVxLU%+wSt zLg;>$DKox}nF*C;Py~Zx#!RJVSG}Q>9ToayCXDDO6AtTdscUPj>7SUOUV}b@{Duk+O3}E8kt%KX+BiJZgpQf+wLal0(-$Y>d7|PeQXVr(P8+5 z6nkPa8=E8Z%Gst&$s5wzSZ>d=&1_j#CL7yrmNQi~7Tiq#%$X@G{K6qCn<|Ihxqr)V zx#!!5(#>?anJzc;DUwBP<76N5k~6;19JL+ko=uH;)>0$d(wV0Sw|Q#y%#?VkvdhY8 zRrYwED(%|#yUu4NhoZ4E1q%8monoshtLgryi>83VYL$jsSj$!$DeoWZ-Vulz^$QUX zO0eQP>1CC2b|6eHh?sAd!`_$OHob#q78PsM;x?x^StOazTm` zi>YtMPF>hR2GG!}MY=NN$?lrPqH`S0L^<-7^It4VH z0-8M2i#}=B*AH%iVj9z9N6N-@SOW$A2-Xcp;*31&Fge<=ydP_Y3kdK)cObZ zA@HMCvTM97Emi&1ZVpYmsnY@mu3m1AIsn%-Hzx*VxR3!90juNk28je#yJmWnT;LBUXVZau z)>h#+2hGHY9S&$pH~_O5aJ?Mj(b7vhPc2HiKO-jo6WUBBEI(HP(P-_6i1vsmFnR?` zyyzRPlNu>5_$u%9M&k`&o20tZBWGu=j*C57Og<^UmMb#@8kqqIG z=^gpa8(b?yrdsQzmrqQ)b?3+J!rur-0(3oYT1lbCoBq8QMF0Wy#adEtJ)B)AW6TbTDSMr*=Yn zWuCQ%Y}mKVJInIyM$VFAx{+S|7N+kS>&TpEOqJR~wJz!ORK^K%S?$h?k_@LxYK#1= zGxXb6B)WJTehbQxny*_v=dB&xr+t@aUkP-(aFx}L`>mm}u~Xw<8M+XAQu9nza`u`x zl8*Neu&0aPw}Nj4MTEb2nzMMrzZi5Z1nM}Ub!^~Lta2+o+*qRJMN3RPAz1Io(_PCLn;;; zGYbGxnW}Q&RQL(j8f(LS7cLmJ1Qd7WS)(bQBD#Z(5m{6f1RUhZlOA~Z~ z36jqRQ$RP=^`s@^1J^qiiPB>huZkRQ$xxQ0S4zJe13Dt<0FLs&83)5uuMA-)V_o_Q zlf%j)hPB=p)^62wdE;^44SVdZk~UJF6&Gy**YUfjPEW0s*Y)Y}?6bc8IKl;*#de0s z`!HRG^5?1b{<;@Z^S%5oWrOGG>hC89uCufKY9kk7KaZt;?t^Y8T#bLb$T(kin2rH< zPu352Bi_@CDEW2t>Z@G+p)kPl%62~sdoHeg-$TeY=R{<1s(Xlhq~i9&N4TUU%a7mb zmetP|G&labfOr~zg$nY4809_3to_cde7YL-&bKi-$#aFHRjSu@H}1NB@Vt1Cc6J4l z8I)*-w^GjaDX(ek?K~sO4yO6dAZAxkER*r)el=oeXzYqcOZFl2^maiv?wR=(;<|wM zSD2lLn68U>PoGLhe(uMmf0_BHMXl~l9KA(zgfbBJ@mwIkcUlGV8X@rCkV zew(uLukIh0J0ApgKKMhhQuuy6==wY%EX;5|GsF`;z82IA!kcWkhi9hV6Xx?l&$ooP zpyX@9|DR8NKJlNQfd3nvBFH*F~b9?8d^}d_R zyW~8X?NUzHyzj$OO*tV`-p7=#n%QtVDpQwDQzj%+H6Ic!<*D~~`uczKIQ83&`nE5x z;G*x3QE#MpXUT)geqQD0)nktS%&V>2kGa_Dv7TmkmQ#afS>5-TX;|{}XC4IW5`XGi zY$`IGib|)Nn&$({S=4>Wp0bbWZd65B*Q~n0*@OoLlUJKePhIWsS0^=|9<&se+Ap6) zBlf7@Lg`AL3}q^pax2yV-R3Wl49idq$DSY?wxNIbw6nMs%uFu{JsP29vy+{8_c#7K;s z&piE$Uh@*1!!3l;DU_kiQ|qsHKKiunlhJJZmDWK^8Md+x2LZZ}O8EfJ)?Y010RRBA CJzxa@ diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz index d75dd2e0daba512a7c5a1b720d8ef1f7b98d5d20..8d1f466a6aaaa9387100e9840af0b1ad0fad2565 100644 GIT binary patch literal 8068 zcmb7Jd0bNI{!SZPbh7zPsimef(``y|87nnJvGPuK$2OR3quAh&b_nbe@@?DT6bmY*_UFEhQfo|$gdWiDVt1KQns%pNUv9L%**t`udmi5!){QiIa`)*9{%=# z94#?M1{VZaChrV+G0r=A)@7`}t~g{6(Z%OZdO;(TypouPVnZa<2+BoyYrXCz)Qf{dgSlv{#Q#)X#iP-#B-0p)qw>;v3J5SRK#ov5rzKdnO3yn{n-Rj>nQq27t9j6Nh?>QO+6^CVxCv7d530i{#+9G; zJZN68dsM4(`aM+-idSvF{I&84inkV%(X0xrY#cXS+qiW9{=IxUZ}rK#%D$-QC+H(- zy;plzZM^+eKSog7|DAcR?g6cb8Z7ltMG&kZGhtBPemVrR2g(?YWgGo|Y{hq^wo?3F zwo9R*;~vj+BBl$2#35M>@pXRRE0s6@Hba93Ub9~Kr?8ZLCM?vsuuR66Y#75C*%Mat z4PjuUskGR@!NFUC=DH-i1$Fsy@A3&l2$goCc_jYz$w4oSJjlU40q&jzjguPEqnptQSh*0`3OTh&bG3`$=zUq#2iGQ zBD*nHI)ck5B$!BAbA>Vw_1}D(qS2xrsP;aq(>HuJ6CDrcN(YRYqHK1Otfoo_j2n0I zru!18{19?qu^_BpD^;>Jpl;89EcYb=j~LW90t$y5(oZ|#o&lJ=&g!I^F6xCKDCsypvba!YL7Q4@%Dk& z>P5M(KNo!l1PxR`&>_v=Nj9J9PtYQxOd>5BxPd033d~xv$zpP(h}H#*JtfF8&Td({ z+aK;|U-jtxT1QHh%~Q%w$>5z~v@V_Ys*3>0elA^gdnpZNZ)+bVoi|kK__SMaa0orj zk*@Nlo`HtFs=q!8jiBWTo6ph0+3q1^LUOpRV~(9=N%){4$&dgYf^x&{!)xcC&zOdB zB&Y0h5phDciZ3Y@o`Ql|(ePFSQ&O)Jrs2J%{V%#*>daWIL9;YpUd=FnZkOk=_I>ie z*->fAfb?woDg1g~Bu#8Wipu%nMceTTqI+yaqc8*j=JKZGk~pUwVY-{OZfkU?4|s{O zQu@G1IugwvV{)Ub3{~Gk&ApYVgq}|@$D%8gC2eIS<&}z(*Rbh-`H>n7BpDUVDlZIX z5|cv==xf-`eIUbVR^B|KQG6N#F3u2@?4`sW)!M6x)s6hmWWC@qJa*j1Q45nwf7}t7 zN(}L_W^h^1fEgIQ(VN|CPF+ofh5dIiO?`%H>c*Y~6^JGzUdBsP(wpNTJs^lys!u<# zQSm?}pQ!+MpdgEk647eJwDJ|1ZcoDP-4Tqp^c|gsrHY%J&&+@Xutw!4TuRDL^#JZmZ_6A_j4R!~Z+REz_~6t*?Tf;R z*M`euMY#tMt6aSJy@v{o`LRQZVw3$lR<-N4Tjx9Ed+l)EAugQP?R1xeX}4PKn4N(s zn228gtnb}m!PkXmg*yt57hWm^F-DX1(;Uj&2QMh_<(!xG6w2HwYx98e?EA5G(OF0% z!DxSpjU)VleQdgLS<`eBkk05;8wfuBN(Ia-$OrB3QWcyRJjb7?S!e?Bs7(p&WC70 zmO%_4t5dI@`LSh+XA~8npdFB5?!4Uf9Q*|w))%vP7I+k`voxCMBf93lWd_|kzcSd@WnX(0Rzw_P!1+CQzn}ndvY< z@$G-#X6u}8u5PMrs%xrmdffDsM0=g!w-DKQ@|@qAZKup>ovA)D^gr|Y$Z<*B`$5Tqr-LW=vEgZcPx_RUp~kGY4%;cV z3S07gS{cGHz#zU1&8?PBGH-^Jzgd>0(W-t-t)kxZG;RR@&YC5c z`Q+qaeTo+pZH5HxZ`CYU2N$iHEH_YNDCZItN_JRF9Ab~lCNer>l2Q?Xs<_|u^s5d^T|QM(YWjy zI4^)1_#)i?3Ov%U*!lVa82!7dFGiU6Bhf|jCZ&1gX_wKOGWu0vxb;yDPuGGoZn-+( zBF;_8g^JMTvsWmDw=&9ttWGY{AA|pi8}6Gq3N4wqc!951$FHsZHq#O|c_kfR)+7of z^Rayw_$PmmK<-wU)t!$B;$MpwKgvXvtlEGtN^TI^(Ca2bYv5&h&)DiM4y+ZnC2%POMI)}Zq#m` zzd9e|HqTh$z#U`E?+oXAt9Z9F<*b`tqAH7(pN*+O>g6W>y&1kTP~|J8zrZ#CL=IHrxa9^0gK1+%Dq@Y= z*NRYcLY$P5lriL?R#{lkQZUVkGMg=YYGJuF>DUJ zdEYk88=qrjn93JQ`Y`d{w$&M4@v@Z|?sM{%7zXz{OP>)tv~h+G0bLw5QJuHpDcS(` z#KsNvCd&y3IY(oqk#Vvlz}a(pK%5yAq84gfJyHy4(`tHp;$+QEoJdalKwG&(tRtLP z?Y+b^o{?+KY0R}gs#vyGFAy*-L|ETFReyNWt+NvyP$j@xOA(f1^p3gY5up1nmXdI) z2{3wRNj^Ugaoo)fF^>#wx6fHSIc_guxs;gXB$iNOtVE^%mfv(Kp;C?#J=w`?DGh-@ z?Uy_C9E}?5%aO)x+&fWG4umVPz_m;rkxgod$KLbwLBctEkAig@MrnTl%dDw76YU(!FOO z!M3CAk3ccJyn>sYBz67Qd+5hw%M?Jq^%55r9cwQw_F=<`A@i%2)pu=d&ku44gs~Uy!tkp;jEhN>Oz`~AA=d$1xX`ObAlDUE-byM#_$9{_ zyd1+B#H)XU z{z+!>gyTXPrm&4pQltix!Dd`-mzuX14+%`si5;raid)tkwxVV6!%GR)VuqvaQ`jGAgNc(R>cl^ z@h4<;z3mkWV`3&NARfXThd&qw9t=1X;2(epIIOTAR54JY6BUwhWA_84FjV>xP0#@R zUIsLPEJ-=Z;J2)Qv3$}f07eP9{*K8vM{^b(4>h+XrDej3}vei^PhMXDF)o|tj4RBKpRaF{td z5}f0=CiYXIfi-{?+!p}^kgCSQRjpO!y!b`a4W@@c!SxV;|6qtNA~0ak^4_H-IuBzX zQ@u@cQcHNQE4`#YCf6l}uzzV5Rtf*IJjWTZ3X2eNshB%{tL2pp`!8>$?FS;6HDn6Q z%BQVAh2E%6p7H#GhvgR#fh&F++7zMSR@7i^%>L1~f~_W#sm)str3MviKHr~LWxCNl zq5navx6XLmb-tj~bmLv3IE&?*b17!zL90z#ag3ZH@p%SHJL4aX<4+Jqr9{tL)E0%y`=@rYqvTFkR z{ns_RLGqrhb$5`-AvLT`a;fuM?9i8~3`yQgUG^~PdJ)q#MA}&i`A5SfMnr4yWVA)e z#-oECe^Hs6x9WyzfrWg}=h3u(EM+((t04@GqjB#+E}GTP^A*E`VsMaG`k(!Va=`j= zg&{(Bb&LO5f#tRg4FCCt^`P~|j(;w)poYtx0}s=d?L2~i#Ci91v5u*w=aH}$zw_g5 zNghlq|8Z>Ym%kaTh2GpaU#g>+?A2SfhClX?I@!`vz1pp_yJ*3>zOqF}b=5d`7ySB}hmdub77h*~DEUbQzfFVg;ued6ij^!ambrce%-6%qQl&9Vu< z)LIx**x)9~7d$R-V%NL=4#%VO0HZc$q8g0Odd-Jt@@M^~hB?nYki_8wYHqI=mr{72{k7I znqv9cgO(iFUOA*D@I$Z_27riZRqG<|{+#E*-VltDSFd|NUzECJ1X)@(7~tGpMHuXM z7wwWZ(a0nH+$a>XHgo8*xSe;`o4c}wWO6FhHS8H6P0@?V`IUM*)L}t;5wmWHFVh$5 ze@-;K@e+58po@%uyApD20I^M0vVq$dK7&l4PzrME(bS)nY}1piiqYh7epm3s2OrvH z$aKL_mf)%FNOlf|PK*B7Aj?JoYyP=~rSK=aU;+*-P(y|{?b$1by{Pvy{Q#b!_t(>D zP#8erKneJ;nCwvQFAQ#a72e!_^RfZ4$o})jfAaM zISE^B*Db$6#|x`A^n+xjO~)${v=M93B_ChICa{SPPf^u&*T}S!D))G7VHX?B*@avOMGaCS%DJVGmjV- zo{`frgCfDLh$Otqe!|DTG+0^er=k_sFKWn1L2UN1wSP8^|B~)Y+Hqbv6!7&d6zxjE zmqU{%>fqia5!AdE(-!=Ug-8cCXS7xJQvc3E1azOH68o!uro8LE`L6A#G~aEk^X)*l z*&C4FmnKn1a;sm2%b4N-rbl-S`A@r*N8*ItM+MjBE6kEvNcNb4i*ok?_@?V5DG!?rlr^@?f%O9qYajPcNDt;h<{oi~9~ z+tU=N(F$>&_dE}!jYqwyecLKIX^c&XiWwOb`(b@3S2K;VQ?081FeFOPUuvda% zFkEjC1^L@K+wW)${dz35&iAfE!AgVS>~9um(N3uW$DF}ajjB^UBlxI0=gRP?ogw1i e8Z*O=#?Tpp;o1Kfzp+-QC>@8r&hc26tz1*Iei*IIiuZk zW@dZ3r=LR-1BCeB--X{q66HwUX4X4G<|E<6lbJ^#FQq6{Iwurhc~w)_V61xjG+UEP zX;R*(AXRbg8m@6yIt3?-C9zxbW;zn&TBz`4%p4kT7}i~oVsH0km-uq`WADXc?N9dk zAC>@TB9*;e!N24_>5KLTJi2PlP0vLv6-FPX87wRltpeRh6Yj*g;`dHOrStawDNbO4 zv*w!18h-Ps;LdEyckcK=PyhZN)YHtE;|;l<=HvFl%hz`L=z7N2LJee=`AbhOM`^6m z8?T%ifFB`A_)RkjydCx7ZnFSJ^pOdF9S=)Qi*tA67J6{}9{KRIw_!C68!Z3ui^m_y zzOxyY%DVv$pPP`3q=TrLReOItmgN&&o`ff<#Uq2w7Qguxm!|aHM^K`TivM@iqXVy2 zWfi>bc+<$c{H^o{B|Q!M&*}C{z;dIH-xkljxyo5$X7?sJGkfmdl&GjJ6YS$Jxlb88 zFM`4#8qmc^X^_X6i3YOSwM*9pjgS(X=co9I$O0mQ@crWX3zxoYljrp!xPKfNwrnAZI}%!xEK|UD+*`?uD7Zyo8~IiAJaH zf>*1GA18Mr-&AnrD7zK;Ou37#z;IrySU~;9NYu&Y*GX%VAiOpI+2=7EVHDH4?zi{&82#<|BPoR4 znScPwdyw&hNmmV}S154ZQ~@8H&Gt_~!heXCBHeXQ%!Q3!V=@@}9T|1s#^o zw;`Nw0>5Zrpp$3uhHNraptCDRZd}sH6Y6+hnnSAS_bYvL>9}@m9Njg6+Y{W1%lK~VI&3M z^b-$*Y%iGNz(fz&%g2t#!^F7qrf8sFmbO)gz_jhO&V>`9#aW#1HOo4>^CcSnsuR=V zo&MmSrujOfpV2_#uW7HOBEGLH7uw&V3?IeetD}-*GPPxLt_LLmoxU^ONZ!i+0HmIT z6n3JLfIrV(43%76tbACkEjh!@(?zQvg`ja8hO#P3q~<*XTE#5_O94yO7O_5QOxe=* z>*|N|OH3DmAMHRpCVePdg7mbdC8bW<~|7|%j;Xwr~tkQ{8>VI z=UDxEdGA^@Z_@ZJnEy|%pVF6tA0!~5+}-Uqty}yVIW7^V@z_EY9h*lCm|&uL2SMVw5RO)J>7@$i0)^+4y;j-|cBDs+cq0GRHYEs4X{% z0b^^l^jkH0`Kod2?bh|o6>B9k^_>pO$Y{YHZz1`s@1!_(ehK~b2NxaX(`O~8(bZ!W zjJAQwaW;afgqY8t_)44*_)fy+N5&9A{Bdffv1TiDdw5#yyy1gd@^F=s1k!#|gr8}J zayYxnckPD6nFv2i$@5Zk@bltFX!Oa^(6Uq_@D5acD1CWcaAf!T@@5GtNp*a5sq!&f zDBr?c%;FjLh#F$|<{8#*%4RK{Z6N>25*!EX}aKi|pJ5WQCUeHDIiXX{ zx;5RsozmpQVn%&r(S&^r{;@!9USIA>(HCn81o=X)DH?y5v7<7Ux2H5#Tm=>f5MTze za&6Osf_j^UfAnBB8>lN%RY9}U6gc;W+u1>vpyH< zs1?g74N#J%ta~Wm$~_^QhX+!H-ZSQDJsGS<^&E)t8X&#Sy0Jy~Z<>~KtuYPT z)#`x&==N-#uq;6`j(aY{s%WWL111^zOIo_C=1RE_7E9WHK2hURP$$s83L)WxZi2eT z<3Gpr^0qUO&%P3e&Ycqb0<3)!E~|$x)sL zl@tvmUA8I9VhO*c-7g|if>>{KF(v7tO(_KOSl`|BgcFIGTb!Tn1@j(xxT`AO@UIIK zs7)g}(y5H%M2y@L=nrU+K7}DQutwiXOcro5catA~mTi$*`Yl+U7?nUVJQV*Eq-*R+ z&EjvHuyml>DpEVN2_>G$p3rKfN%$?B0OUqF_6D~Xw0S_a9U<|5zTW)y{Vy}tYH5XPC<{xeNl8_Aj!N>c zZ`+onb-;5490r?0bqGB?1?xIr{I>}_Tdilsk{d!2iD0oc3FcDZX`iq9K`$0x|uI>bk zqWvu)EBOP33_TE)4Dv^_Y57w{+6tC>m?$IRN#w@~FW7KZ!wn-BJRJof3>XK@4+ zECTOu{%sd|I<5uoDejt|@9j0+l|BRWKwZ3ngSi4Go+}sAJg#Ik(lae*CzU^NMAL6b z)4mHaP(HE~p`3TMjI4(Vd1LsTR@9OL8TX5@5R%alJZzygKB&?JE56hvxo^S0HK9|N zcMxDgFJ(ciI<>&;C_BPs@DUE?Yh9|woMr&bSj2U@KDCeqwnTp-)#c)X829X1+=zB2 z=!efYNxS-?^5Nj=OSl?YC8qqnd&BT}psqTO+9%^XG3-+4PNgo9KwT=vVQ5c&BCu>t=AYE6qw)lUF_Q!XJxBG%&#X=bAlAJGYxn?}Ha?s+fww6i!hOM~JT% z?>Kn+w;6QjipX!u+yA+%6*zSjDSy%9kw5mVWZTQ2**@BW{Z^LT3C7rCV zQM|VHs2GPl<>x}B$oD4-1NDM@L0R|g-tEXg5djLxE6^)Q^w4`^I`uNrWFk_IBth@D z_b*p7EOT<#O{&~~y;Agot?%}hi&jX9y=Fr3wi=rL{;}~U20m_jdL>ZL*7&aw89CCd zv9*~vcxbhp*zl1(@=nn;#@^lXPR?5-ASr}pHvif$9V=dvnubnJ=~qvhQ|%0|4iY|` z91`9-lrq9n$~93Cnm>e(QaghGs)B9<*VwLNTyce|!|)E+_@OnN zV}%*KNKFc;Q0wKt)P)H(x5YtYH^D^%;KPP$gwvqO4$;((SO-FR^~S3j`zlIcT|=*x zm*qN^IA+x|mY0v;HQQwqTVei;skaT|-$cY;KDzjdq1kr7jpau}WwfmsWLs*-KWn#o zue5#S_kD4l5x{m?N=6$^k2~%|4f@VJ5n}tuc9=uWhGj)(on20QroRG+tv!t4_jrLo zU{MeQ!RjKIdY-XF)8+Kgh8tr*U87`Mk%sr%L9r2g7mruczqT_VVrQ<MGY>|4T(%+5 zvV5ZYWUxi|Kq~0n5Iri!Ea>=CL@<+sTGjg13a{AoILP#@+3GasL%B*VtC+gup3`R#D!tAJqn1 zPg}c4l#srTxsSX2i05s=jrx_he%9)13va|^QCqknkQd~^b3a7&~(-(7Dm&SL*(Zp*mtp#A0 z&9?B?L(NUeag(AQe@f4o>eBiZ6$KeN^m5f>QZJwOM3;Ye0`=R4 zpIwj}kX(&-F1*twoiafMy;=t*8TbPmaksVULVrk$+|51luL_WYO%N&W0q%_^As!HN zJ1fwNd{f_t=O^=;S@IeEFnltFMqvHYZ%_exW>X2zszzjxe;+4T4lDvlBWs@wL(2=8 z)B_o2xc~Ss-FcE7HSo{GbXKsJ(e&SW%3|e=NJL)?5-0K6r#uW2CD(mxh4usbz`de9 zq1=;IYxo8$y$$xp5a29-%T*s0@J0}g;Gv5Xo=W5Cn zYRtXRfE4xVKZgd|n1!-^v5NznY6*>3%OPjw%;nu@q0X}wetFm?3+if(>}9T-SWs1A zi}jjk5J5Ep5IX*>1QGoC^=fty>ZP&l9>yti+M~=jPgjnmQHe2rri3hjKNGzPB7{@Z z%EQJTkioaTm;qQN6GuHUI3owng%=JHl$?t(I%|J#jJ7inpiu8j(8`%F7G%aw@A7z@ z@Kk)DA02W~+FUXaKPOMNJ!CzY;SSEJ=;cE}@d>exY~ zL;6$G0=fJQaZEmKt_%_n??p|wcdSCx+7A+b;lyQw)AmL73GwwY!F)X;5kcE`7L1$c zIIG(!kiq{nWmJ~B^V^`{a`OP{7u)r zmo!nL#?S_6l4rgM)N+Bho0$hzx*J1Xsw-Hz8H?AR_JS;OQ6~8>PkKYmE1)si3!pIp z;IzhRFF>82)dzIiyANpuX9-evH>Tf9K$$$;JnjW;f>3e{x#&k5qbIwejifBi)acn$ zle?oQL#xyi;WM-n@kxg1*^|YexWE%tdf>69H0c5Duaqi?uCMWf{jzsx{;{4O+trPK z@&GEhoq<#_`s5mx{%2)r$b>dgQ{oo25+e%rT!|V@#rEh*LMLN~GA+B*{AdO1ZAk}m z)+xIQBes&ZrF-_bE;qO<+%jkr*4E=M>Gp<;U91ypT86}^(cxH1EdJvY-KXnc-qjp) z1=$>?cp*-H5kv5p13Vw8nsbftW1ieOa*X{_l-nyza%>u0D^af*fceDrg=UqJ3xj^GqLYH02?4snSC zd^NmHeC~^iY0shNERI5`Xby5$Zw|r;t!xvpgZ3=XF|3`7-%R*((&@Hx7=%AXKM6Fz zHVysahiB3;=)!A{x7*E*v^p~%W|9uAiKjN0#agHy%?edZ45DrN1VSr<8Hm7;XDvi^ zHy&+($Ht3cse~L@^*~_VHz@r}traatR`EwX4JFlx7oq6Q6dt#{TUGYc6t0OQU3bD+ z!WQC>QG+SIDjqV2ZFTPgpvp<-aZv1djMPF9LW|Dskb3k~$meh9$2J|R(z+J7-=@my zql6}G>OH?Vsg3VrD+(6>PPsa$-rrf0wP-grB&FG8yE3DthhDbU=^y65PQLLZ)QzWm zz=i>pd48SiQ4;0CG^Nf1+2_;O!UnJXI@zuX7hRkn2=yXkOmD6@Q_jS$ZdE|KyuPYl z(MOUS@*GUq`3?uwC9jPcJ{=Jf#a5I{bNm?WqUh|CxXXvv0Swt2C)SS{9YsQL-`22C z)f69sqZcfJmmeh!t_?;iycUl)Vd;EjyN0RuMH25iWdlA!Wux1D9U+xX^(TeqE_+7* zgcB!*RpNkTw}#p{%+Or*DdeZiE^wU+XJRo*td)cZC37B;Fv0+Zd<0#JkU#FH#yKyP(=~75 z!t5T9ZFldwK=Pfp5EXiqOooPhw9aXOY~QKb0Ey3>8CNn-hmN|)4v)|LnKh0uu3uxX z@0Nve7+UHlyrj&EUgB}WnJO)R3fniXqtk23&>KAq&-_3Y&0R56yXb3UNRRwE8s@$b z3%qPN?LfF7!#_l{MRs2Jj1fzx0+LQ^zXcM8+%9qlEsy1|c94CmI-$3xDHYl;r~N>g zIk-KiD7xqQ$0)1OS70|=_)oOJHDWEyx9g*z(+g60JqI|Gk~NfbBMmz^b}n2{^#aOP ziV_v!XtP3enOiOqFU>l;fahrSI7*<>$@F=bqs%B4q}RHp?q*snlAZukN0F)u%>{jw zGu=ujV=!Ca&AVWC#v2XWDSacZ*X&c~b4~djw{1}jFPj}p(_li!6Q;zp946Vg5RCse zzXUmymCaBBtLdt*#eK(2x%PEm{1P}Pzq z_O59_=139Sjp?(=LYC0L&{yI1L5G`pZVApTOco1#-U?neIorlbfWW0cdCVYH}RjN$h8w=epV-jDUgP2i)zf{f?_& zE8`jj&)Pdr%c(1}`D=m}V{M>D@X3KhR+0U#IemtnaU9Q3N4ye0!BqPnIp>hc=I$p! zW}ESchR1P8nh;vm+H5sZPRFh;HP@WG@7Zt z)pbGW8PaS-HOFWZ{4zd{@rpaF)q2}BDKy`*uIY~q!g_GcLctkQTz~6yEeO%TN>%aG zGJgM-ORiY8L^TJk%;neVwrLEgC#Ge5tlxCY_}<2Q`~%{rqEw3K`~xmv_28OI_RBR_ z4Bw(lLGoF58Q+}DhL47a^GFk#a~L?bT{G5D&d8!5MbDAto@&(HeSBNV43N{)a@X?W zn~Y{W%c7om?HrG(tXkuWX9|3uxK~zmT{zI`sP(kzdmR21G^934dE z6B~pzJHb$3`V3Zn$XIo_AP%J@MkL+G)jamF7H6 z?)HBmVr^Ug_{#R+zPPWs{(}D(;#hkjO07ncS=1nn_ZOp&rBVuHFIOUrceYgundi(a zMi}q!0-YIF(Og74rP;%A9MUNIjzP*G6V&To2pJ9@^qtz)S6VjSpOvmisY?`kW%t(R z^yh;*e>yZ15YvV(i6A~I+OXjB?`8mK@{bRmJ|#JXhte{3wys4*x>)^JrtIlOW%USl zPys>&Wds8vO6uxUDsy5gvZ|K#o=Tyr9dm5HJjyberFuW#ZWWVN|5##-57J~lO8Lh4 z2K~Jl80=1qr}vw=;TODa(qTR)2;e_KfcgH{B&o9~Ug;-FrP2O38`-f=>wpwj#&Azx zOH9@~cMRCtl_M199t6{KlU2l+XfB7_;?hBl!b-FRo9uS+z>0x>@kpEfwzI+ZJKC8) z;WtiQltb;t6@5Q7A-eKbLG<>ZOC_E*fy;erYtOTE#~yg z3I3y#T$l#%?rr*+(K&nz#15Q{gP!+AIWy-?WqkxgJ)R^El~ttJ-mc3{N}J}IC$?Q| zhZj(6Kbxg8FRjP{mNP=REc;WoFsXQr^eT7NEDR>)js3kp!vRBY~0X#1@RFN{S+V`s{N1ZCs-*f-T95(1*6sedSZR1MNH!CB95=eMvRwX({vMz z9s><443ovn{JVf1DMD%)DJr(O@=Q_86DGC*l{C3My7+!S};e$i(|x_D)_!Z_tIiTh)$`8S!;(4v!BI!uS-SZq}>4D48Fqa22wZpg8}gR?JCZmJk>g1_@A70W`B zSWAMFWdCxD$%U4+8FPxs*?rb}#4uT>8q#2I4xg06OBoEe3r@-?w46UnvYC}}3#DjeNZGWj)|E9r&%EtoN>OunpT#jSw2G~y)`Tl9zV(pMTym$2$-!-69LCTJ z1vwubN0QYjLy8vNA_c;x+rx%(V=oibM5%=Sifnl~fJDAE%*S_ZDn9`(e_^rE8Q zx3b}F_vw)o$4@Q$XC-}2v>{F#Q&#zyC_l6$nzRKO0t-DN_fBCo+NqQQ*-A{ixDy^l z7_A}*>pX*mA`JK&k4_TCv=}WjCNX&L7n^!5rklk(9v!F#oID_vMo`X+P3)g=Fe|>E zum@p@FXPdLt}Q_Svqy$h$Ts4?i?@=9z;qA}s*pwV`Xi{7J<|bpp6MjZ_Da!Af0yn^ zM1eCT!zyBN%7ZEX^pIvlI@wl zy__z;0o8{Je1REkv|`Pyt@jGu*il78oa{@b!K8NqqgZ1}S~D)CNFnsLml$`!in6>e zzO3RK1WoXuL{AS8j2rk6r#biTZuh2wz$Oqp`;|CT0=g?~AY}~MoK2-IP$28^ZpT4o zXW3}ju*vg03M+ZBmOCzviM91u&Vi^V-z|>!x2$QCJ!0>3m@z#RY2%&ttjv}tSJJ+> z$Q>hZx6cjg--;xyqzI<%WG4qcRz&vVwvv+rAHIf?uA~6P63WrUrOgpSNjWP~gJ~7L z&{i6ahycpc8iW9~pf%FS$yC^&yc)>z;`gnn6zn zHfa;lBuSQlL4>)q=)pg5tI?pW*0F&>lLx0Elv>eaw0eR7c3&LWE4rptD)z!n$lziOpLjU3LlWZ8b32C)MDymVutRm(jwUY@30Km7f$v|AM<= zkzW}_Wj-C;`t z=jgwKA5G7lQ+8JYgjy;%y`A)g=&$_3#=R&^GaT-8B~>mq6o-GjgCIo3lw2b9Y?+5Q zsGI`r&2Av0I^a-QSxkxJM`El20Cu)s_dI6iVZ2zoHiMNrLt`P)uuBRN32uusE@sYJ zxPl+Z#fT&DOTI$TXkj7)CA0 zLq@F?UHiIz1df+ga zGh5Y(tC`uWkIoe+A;^NJNEH?Yr0QnsOD?0UD^3|1Ts7-1p&(0Lc!LxXnu+x z8P@R~_mKKFt`ynXyQc1P+T$}L(xDiq7%M6{FvJTj40jURXZMFrFpE>4Fz_MR5A7W0 zjdZ10M?AqLAQ_Utibj0knoQE9Kf-w4{~67&&i)y#5NcNY{jmB_pHW>LuLnB@0OL!K120q?CDjjg zF8V`-FedarFBbPls3?m({5UN^j|8A7Ry>%oxF`W5J2ytWpWyye7y#_Q%C!F+^8=+5 zwve<);&9M@6>na9^Nfv_5dk2$v@(4SUM0ci5=&|R3E!`%hPl#13xUYagzL);f!I47 zZo}grUt%-PjGJZ0z`0gVYkyi+p2&uks>r|ipt%TmI*O^JTd%ClE}SYav^Gx}UCm){ z(fquqsi&#AXk!^U@`A zvXMbR6W&RvE}rcqBoqGd#R1|ga=!li{IWeWzxtECUZqv@`gZJXgjhyZ1@0bmOvx2L zTZrF$Xf+zd_F)w1+`hN4(e!VrEQ}E|Dy)Hb-aTb|E~Tx0}6)S~zF7OhhnQ0=XUzSHH+RbTry zycSb5=O;XWEiS{Wi418X!DW`4diwMSSrcACPu}JAPEUyuGjVe>C_@Bgta(j!Li5)G zEyfQ@qHl21RhfndVckBl;KIULebLQSfK20((tZ4G)W*^#!31!)EXSZ{Fv7d6rzW5D z4noJ<&lq&vDt4EE*1u!v$FY0mgm^te+2Lx&E2yBrEjUa3ByN8z?gy7KJ$=e5`J4qp znu?Y)1&~reL*6FWN~X{JT~OBf`0dP>gDYn4=tcgi?T27-nxaKj5#Ger?dMSb<)PuD*AQg|M2hy zoyecS5M`!wELza^w0`uAeOcbtvk|@wH1w;B=Y{l6p0-K6jisS|x}q})e*eRx>8qz6 zZfxb*7I&9*1-EwPP)k!@+X!ULYl+J^Hi*lT)4C|!*}Yz6bJ?cliREl57bdr!cHE*6u zSpKspCKLH#Hxv02)n2VL$H;8?)ceZfH{fH&D4zB$t>ZRU zVIAT(#?~{<<%rt0Co;yobmKQpOnqc7-59##am$|d!L_*g@f)j0=p=1s@_zeqZ0`HY zuPuBoG>TI&MRm|CC6&q)z;$N2lB6$OnaS5bLy%XC4@{%8lqRG~q`uE#w`X<@#I4(g$u5hOKvdK@;d1##2wbtg*Zc+2goI(i)C=)t9i;L zgRf4D$F5-TipPMDNz@c8luFVGTE#i@mB#O!_%$osHtdB^ z;O?0USx6%V$bC%n$f2`u>-+@5`$8gAIdS}98rPzrtpT}2utWxH_j zY;5&{hkkvbSYMn>y*&cn95>0-$wS~bOs;wsHZ1-HlPa^JmwZ(R- z=aM0=>r#BOm}JI)^Tj2D9$cAOw?E2TOeMoF=4H~fkLSR|H&<^pmN+(_Mrg)qKUSA+ ztAJ9plr-?Od7t&a%~H3|+Wo~PlrNV`ru`f&*wl8+gx+M#1Ol8^|KftDmPY6m3Ff`k zvJaH`tPls!V?LI+%AH1N!fZB1r44pS?8nfnJv(?z_|2Bs!xDq#?iSN;CFpaQeA&qc2 zg-YhNrEVVx-Wt}7e9d$WW-+xFNv8BRV~Kq5OnkEDcGS98AGeHdRq1=Hk^`w0S_kfW z+J0(UUOGkHcJ1}$(arRyg?6qz(e8jM?H7u{FhrSzZzBPSRhgj_vNAI3Z#Q}ilCd&9 zsCWK+XP@|-MV9n1^C@bM_CP-53sYxq2~^EwoyhC4K)WV)hz!*MV4}P3Na@)<`atEWOko{jxPj!^tJ{(4}a78KM(-Iiep1SJlpmh7-tZ8;XIK zigdvYdeO$g@z1@fkyN#g%IFvtDs@ZRA<@-F)@tbWqs8QI!zHFgiPG5rV560VVA!Z> ztWP*L7&fY_b~s;xeG%AP1+mN-EvxkJQ7;W%EsEsc0ZpFhWpi<;ZkdGQS6nDLm|v&sEHrXN1;mKx%(P^6~^ z`ejGUeYVg*XSLU4la869r?$Mb{e`Dx)0fj+nbCSwg6&0KTlEu7!0Dx-srn}&7+ZSI z5X=u96Gdk)rz&Tz#K&6P?7FgQ3#Q%-63sO;2`A z-E!MwdHT`Shd6J{ja^bbSt~o_#cRLG^V!>UXn44B%^n7;Ei0Kp1bd~Pjup-O?yH)y ze>j%Ir!H-3Ht8TdzCE8cJHDyoX?H1V|N55dH~?||SjlN)Rsj+G-T(cuh2%rB*NZH0 zarf+-ed?lAI&SW`W?(ge$tRVN5kNOoJYCL*viI)7w_B*9F0%!?D?E>OfpG9qPF5%O z3$g+gr>J`?jxJ?QIm@ugquI1zF0PC`mna!OH5VlD^}HAy_|o#r-PJOnVe(ny&lIex z;``AaZ9@xvwr$I<=p zk3;{FmmKlaBksiSAjgSwaSD8zm-A)X7iD4|vPXNl{Kx>!eE|d6jBIF%RQUyx1G>kR zC4bJXu%&6Oz%LKI@+X4hT~>_`#jMHF{SLosY4^O%{1wO~i`8NL-=+f@W z@rC@tWc6L9GSy7JD#c9kWFL$tP0e8ihqct6|g6WQrdTgTFwaa+eC z8NgK~x-W~_YAo-lcxy+~Dy%X{CG_gB9lY`W(MV4Zt97rhuE;K7C}&mE2DNLDvE6S* zhl2_k4c1|agY!DYMj(VWm78NUrYRLt%+_2~bsZ0F?dE_D2R2k28oHsNfyK#6 z=ne|Qt_o3>5`_3HP|rkKz{y0K&xrUAc)%tflkJM#~ho;t8V+jyCHw-weTS@E1*v}gVQ6(nT+DJf6*g43EzV?Dh z=fK#5iKR$VsA$DNHZsyfMj&HQW1-?8sEH&)a^VpAkk&@1xS7jLFhOZ$4dr=C(xsR& zLZFS6i=<++S8JJrU?t8QcaN>Fbqy(FxoJ$aJ7mQ)Fm%m0FrgmTX_@;F%F>Ab7E~wt zsP*I248;J0ky7RSMk7=jfL~rm7U^xF1QaA#3+`piM+TNi|2h)93#v? zi2khOqd((IJW08$WV!3E%5G-X%jV(!AhN*1d6b}Aa~$Vqd98Q+nqGiS%I{ltdRBt@ z$0nv^CZ@T53~O&&PIxurS4na#x=1#oKT0tu7T$hXXW?5mEBEln%xi+IEwV0bTp? zHyo%Fyn<0JmwD4X(i9n1gw^eQ;us~;0Khux9Z&MgyNFy6m%cuR>GkRQEq^W5dv04J zT>iHI!=Gg%zc=WkraOhNx;^b`#}RVnap`AAJuE6wizP9|QgD#VNJ&tW7UQqw=Y#3s z?@Kg^tP&(S#bD(m(JT`ru?%75RuEZ2D1Cz_NTXE2iQq1zBp^3FC3Zd5U`ysn*zwfm?DM87oH@AmoZ_NHGRRgOb8`k z0CQW>!>1%$Ej_0)O1lS`x9y=u%+~x zn`$!=f|fK7aFZZctYhXOCM(pTel-mO0u~mYU;$e{J_QcrpQypTi=%gmv1S zbwdJ9v%N6-kHV(UZH#l1+t~()(`N##8oY(#`hgn!v}ez1sh%&3Vzh875$pYF#yIrz zd!)9p4Ia}j{!fkBHOcQ;@n>GJCv^;X^eKztzLNi7O$s|V6p~_Nsw>a_&CpO{BE&I13t_cfrlV4wwN#NHjEO?3 z$sW;1{+>O?DEtq!_D)_HYFtGgO@KNc`P-(^T@wCoRlKkmx;cq9bXmO+t+`2(tW|+2 zw}Ki0{@tZ8!=lT(uys#NOidO;8!ODbyqH$fNl8t%JSJLvR0De`o@zY0AFjW>Fk==o zOezX#FjeZNvtzC>V^iOLcA;EN%(5cp_Z2B4OZ158X$%G*J`Ubc|B?n?*p42RG#g3b zry?DM&E@^vD|+Z_B4AD%DTc%{P^- z!UliU8WnfPh_OhCzUNn5;F*;WY9l@W*fU*VA%ugidtUZ3)yL1eg86^y6@%&)h2V*TV*h;K>|t-(c2FX={w*!Z2f| zmzq>k>=UFcdI@?UrQs*Y4NR6;2>%EL3J4Q$Vz7%73XR7GKw9EZQ6H3j7J`N{4-~~n zmWF7#EJX(c6O||+#zev0kLG|ov&WoJ{}!1OD%4nsB9H(TTxip10|4HwazYhDbH_r2 zE`#|sgZcfp$Y2@p@0?KJMF#iV`)?2!rRag?^sx|3|fbezQ$YqG>QzFh<}5-e>% zDsEOyw+OWRuuX&9fTs7F(}lgR)`eYJg9HB~_NVc2Av?3e@7)fYVi)cHlwtvq zvseusoQ0h57d);83EfjJ6E-nmqDccvrh1#m=7)yiN1yS$v{nF}J#9_K6 zL*@zRg0p(ifLtBrNn?Vr_#Y)T!ctwkwjzB$_P{fz_1VOpoJJcZ9ck7Nn&wE<7`qiL zFE}ECbd_;RS(o@La**{0xvhk*4}RrOcHTJZDFe{=Y4fF#voq3tb##JFcyP=~j!NV!VKBLttC*d1wzmvkRw2fOeeWQn8N34Yd*B>L$a^+#Pt9(?0qNC)s%rlSq3 zoPkluTBMSaEP9X?!%NITlxOE5oDR9caX^RVvrHkO zc%GRFDL978)#eHg{6ifpi&QQt7%X*htX30JspIe<$~*$tr$N_&oo~E6NTEVJNQ3`T zECH~G=D@cBwBSYL z5CT+i*{3RkaMthc0|_WPV1C+Qsv>|527f}x2kv;i$(msS=6~tA1D0Gz0GmX|gCtoC zetrcv1QEcBq&W@r!in=BS>A^dgpZ1zhxgrGZm?!-<`7Ufebq?N|^`9DW9Vp&^)N{b=Bq-isy^e|n^Z8HJ*AawA^6?W*BSWjr zAcy?OaOXf1Yic*i@NIIyn{-`6PF@qwB?!0sr~uDLpnP51zepG;ZGJa!rY9iGa`oV zsZ9?i^BKcZs#mr6qd;8Eh(DR-%d;kWb|u3Dj0UYZ^mUX^*JA{GXm--~XN{6%{)(gs zzACsJvy2t=P)2+t5&r+o*6EE=lEii{;g?e1x4%rzjm=i$=YFkE3XY=NX;O=bi`_F5 z<>Y+Jrt~agVo24}av-@hJlr;-bz9n>aHzq~bj+omrbG>KG;tiHwtYKV0_M?Fq7;KwIDhWd<7CHG6w8Ps-0Q3mH z4ZZ^OKc7MeGr=*F@_H>nhvHB12ZGhXU$}ybH@JeOw~&A8;0kWH`wWh~^yo#*^L$ZE zW4tT`;(T5#W$i}@+TLssDitArr11TbsJ~LHYwdCW7GdlAbSAX!$cxKP!*nV%Rl>u* zs0RuWpA*(_e8>;IRVXRtweN3imwXBoZfmJjP_b=I5Q2}6y`)OCw|rN~=(+TW3V!8jb7 z_^E&J&21wI908|gW9lN8*>C-#OU=S~lTV{CA+%L+uqMNh&~LaeMx-q+QAeFDo-CS= z*t;WPE-`g3w@=-fRQGGj!+fbfRBby7PuGU^nfiWRtZpVo^)*>4>G8PFYe^aK_-#Fj zk_>Z-Eq;`5(>>E^}AEH|0h8*0r#s%aavPCQY%8&q=?fVQwK3gV5 z_}mN2eW6|3$I#4V0*2Kh6G$FLuYh5-^xXe!b$)IVHHdrx zt4(Z0Pt$>PxqzMLpRLZ|mJGJlHQR%rgf&lLg=)*(Lty?38?>$=r-8!;`dtJ7Zy3jX zP(8L`E#Vi1Ewnxx+bvO?c_~MX?1BpK5T5I%^M6Zo($Dk62EP4!pW`vF3t8ha|6kzX zeqy_qrFW^P?>iiQU=<@; z$U4+{omQGxko8^lF2axT-^cQ|kWEVeuZeSyYU23fI3yt^khBN&1*n%%ms7TSH z@|$3P=luTIvoq)3y|Z`D&g`81erD@OiKcnJ-c@U&;Tx7mZY$SE-1@ND`DWG%pO!|W zl2)TU&oko*e)Xv(-MhE!{e@JvwHRvIMKwqK!U8xs!TaJX@I)UpP4JQ-4w^cPBtW_@OOjmJHfp{q zPIxe8`Q~YYc31+#;c(HZymW-&l+1*P4uwQkyIrG@j|)bt4F%xPe)ctBxn8r7VF6#w zN)P@Dr8eBTTqe2T3sX=rFr{<=c+-V^0Hc7)*Jw3(PovaVV%mj#z*oZ*z*o!i7WryW zo`)df$u^9F_udSv*uGQ%V$J!(l)p9`%dlD&r3tAMLqF3XA`l2*N(yWwlPo?2P7tvh zVt#E45rtfslHiU(9c30t*iDbH#k{%iLErUbi>W=BV%6uJ!h_lTrcux(N$QO?yqF_FPCh$T`;&gO zy+)t~hR2vMZzkHNzluul&ahRL%=Xx5u{dvy!`5FD#adzs(pSRZCPhzBf;J4~5cT3k5RqF|DF1f?uQbz5EJ>0xh z&aB_VoCZT>v=pZFTB$JC*`oDDY?qX=2-4rEP$@hxWJykB;w8Ck5w(WcuS?r5@p@vV zGQMp^Wmhxtfi-Ib|*CA%#qT$-KdEElti8$8Hme7U$a=%FRK zo{4v>XNzjU(kqKCni;gB_PmH7_3sA_8wbiUK6p6>ealLf({H89sP(wzI$))mz0P)V zDjc?CU8BmyJq*V6Dtxv3|t9-$|?tW+~eib4YFEWoz z4{vgLBx%)X6I*0~iy(bL9wSLs?V2rQIVSzC71e&%s|i3nq64g*>hHMx$Qriky3?nz zi9k0p4Ya`HdiglFlXbUSaJ8>vF2c0g;n%GkicLs|^6S zBh!EtibtX2?1zLo(%FjUdE5HL$UXd^T2}zl5mTx8yy; zb&0ueBQ&r%w=cXEbDeJerurpzOf1uKH_x8pFn8eT0-s==T z<0X?QW>YOMSCr7@Yp&65moB{@f6pYiWc4011-=%uS_JKeN~dcv?cAgQ(U?I{9xP4? z5G|eDk9WJ$Mao*4d+#(s6NYy?BwIuFl2Hn-@V5#fz^I^jJM>7n)%N+@$R}Ob8cKkO zJi6AfbG~pJxna4bp+tJn2p8&h6)&FY2EO7dym+a=2)8;S|3@2e$=J^*-mM}oil-T0 zDTGQhfWZng}ZNo{fOcqZ^WZMQ8jck*q<bR*`epDvoV0?J`ntQh`rZl) z)}Iw4GK8a~ANNsu*cA=4x$r<#*5>+U?VeMeSG<*UOH1b!S3a21UsjIDTq;J+zwnzn zKr_8KBC`g@0xxBk)A?bxR?G8Knnw*r_1BgsO1sno$56)Q#H;z?^~=iqr#dhAE9u%v z-AhToesKh7dBg1HNziaeWn!rHxx`Q%Q1-q!LiQ3kCTD=yk2y;G_dfMF&Nbr;yvXVC2^ZNum~T-K1Cl*Hh;D!Yj#zIW-vscseVftR3n35<1?0p)P8jz4o zZNK}nS>UCBSw=y^?U*qW@{X}!p|Ewj3IVh!4Z^*vLRM;&sB}0^54+v^Wg$Uxvtt#b z1(m+8$Aqk@JEn)N*39V>N>Loxut14I|8cGwL9`q|H3A=PMWF+~0+G6|a$u-zhz8A% z17DX1AZ|VI*FFv)bj*T2A2Jm zys!0HfCEU-!|Klun2=|Y34OvB4uA$NasZ&^=fL0oHuoJ8d#DxViYW&v1hDoAJt8Ep z^$CCjph1fqKsEAckprkkvO7SA+5d6?H&EysI!E-d$G&+E`lJrTI{?!Eypsd3)N%k- z$S8a0k+*P>1K1}N#?`i>(tnw(Lg1Z|prP^(u28szSB)4ti?^Bw38|9@h?eefkL>j) z0R^X|cE|DTeS_CrlI$ff9l6gy7{&ZF{86=zkCQrWwygmBmtH3Wx!^#ACRu30!_4n= z3g?a@yTE96BtpmSiSFu3KH_h~AVPj&6xcqLl6}}+ofa?pyG=lW9?_ovx_6F+woe{r zz*!15dM-AS0uK~o$%6{rMUDrYIO;U0+=1wx-yZ9o77|W@t$7q!rvNMS27X=&-Qoz?|bVIaA)4#bSCerrhT%5Ce~@uDbE*LQKYRxm^$w}iQopI;r>(E&r`0?-^?C&e ztd;#b8L+&z|1o}X3Ep&MtVpcTT~+2}T-VaC}M+_8QK;yNYxw+!CL n+T-#Y@of0JGua#cJ#xDok1BR$_uD+rSaAGU>V6UPR#)f0V?9EV literal 739 zcmV<90v!DxiwFP!000000Ckp4Zrd;rh1d2edV+%gq`VJMv_Td{R@qS&Z4;3wfufq! zXX~yP>o{_1SsZTlJ&{AsnBwdT@nH_5f`&<9}T z`s^myZU);NU4c0tqTip=6pjVOw}+oEFAtB84^IVwt&{U@c>$=cb0HPz_vz?DbLz*z zp9_*7wySMx%O^gdDK#Y6k|Pw^I>`q=nnTkpbHI%OD#==P_<#uQvEsKS)a zMVNjMIC=i)G2Ja2NeQbA5t?Zwl#mLoz%aq2Foqgp3^(g$C~7p3gv&}(#;B4DF9^!1 zl(Zt0b49ewtbhn^jYQx6Y(oxj*OIgAt>nOVg{^8RBfb{Pyf-W(S`o>GV3n*Ys&dEW ziiP2Fh&U<}%?)Ljp{>Wk#W)wspkYv`ltMt=Lc!z)m(Fi+8l1K;@bjyb2&FiqOmN9K zr`ydh7i%Ay622^qg3hOlMO;AX!M+#B|M($<60Vr41SdpNLnXUUAOznc2!ZPiLL{pY zLKOEYgzd73Ap~bRlxya$^4;y0VCuA>no%P`Cp5IoHqnivO*kE=Vd&%RYaziZlnPxD zW;8gfj{Vzs9g_?;erR#h*>QjJ!G+pRxT1BZIHu`TSYQeXEPClciPsLcu#GDSUi%1# z9{SL@aKGL1Ze3 zU@9oX5*QI3qwP8zErvvj%b-JCXd@+L^I}ADv>K3HaA$tx{hRf3xHf*SKoTz(-6B3` z*ND%-U6SW`>dsRE)$Z)zNYS^!zp|hE1{tK@V#v9koo~daA4~u@FYqX zcFth~)y41;0y$g;saEj4pb|{UARm}o1!;~KL7KxXNOQ?LNRdUBIho@;Cv(m6hYc_D V{~sHIv0t`a_78b$v-B7X002TRW+(sv diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz index a19d86310490be23347d6094450777a85ee0474d..6051dd9865c31ea70fd335a99454dce98daa368d 100644 GIT binary patch literal 4376 zcmV+z5$En7iwFP!000000F7F^cHGL9&FeFq>jw5LPfU(@C0nWWSpUvMY-kr^c32x1e92f^<@?zv zja@ZZHICdsV|*~#_IOB1(eQTPswtSpIBa;-+;3y#8lvJm` z^jOto6-(#Kgy*Yd;+@@^Z1kkgQqt|~!}kXJn`~B>Z`CL5+Q#f`%y@|IBt$RcPioMO zQ7?6Ab~=_RVuN?q#B5RucVSZJrgz3>Ja$q(Y=?8xn!+Ea-QgQ9j+|i`psPkNoZU8&#%#>5&g)L?>LZp{PFUX;QBuo}LkJZ$~6A zT0Pnn#;mcEsLm$stofz%2;>mbZ1?KvEzIpHwYqe@>7r#S4N`ab)^&I+Q8Sg@5$PZt8qb{r;10wBPi?o&wdCEu>Qo4zHJ6eHUR8G%1%xZMnM8+5?p74WC zkHuwX4CyT$zd@L~6=>Q!jWv3cg2z*E*H4dw@i+?49D7;Ewkw1m9aKGYQYA-yW4WLL z0(}YyQor1C!Nd@s>YV}d6+UG(00DSjae~RHS|)Wu^)b6rj;f3GAkiCTzRV_CmYjUG zGfg6vP)zk^Y{59}N!{6GQH4M}KUv>}IO%LhZo4a{XSkw94r9md;!Yj{TS`W?I^#kU zP!@u0K0&A1jJZ}2@aVKx1t*O1Go{B|WJ4$@xiE#cxVrlKn_e@YVNbfqCHj(7h`63@ zO{)e_P`bM0z?~-6Ox;SO{iwy2L}e?djyZ+}ny1_apVn&+ zcSGPH#DfzobF$Z@BEgi7%{R2naK#>ZL=+P;?hacMOc155y4gbT5a$S0lmIBJZDVL@)qz=k#!-iS18#|f+P#~>nf3?Kn0 z2A6CHv@mOkaNTBMID%X@jcp*dlbKnEhWv$vNM?=2&KK+}kO;tr2vm|SlT_rglsCg9 zk*kf4pwj3AY)TDI0h)+Q^Qr(hpw}JiOZAPi)~cQ6Njx)OV>(#!Q7#$$AasJjB3bf# z>~}q;go}k^`!zc*B&E08P*8|xs0+}*Gn_vU_eszvoTeU;+KJQDiPZ45@S4!t85^rE z3=*IV%Hs2%BUzCN&ovSixZcg-`*y69g67X}j9<30wCmO;BKsrXFr>(1ZVh<)7^8xTz&TKg4H^ih9d3oh^zx*3w|t=T9eO%*r=O z=r`fU*MW6OL}k#G*8mTCH+6~&CW-Kh!cJKv0JfwXa6sJSz`7Y&W5{=Jcz2fOfa4K| zy}@4VI#TkB@KYq{#O90Z={`}LAY5dlz{58>F8Jcw2ya5cVA%n+p%yYu#8oz5ko*su z>l=lJ5y^gwMp-r36nxgK-B=GJ!AtiZq#pdj`sm%JC)h)+k&%!h60zD_?NN#(zL&rb zik&yt90EmblDW3T2`wrj+;=!4`bhk&K`p7mB8=+e;hP&pL_%lbdSNVBHT=3{7jO?M zkN{YI^o#DvT$v&Lu(?o-C0a4>=*i|D-B~suogK|)*%ss6n~}XIALVM5pq;28kuDnC z=dfK7LAomLlelfs0+EE8tttpSvBn-y9){`R-|Vcvmcv&-PCy#f&tU4T0lkD+HcA>M zyO}`YGrLznCB?EuWE9kH9tu0;Y66(4G81j6@+3lgRjBgU4h=|sL=ti52rbWt+o(mG zBhVeO(ivU=H_Z8H8kYL6!4mcMjCg%uN}fo!V=*s99LA?*j9mV6mHI%q{h-@U94Bhj zp}Vjpx*qZn3PZgbf|9;RAM%?9a_LUw<&bqFDhl`?9eeF|z_xTD19a*)Vu%>N_e{O5tVM^T zHr901Wu0Y5R}~sgi|pJEfm%2)zKk%jf+Tj8Y6cx!*eHr?0N1%HUeNZf8N@PK7G6D~ zn$ta=MokD33wyi#DQ*jSp|Zn`rpswj);PST41@VoIiRJod8I|20i}h6vLYW0TM-~Y z#DiLNA5j`!eje+D zrG~a_`Z(%OaFFa}s}^TKAVn3lLCiBmFpWbS14Iv@M%?vg7Y6(m`Xt+qs+CqHL_Az- zatG?JO;403IV~LH&!8*@9JZF;ZmV^Lg>`6b`W^nRAo0pB38>7(5Kt zc#WqxVU)gPlo7tE(;Ja-^3-WA>C)ekV#Ka`O+&i4rqvBeeF41`0Y(T4=fwH28ze0* zm1xSEeQ49!fk3oRXbMX%!S7v`#z4VU>ZpLuJ3VrUV2!km24pk@dV@XP<+}buC@DKF zLI)uhXH8cDbyNSzrVS>zPa2S{#RqEf^VVpH_Pr6MG5AdC9Wq;y}mPYnkpQ#$EK$wtU3R662Nl|%zQLM?~zPy(*A$=op-)o4{U^ppty=WqXw zB@;~(P9cLU#T`ohKE0Ve1ZCTD3{Ng#%^102&CW2X_yfJet52*HuxW^98Gc)H_%pr- zR-9e<0IXJrOFX-1(?xyNbilrH-m#1hC@1fc0>lbXw7$+H#giV#x#Gi6o~q0YD0Ttq zY8-1yI<}!Ut9p>Xrh7w3E5X6f?IpOkBoG0uNopIwHA0^;e7UcjQmD+2CJ;`g3FyQ$ zzGr?%T}_!v^!>c025ViTHy>I78Q3rZk!1T{sEvXDpnf}#vOL?a{@CPm#5kB5K28Y5?Ty;)i2p~+gE*Kvdd zUekxa1V@(z3`KBU_tcogcj>2R7+RmiexMH4SqJxJ}GnZ)DAIyFnj`rU-t zq5rB4^^z!9_@X$=I|5%{c>Mn?J~J z(15Dfx)bw3r%z=M)|$i*3>1;|O;+_bb-SHIhj)C$GMjcDBF^{2(D; zg$N>D%*@=>;MYIvV2H|eFfP<-GvB}+U9FEhmD%j_;#y>2Zrw8Rl$k466YK!Sb4|N$ zWP(7LD9VJ@WpO0OpHvlByDP=KcoC3LG)Rf^Qb0D8>qPsQv-68LmseMp=!|i! z2Mdk#DtQc~k=@f60y!h2E4>nqO%%=-97=J3u-ITvs#>)7YJ)y;ic>|BtS|9`tyKo< z$M!M?1+{_i&|C({XSFLlIa-u%lOfm3^p{<&;g|!Rf|Mg~m+(X>$VKpwLec0NzCf6c zW4DI`B?&!3BJ*xR58W#r9w~w&@1=UN!=MiT>AJ)B8mET*Tl;HMat`G6qSrUWdRk{r zjTyz0;5W$X;k(p5QgnAteD_&81WcfMzwG3(vq1tQF@wIe1Rt^hG zL)Ct%Dwo0=Esigk5`h7CZPeR8TwQ*+eevpIb8~rdeo<|P_?RCkXJ-u=HX#%hGY2b>xkh*g?M&L>C!cD0lJ;g2sj|gA=Ve@%Opu1UtzGn z%XoNgfd63X|IE028C9HK82t8kKmF_Pe>=Owx5-{j)7!|6@w)7btn0P`inLaARk<9l`X_WbPfmYb%k zV2PbE`M1`{bPUv96{;Xqw13CfPq1lPMB!s~A#a>i6TTR@e=RVfwnyOKQjEViU#8{f zp&h7YGkV&?obYI$XzN_<1J5YJYDLANvdT@}UHrs1c_`lf;p^bCI=}g_*<8Q4y10bc zfA{M09p6dXz9XP{mJOVV_Vz^ob`390jIPU;2vG{_v=71~}LlW%h?%gQiX-Y&ZO zqR}tI_y^!${x9saxBLDycl8o~J literal 375 zcmV--0f_z|iwFP!0000006mexZrm^oMDP5o9u_FtAcq|HMq9KktV9VEnN9p{z4ga3 za*JGS!5(qm4A~aRPA)~Zrqns@LXy?;dU)tgKW}gM`|aiB_Da0C)f;x{Gb)?;lKkZI-fg$)bZ;M>koI&gLpc8h+yCpPjfmEtqeQ zY4u}$h%FpqO79J{2HyF@O}0%Glcw81f`S1_9;`eEK(`(=JW`HIMGk{_mxU#g(E6Qu z3CP5r0W4&mWb0944BGeu^ej#mn??l^Iv8EdCP4WQ@(#YNX&zPld;JaljsTjM`5|B zEiwFp1apzeA|8!|%Yh`qE0PI}LlHAC3-SaCD+H*oAMBiakqm)Kw8bMDPdB&w>1)F@<84n;87 z4Iq)oyw7tV*{^>2y6A0ftF|dj(|fb%a9LYl3{7df-mTYtVO`_hY}pUZqL|yA^^NHZ zQ}))?yfyxYiPpC1i-j$X9p<*!m}^(pnA@9SUAQ@?;z!~)%VD-G*2dL^_1nQV9lnBf zbQ^DH%Z>4UgOwa!Ut5o*Zd>2iSQTzGLytuj{kx4V4Axs>6@JA9#jTaLX>IQo{kYko z^OmnS^J3lF($=?L-AIcm0aj+Z|I*4#Mb{5=hi%q&>FqHsX8m7_3WhV*L)R6RdAq=S z7E2kbW>{mytygzm7oGLjZf1LH*2P)Znfasr#}Z#zyS})1Qv7J@d1<%K_n633LWc+b zt72(NTRbpBxix-Rdl32BS>M~PYp>gNao;X#>v~MPE}C|>yl>}r;LF{1{{h3iY`KV0>TIJ zGaVkfb|@C!tk=aukI(fWW8+rfj(29|x^~qYEX`*;JzJaF+20Ne$j7qnHf~oO4kj~4 zUYem=*vdQd&Vo1l4^@Q^kT<<8T(dOT;N$&=j@!R(Jvp*}zp%x*YgR27(aatAHdNqS z4|#&bUU@THIT1MzdVKx5sLiHtH^r(p*59Cv6r)?OesxHTo8|c@v=?Hv#-ctcwSShV zo)**9Q1=v5=+;p2A>^WpUp7r`K)HqO+6J`2sIN|JdBXlF-+z+P(_P0+PxxD$U~nNE z{@!s#1?)-FwV>D{3^}Ki(%J|!f3VQ<0(bhqc~&i<*Z6&?#p?Ntb;7Z^KVVvzZw|Zj zO|m=qxrVb54UfOb{)@`yly2eRUA zUb&`f{o3?yl`Rtctp>Z`p1c9wd_@N09nu^O>TEGLJ#LgOHY?VTbD3^qSG}$aHmf#Z z9(eBMwZg3WL6!zG$&IwDbvqwwT~OFem||hDf^k{%c1QN6_xiy%LszsOLwl#nZ(vj4 zywVsm^ewm7(FAA2koA|QS;bkyWW&<$KsRs|+oHu0U&d|L?ZS!A6kauEO-J5!lnP4L z&en}hkMX4%9-wh}&*7Uoh5D94DPFu?*TrUDg)KvGH}F9aNik%$x%KMTR07y%*Bf7w znI1g!z7Wa6y69p-NJmCJ8ES`JyT;lMo~bLIV)YFS<_0U~q8|#Y0zda~M;3D0 zwBC3MCEZh)Kgy`nSE45$83EZ{^}Y?3W0aw`>EgY7(NqF(bXkjDRNnp$ioS~@@r7}~ z(F!})G}kS~hW=o{3-X;}>FTNz{|J1r?T%hd)QiMAbX3ovEc^GMk@X8lQ9d9hPz513UR0q77iL`d|ef>!M07S1ckXf}CTUbustZY3Ok3`c`7cgZYCY2@EjJg+x z<*p$o!+fC@KCNbSXu_D)U?HMV5lS`nR_=oHo#}U=lJ{WF04cLen#$F{(U5@IYK^sU zJ~&zcJwO~lJD7Q(m5GRPXJGbh2|Ma6_1Vn(jQcdFRBqYcIE=dhgV%OLnsd_n=E{5NS-E`QtUr4^bT;}($TT?L!YD&GE3hC_Cj>z8$HJYj76Fz zQy_e!32}=!OAwyA?FuuSwL{Z?3YXpNm>~%;$tQr=X8EpzFii2+ryaET=VIcZNB>+* z9JKgnx@TCGJK6fr==RJ&2Ez!z32|-B&qf*t-ue?eXu0m^`;W7g>JnTshuY>(#~gRQ z9`_XDfw}7-Fz+z5qGyXwap`8R4^kPgsPdfFPhzGi-X7z}DMb=N7#fUpDRROr&9eE^ z_Aw(weAQKvML6{rE)nn~+FC5x5Dz%QY$uj3#;jF_$#NP{4?!7M7n? z{|~|*nZt#}rga}_p7>7mhV$rs!<-TpDl9DsRyqbKUB0Ku4g=7X;pt}guIpvVbNbqr ztSgNV)Zp46Rvk<%_P)e)X>9?RbRd9|J3i>`+rBt^arN|h^iZ&8!3+f&0;G9DgRqb2 z-Pdi?o6I+S6MX|5NFIBTC?X;%{PMJW*8nWFO+Z-ywRS-OGfuK@)m*P=K4^8YnJ1=! zlc3Adslx3CSO6SZAaBqI%mr7Q=%z0O8?Mh3kU}l^s8{R@S94@A)YKPC3d{fBxbm>(fJh74<6tpbn$LmG+~Umg0ojc>{@U3|Dt z55+}>4~0eK566Hm%{hvg4*#uLESR$p$ODmmadkcprfj<$tm116dNmU6H{$IeG)hoG z03*r2)fF%7ieR=+`6A8bR1&YN^8>c2K~xFTjRt08c4-cn4Y5|?SBKls_7HFkxn$(O z6VU%|*uRJ25u@dc?|{g90|V4SPRhWw5Y(j25tj!q4lx&czacbk-)R^)_rnalD<17X z_RC=%G8V*j@yQ6$Vw6c`88#9ZZfz&Iv6z^V-?l)t#Y+f3My?Pt+GPBvnDs~&bqgmT zn04MHO@KZ^)T%y+crcZpjFE3hReHEGKS_?nuvvvX1aqHDxLTlUadI<$1HtjZ4rphd_e*`TU zfaf5TW7H6WFbImNw@$$Su}8;JyO+>Ar@sBNW!}vCc@Wb_?>23}eCKToN3`010K5i8 z>UseRBQz`?jd=-$8Rw&9P?({*SwfSa1ORrsq4*9z#9%Q@`dR?L`t$Z0sj2>tCe!F>xL*tuOulqasew;D7%4-_Bp2|8Vic!_N#}j>0+J(utt% zv-ma*NC*KM%q(-fqST6w9fNV~#~L4f43m!YL1fm$nuO2Ar{E z4z+T-kVoa}#AGo)-K5#lJR#2%D4S!;9YCDUk?z*6S>1Fsi443oB$Z?S^WcTJ$9REr)Tk)sq_z%k&d>%{uobgU&EDK0No^vFZ3YIvh zfV{JSbGCFdQQ77wbCr_gAs#9fRrELHys1M;$g+?EujY>w9b|XLb(&g$y#q?2yv+CS z*P99~dt;gbi$1hV!8xWn!ofuN#3#QQE+-p0Uh zzpt1QB#VL#PL&5SA+XenL;P-VowyJsc=N>d`;VnsY(4?JV+FaPAhcBlOVUBe?HYbN zUWVF6Np-<%vlYE^%DB>*n_kB%KPn&Jk?8g~j!gWEcS@_y4+8(#HE_lh_3w#Snh>>M zb)k+SS&>W05#4?ig(xdp4PEp><8PVf0&lLO;Aj|)DY6<5g7GfKK54-r6bk&VG?$xa zb)+&Jt0j%d%NUTREG&J8SeKaSMI0Dm`@E%GcD+e?+_?aC5JDg&(IX^pELE7&v!oVD z4pwk#Tl^E`h>5HHdkOLX_=hb#>iwrrAJFf>(g9xUJ0S-^F`EI59@f?&UB7xgDZFX+ zD6umWc-g*bJo7Mkbbn};4ok%q&^0N}@{O8V#GGUvR{IF+sYTZoLk}B>Ct8G;qz`_~ z%z86)7tJ+cfRuY0!-5XBip72NI_9^65D5Me5F?%hh6lnTB@&h&S*DAmOO(Zep#wfR zAv*-`HZn0NS21@RsTSdTO;-`#4MAH3Q1j}Y{;#x+?xSm~UQm8W@k|OWbiad5`6VUw z+_0?HO$JUbbZ~ORG@a0~KC{tD6eWwG;{_3QGj9QJmTe!BSa!$!L!?%2HVP-TLxyoO zVItWf7Lz0b)7JeiraTU^XQzZGNTAeRy#SZW{H{Py(MoMix=F@#TwotjpC(N?c++lT zjxqRL#Zb}ZbYkLc`iW9MZX_*11U!BAV zK}Pn!_ptJ!B_i-^%a|WV`Xgoa+4GC1&o8d_zrVV?yn6B7xi%V6JI$~d=)T!g1iLiJ zDau8b?J*i$DhAU<2+dPHK9&|rJFt$83CX*fY=teWnX%ELOIS2w4qAdObG-VF&Kd%i zJTD=5LaXE}PbfGFmpIu-f0fZ3b^G=WfxYO4mA(!*ptcnMBaV&kY!L||eOXY)r14`d zKock-yQ$JQR%SvA%faCCE2U61*rnHOrDD5U8YB?Ih6`v1?IgQYA{fY;R)SSHoPCT5 zdS;fm*_stCzAnXcXpiR;LS~5DPKYS1k5(tCc4#&mZF|HbaU?T!MUDR@Jj{&86s3sO zb^fq@CscFEnuuxp=(4ha03n}VMV)yJKTSU*rej(R0qt8V;v)zSQ;V?!sjnzb>JBu0 zc8GER`@jAZr0G~FO6YGAaSYPzGNQ<%tUNg-*d|50CqgSiOh6Nc;U#T0oxGJh&TcK$ z62Cy3n&L-^_!u>b^!_j*=m}dOm9O9;n!c%YW_}}zpv)! ztVTegdLo1|A&ad>9bN=xMOz%kxIQ$KOiEADxEy2{zVr4StbP~FK7{mJd=0MMHQH(@ z{wX!-X0cXy#F8t3bTMpbZl_tFXdN_%^&_BcXKO-G^==WmOZsfg1Ed7n2qUv!!)FoI zNG*VCq!EliVV5OXD_|aXVxcc-cm)sGu9&q5bV!c$7zXz3JB?!o28hYHX7$2!s~gG| z(Jea-&YB%lmz%1N*fEC#VU>xxlej|2zh$Wvw1_`p;7g_yZePxpTX;f$Br|7q7ePo* z(c`KXUP5gZ0F7P|9t95&x)LXWL&WzAOd02TDhl%TF%k7H#V6Oa>keD9aj*8D{FEd3+Q&BP({{a8X)Y%W}+ zwvW7w8*94>(g97I{eQGIYopyMZQTtjRiI=r522qmMxB?ld$-TNj&X8Izs8gnbOSq< zoPhOy!mt?(~veCOIjG9?F$DzG2xkjuqphwM@+g_Fg7;FZnav*bVF3WEOB%q zJ!}KMbCb@Tn0RHMVL8ASH7+1nZ6H`f)(xW8Nmj?PM^u&e1sThXGO6DPxGgSX;g2{} zGIXFn3v-$~VctZavjIu7gMpB02`8OD9HZp`hDALUJ1L%+zI%6HXU(Zf{PB8F!DHyr zCB!=7>^OTSx`{&}`o3y6H4syAHRN0iojWAI1ox}`2QTdhf!!%NmOyt%P_g4WwG_E5 z**$*pxbyJys&r$C=j!bF)#I}lH~OB;rS2e&(9uT{==ik`+vQ>Jb(On|K6{Sh(Q-RKx4`+Yvz)+PEA|QmTsf} z-8T%V@*bl^)`PNvcwhy_vuRAIxgJ}W+)(kzR5FbQP$_S8=k;N08Lh}m2&P%Dr98T1 z?Ll=%E6UoIbC>~YdH$fClR(l4eJFIt>EG1nw19drALn9StTGZ#4f9}{4x&ERnZ(#m z^=d07dF;UtIXjt0rYn1&LH;P={htk!-yB!8?q$no|=8>;FB$ok4YSV6&sWE{}AGdh0YRO8?~yYl*TC9 z-f$yK|yyT z#l|AUG(j~@LQS9L>SM3_dWv6_eXy+eUS}pb7;Et5im4i`Y64KloA+^(rJwv9fSBpj zYI&+TKsKAnN?W_$ttPE0l3N?k8pT%mlg$e^bhl(3Eu-IqFOcxUNppMF{~{>hCk`Ck z`<$bsW5*X{HRR z!yx?gDiyn9M5K|B3huNuK?_yP{2TzA=&H15IikFzkDX;tXKGRa-M541Q6){@_vy|U z8iWq+B(#EJl0&_J{{6+{#~0%l*2b??&)ZTGj&X#Zmy&fg<1vkhhQnYT6c>@cnNq>n zp%}0oQ{Ul7FkutPOlMEVLnt~byaU3l=xiii8yd=zB{?DKX__(2i`g=>Z1PCNz6Bi$ z_5xx`&AUmVb1aqdbIp31@JsEYA!LY*`;iT^Qdo;;Yr<)SSYwC@!g!4!PFcCkdQqUo zg+-v^77{uGf5A`LVtO>9Ujpq*O=zH~r~6B5<{i_^tsH-0jU?<@6%HsC~;C zYciNduvzjz*gBchJROnSS@B{aAiW0bWt-wnS(RXOiY85yY3Qg~Y^PCN51C*;db?hK#f<;Z+vEO|yq`bX%cEl_r#oCyEUPD0@`4g`5KQJxTjrD5)P7q4_t2#7 z#-f%K@x(tx`yc10s;J->J0n&vT0aH20Ui>(vIrEOrBvP8^xnPI%niE-;?uuG+8Uf#OB4*aCfp2^OJvkeDUJ)_QUh5=NISq&$H5i z{3h)#M^WI`*$bBFp#ah!BL;AiM^VI@{MiJBWp(qZ6Vwj4$sRSEkyUI|Er_k5EtA|_ zmfq`g{H9ekEsoCET)`7ORa;A+VQBSr?4KE*9?ckQrX$b+3Fa)|RHYg6uW5;CzIF}U zBTV0VJ(e3fwMS;h^N|7k#9Cw&KL&1c%qZ3QLJ>8lX^)Cir%#%VUY-jZ&6-HUpLKZu z3pj-^#_S`QRnQcikG(hHT=Xw+F}|UTeLTK17DdG{Uzl_BU38 zoR_+G3oVZabfcm8n1$7AHB8GfN1S^@RxFpty^%);K$^jCO;`Mu2VfqaJ-L5&aY=b( zt%OMOj8wXGR!)cP-+S#s5G*7OL~|F9aI|b7h^TcMs%t7bWK<;9AG`es4N~-<^~;u? z{ieq~W;upW!w=h1N7puX4C*UKwVH&=q!X!Z-`Y8(Lfko!?qkeiU&9+$5t{|!OI`nX;@4C@Jp;OPkM?+dj3R@(AeR7Vkg?CDlM#h!>Snh zANl7VK&IXP{jv`81ewnpVI`glqILiIi>s@rw;!KhTwcBS;o-#(hlP)W9;b*o<6i+8 zQ~GV=78Q5jM3_ElgHq5Q1yBFnfO2YD4#j)NT|QTUXQp}Au@{|xdc*p*HpSJFhr@_F z&YQUu(F7WjAhC?(*yXxy>ekG8EO_S_QN`#Yo#G80E(hJH$t>#O(>!Q9{8Jl;*BT)o zYscZZ)&rWP=AF?Ddv%KxskNZRNSebA^AiUy@BOSplc#c0@r2h6Ug7`hj}A=(kXXwd zlTbV*!t4py-F3{GG`j<(8_#sepX0cpMLa)GatH_)8ssZKnr^>3#% zVM7^)))=FCV_?$0MC;qT+PU)?F`FYWz#mA_cksgLB4nkU%nmGOCuYgR^bY(~`D z|36pk^|93o13sCMND9&hZpkuM+Ye@@tA#{*ay;{XG&utMlVR6iCY_Jgb_s5`6twd- zUN)hN^>GKam6FA9NIq7>((4Yh+HmMgWOqMC0ssO!EtzErK&{%D77uP~ZR*T-i zAF^0N$?S<|ljsnU7I;g(Wl0#Uf=v64EM5)AC*Z6EL802l8LHgGtU*1wk*xiN!l#?x zEzNB_ulFdJ(!ZDdudDqm^YvE0HuE&*4U2E~I-2{2vBj(H=N&{WnhPvOHxmP{@_5UX W7wftD4FCZD{{sN9$Shmv0{{ToOz1@b diff --git a/main.py b/main.py index faa9c8e..0cdd6ca 100644 --- a/main.py +++ b/main.py @@ -4,6 +4,7 @@ import time import init import corporization import preprocessing +import topicModeling from miscellaneous import * @@ -19,5 +20,10 @@ printlog("") preprocessing.main() printlog("") +topicModeling.main() +printlog("") + end = time.time() printlog("Total Time Elapsed: {0} min".format((end - start) / 60)) + + diff --git a/miscellaneous.py b/miscellaneous.py index debe414..d1a3fa6 100644 --- a/miscellaneous.py +++ b/miscellaneous.py @@ -131,8 +131,8 @@ def printRandomDoc(textacyCorpus): else: printlog("len(textacyCorpus) = %i" % len(textacyCorpus)) randIndex = int((len(textacyCorpus) - 1) * random.random()) - printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text, - textacyCorpus[randIndex].metadata)) + printlog("Index: {0} \n Text: {1} \n Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text, + textacyCorpus[randIndex].metadata['categoryName'])) print() diff --git a/preprocessing.py b/preprocessing.py index c7f0d65..26e755e 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -24,18 +24,30 @@ with open(config_ini) as f: config.read_file(f) +global REGEX_SPECIALCHAR +global REGEX_TOPLVL - -REGEX_SPECIALCHAR = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|?]' +REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|]' REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?' + + +global THESAURUS +global WORDS +global LEMMAS +global NOUNS +global VORNAMEN +global DE_STOP_WORDS +global EN_STOP_WORDS + THESAURUS = {} -WORDS = {} -LEMMAS = {} -NOUNS = [] -VORNAMEN= [] -de_stop_words=[] +WORDS= {} +LEMMAS= {} +NOUNS= {} +VORNAMEN= {} +DE_STOP_WORDS= {} +EN_STOP_WORDS= {} ############# filter tokens @@ -210,6 +222,10 @@ def stringcleaning(stringstream): yield string + + + + def filterTokens(tokens, funclist): # in:tokenlist, funclist # out: tokenlist @@ -218,9 +234,75 @@ def filterTokens(tokens, funclist): return tokens +def processContentstream2(textstream, parser, token_filterlist=None): + #pre parse + textstream = preparse(textstream) + pipe = parser.pipe(textstream) + for doc in pipe: + + tokens = [tok for tok in doc] + + # in parse + if token_filterlist is not None: + tokens = filterTokens(tokens, token_filterlist) + + # post parse + tokens = [postparse(tok) for tok in tokens] #todo informationsverlust! + + yield " ".join(tokens) + +def preparse(stringstream): + + for string in stringstream: + # fixUnicode + string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC') + + # seperate_words_on_regex: + string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string)) + + #normalize whitespace + string = textacy.preprocess.normalize_whitespace(string) + + # replaceRockDots + string = re.sub(r'[ß]', "ss", string) + string = re.sub(r'[ö]', "oe", string) + string = re.sub(r'[ü]', "ue", string) + string = re.sub(r'[ä]', "ae", string) + + # cut_after + # todo addressen enfernen --> postal.parser idee zu metadaten hinzufügen + words = ["gruss", "grusse","gruesse","gruessen","grusses"] + + for gr in words: + if gr in string: + string = string.rpartition(gr)[0] + break + + yield string + +def postparse(toktext): + """ + :param toktext: spacy.token + :return: string + """ + toktext = toktext.lower_ + + # remove_words_containing_topLVL + toktext = toktext if not re.search(REGEX_TOPLVL, toktext) else "" + + # lemmatize + toktext = lemmatizeWord(toktext) + + # synonyme normalisieren + toktext = getFirstSynonym(toktext) + + # autocorrect + toktext = autocorrectWord(toktext) + + return toktext def corpus2Text(corpus): for doc in corpus: @@ -303,52 +385,16 @@ path2nouns_list = FILEPATH + config.get("nouns","pickle_file") path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file") -path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file") - +path2DEstopwordlist = FILEPATH + config.get("de_stopwords", "pickle_file") +path2ENstopwordlist = FILEPATH + config.get("en_stopwords", "pickle_file") corpus_de_path = FILEPATH + config.get("de_corpus", "path") corpus_en_path = FILEPATH + config.get("en_corpus", "path") -custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus", - "hallo", "gerne", "freundlich", "fragen", "fehler", "bitten", "ehre", "lieb", "helfen", - "versuchen", "unbestimmt", "woche", "tadelos", "klappen", "mittlerweile", "bekommen", - "erreichbar", "gruss", "auffahren", "vorgang", "hinweis", "institut", "universitaet", - "name", "gruss", "id", "erfolg", "mail","folge", - "nummer", "team", "fakultaet", "email", "absender", "tu", "versenden", "vorname", "message", - "service", "strasse", "prozess", "portal", "raum", "personal", "moeglichkeit", "fremd", "wende", - "rueckfrage", "stehen", "verfuegung", - "funktionieren", "kollege", "pruefen", "hoffen" - ] -filter_tokens = [ - # removeENT(["PERSON"]), - - keepNouns(), - - remove_words_containing_Numbers(), - - removePOS(["PUNCT", "SPACE", "NUM"]), - - #removeWords(de_stop_words + custom_words), - removeWords(de_stop_words), - - remove_long_words(), - remove_short_words(), - remove_first_names() - - -] -#todo filtertokens haut alle raus -filter_tokens = None - -clean_in_meta = { - "Solution": [removePOS(["SPACE"])], - "Subject": [removePOS(["SPACE", "PUNCT"])], - "categoryName": [removePOS(["SPACE", "PUNCT"])] -} def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10): @@ -365,7 +411,7 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print ## process and add files to textacy-corpi, corpus.add_texts( - processContentstream(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser), + processContentstream2(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser), processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser) ) @@ -392,14 +438,39 @@ def main(): THESAURUS = load_obj(path2thesaurus_dict) WORDS = load_obj(path2wordsdict) LEMMAS = load_obj(path2lemmadict) - DE_STOP_WORDS = load_obj(path2stopwordlist) + DE_STOP_WORDS = load_obj(path2DEstopwordlist) + EN_STOP_WORDS = load_obj(path2ENstopwordlist) NOUNS = load_obj(path2nouns_list) VORNAMEN = load_obj(path2firstnameslist) + filter_tokens = [ + # removeENT(["PERSON"]), + + keepNouns(NOUNS), + + remove_words_containing_Numbers(), + + removePOS(["PUNCT", "SPACE", "NUM"]), + + # removeWords(de_stop_words + custom_words), + removeWords(DE_STOP_WORDS), + + remove_long_words(), + remove_short_words(), + remove_first_names() + + ] + + + clean_in_meta = { + "Solution": [removePOS(["SPACE"])], + "Subject": [removePOS(["SPACE", "PUNCT"])], + "categoryName": [removePOS(["SPACE", "PUNCT"])] + } + preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" ) - preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" ) - + #preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" ) end = time.time() printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60)) diff --git a/testra.py b/testra.py index 843d548..d1fc357 100644 --- a/testra.py +++ b/testra.py @@ -8,6 +8,8 @@ import json #import textacy from functools import reduce +import textacy + start = time.time() import enchant @@ -54,8 +56,12 @@ corpi.add_texts( print(corpi) """ +jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/" - +LLDA_filepath = "{0}labeldict.txt".format(jgibbsLLDA_root) +laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_hilfe': 18, 'sap': 7, 'pos': 23, 'verwaltung': 4, 'lan': 1} +with open(LLDA_filepath, 'w') as file: + file.write(json.dumps(laveldict)) """ def load_corpus(corpus_path, corpus_name, lang="de"): from pathlib import Path @@ -85,20 +91,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"): textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata)) return corpus """ -import os -a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt" -b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt" -d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt" - -c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt" - - - - -scriptpath = os.path.dirname(os.path.realpath(__file__)) - - - """ diff --git a/topicModeling.py b/topicModeling.py index 75dbe12..cd35eac 100644 --- a/topicModeling.py +++ b/topicModeling.py @@ -1,82 +1,39 @@ # -*- coding: utf-8 -*- - -from datetime import datetime - -print(datetime.now()) - -import time - -import enchant - -start = time.time() from datetime import datetime import time -import logging -from stop_words import get_stop_words -#import words as words -from nltk.corpus import stopwords as nltk_stopwords -from collections import Counter import csv -import re -import xml.etree.ElementTree as ET -import spacy -import textacy -from scipy import * import sys -csv.field_size_limit(sys.maxsize) -import pickle -import configparser as ConfigParser -from miscellaneous import * - - - -import time - - - - -from datetime import datetime -import logging -from nltk.corpus import stopwords -import csv -import functools -import re -import xml.etree.ElementTree as ET -import spacy -import textacy -from scipy import * -import sys -csv.field_size_limit(sys.maxsize) - -import logging - -import csv -import functools +import json import os.path -import re import subprocess -import time -import xml.etree.ElementTree as ET -import sys -import spacy +from textacy import Vectorizer + +from miscellaneous import * import textacy from scipy import * -from textacy import Vectorizer -import warnings -import configparser as ConfigParser -import sys -import hunspell -from postal.parser import parse_address + +import os csv.field_size_limit(sys.maxsize) +FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" + +# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &" + + +# load config +config_ini = FILEPATH + "config.ini" + +config = ConfigParser.ConfigParser() +with open(config_ini) as f: + config.read_file(f) -def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True): +def printvecotorization(de_corpus,ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True): printlog(str("ngrams: {0}".format(ngrams))) printlog(str("min_df: {0}".format(min_df))) printlog(str("max_df: {0}".format(max_df))) @@ -94,47 +51,7 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en printlog("doc_term_matrix: {0}".format(doc_term_matrix)) printlog("id2term: {0}".format(id2term)) -corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/" -corpus_name = "de_corpus" - -# load corpi -de_corpus = load_corpus(corpus_name=corpus_name,corpus_path=corpus_path) - - - -# todo gescheites tf(-idf) maß finden -ngrams = 1 -min_df = 1 -max_df = 1.0 -weighting = 'tf' -# weighting ='tfidf' -named_entities = False - -""" -printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting) -printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting) -printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting) - -printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting) -printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting) -printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting) -""" - - -# build citionary of ticketcategories -labelist = [] - -for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): - labelist.append(texdoc.metadata["categoryName"]) - -LABELDICT = {k: v for v, k in enumerate(labelist)} - -printlog(str("LABELDICT: {0}".format(LABELDICT))) - - - -def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(LABELDICT), named_entities=False, - corpus=de_corpus): +def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda',named_entities=False): printlog( "############################################ Topic Modeling {0} #############################################".format( topicModel)) @@ -198,132 +115,174 @@ def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len( printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel)) -# no_below = 20 -# no_above = 0.5 +def jgibbsLLDA(de_corpus, top_topic_words): + ##################### LLDA Topic Modeling via JGibbsLabledLDA ############################################## + + start = time.time() + + def label2ID(label, labeldict): + return labeldict.get(label, len(labeldict)) + + def generate_labled_lines(textacyCorpus,labeldict): + for doc in textacyCorpus: + # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi + yield "[" + str(label2ID(doc.metadata["categoryName"],labeldict)) + "] " + doc.text + + # build citionary of ticketcategories + labelist = [] + + for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): + labelist.append(texdoc.metadata["categoryName"]) + + labeldict = {k: v for v, k in enumerate(labelist)} + + n_topics = len(labeldict) + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic + + jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/" + + LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) + dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root) -# n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic - - - -""" -topicModeling(ngrams = 1, - min_df = 1, - max_df = 1.0, - topicModel = 'lda', - n_topics = len(LABELDICT), - corpi=de_corpus) - -topicModeling(ngrams = 1, - min_df = 0.1, - max_df = 0.6, - topicModel = 'lda', - n_topics = len(LABELDICT), - corpi=de_corpus) - -topicModeling(ngrams = (1,2), - min_df = 1, - max_df = 1.0, - topicModel = 'lda', - n_topics = len(LABELDICT), - corpi=de_corpus) - -topicModeling(ngrams = (1,2), - min_df = 0.1, - max_df = 0.6, - topicModel = 'lda', - n_topics = len(LABELDICT), - corpi=de_corpus) - -topicModeling(ngrams = (1,2), - min_df = 0.2, - max_df = 0.8, - topicModel = 'lda', - n_topics = 20, - corpi=de_corpus) - - - - - - - -""" - -##################### LLDA Topic Modeling via JGibbsLabledLDA ############################################## - - -top_topic_words = 15 - -print("\n\n") -start = time.time() - -n_topics = len(LABELDICT) # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic - -# build citionary of ticketcategories -labelist = [] - -for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): - labelist.append(texdoc.metadata["categoryName"]) - -LABELDICT = {k: v for v, k in enumerate(labelist)} -print(LABELDICT) - - -def label2ID(label, labeldict=LABELDICT): - return labeldict.get(label, len(labeldict)) - - -def generate_labled_lines(textacyCorpus): - for doc in textacyCorpus: - # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi - yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text - - -jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/" -LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) - -# create file -textacy.fileio.write_file_lines(generate_labled_lines(de_corpus), filepath=LLDA_filepath) - -# todfo ticket drucken -# wait for file to exist -while not os.path.exists(LLDA_filepath): - time.sleep(1) - -print("\n\n") -printlog("start LLDA:") -# run JGibsslda file -FNULL = open(os.devnull, 'w') # supress output -subprocess.call(["java", - "-cp", - "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root), - "jgibblda.LDA", - "-est", - "-dir", "{0}models/tickets".format(jgibbsLLDA_root), - "-dfile", "tickets.gz", - "-twords", str(top_topic_words), - "-ntopics", str(n_topics)], stdout=FNULL) - -# ANMERKUNG: Dateien sind versteckt. zu finden in models/ - -# twords -subprocess.call(["gzip", - "-dc", - "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]) -##################################################################################################################### -print() -print() - -end = time.time() -printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60)) + #printlog(str("LABELDICT: {0}".format(labeldict))) + printlog(str("LABELDICT-length: {0}".format(len(labeldict)))) + with open(dict_path, 'w') as file: + file.write(json.dumps(labeldict)) + + #for line in generate_labled_lines(de_corpus,labeldict): + # print(line) + + # create file + textacy.fileio.write_file_lines(generate_labled_lines(de_corpus,labeldict), filepath=LLDA_filepath) + + # wait for file to exist + while not os.path.exists(LLDA_filepath): + time.sleep(1) + """ + printlog("") + printlog("start LLDA:") + # run JGibsslda file + FNULL = open(os.devnull, 'w') # supress output + subprocess.call(["java", + "-cp", + "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format( + jgibbsLLDA_root), + "jgibblda.LDA", + "-est", + "-dir", "{0}models/tickets".format(jgibbsLLDA_root), + "-dfile", "tickets.gz", + "-twords", str(top_topic_words), + "-ntopics", str(n_topics)], stdout=FNULL) + + # ANMERKUNG: Dateien sind versteckt. zu finden in models/ + + # twords + subprocess.call(["gzip", + "-dc", + "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]) + ##################################################################################################################### + printlog("") + """ + end = time.time() + printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60)) +def main(): + printlog("Topic Modeling: {0}".format(datetime.now())) + corpus_de_path = FILEPATH + config.get("de_corpus", "path") + corpus_en_path = FILEPATH + config.get("en_corpus", "path") + preCorpus_name = "de" + "_pre_ticket" + + #load raw corpus and create new one + de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path) + printlog("Corpus loaded: {0}".format(de_corpus.lang)) + + #idee http://bigartm.org/ + #idee http://wiki.languagetool.org/tips-and-tricks + + # todo gescheites tf(-idf) maß finden + ngrams = 1 + min_df = 1 + max_df = 1.0 + weighting = 'tf' + # weighting ='tfidf' + named_entities = False + + + """ + printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting) + printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting) + printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting) + + printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting) + printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting) + printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting) + """ + + + jgibbsLLDA(de_corpus,15) + + # no_below = 20 + # no_above = 0.5 + + + # n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic + + + + """ + topicModeling(ngrams = 1, + min_df = 1, + max_df = 1.0, + topicModel = 'lda', + n_topics = len(LABELDICT), + corpi=de_corpus) + + topicModeling(ngrams = 1, + min_df = 0.1, + max_df = 0.6, + topicModel = 'lda', + n_topics = len(LABELDICT), + corpi=de_corpus) + + topicModeling(ngrams = (1,2), + min_df = 1, + max_df = 1.0, + topicModel = 'lda', + n_topics = len(LABELDICT), + corpi=de_corpus) + + topicModeling(ngrams = (1,2), + min_df = 0.1, + max_df = 0.6, + topicModel = 'lda', + n_topics = len(LABELDICT), + corpi=de_corpus) + + topicModeling(ngrams = (1,2), + min_df = 0.2, + max_df = 0.8, + topicModel = 'lda', + n_topics = 20, + corpi=de_corpus) + + + + + + + + """ + + + +if __name__ == "__main__": + main()