From 3137dc6e541e90646bd27c4591064da1d21908c9 Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Wed, 25 Oct 2017 09:46:44 +0200 Subject: [PATCH] =?UTF-8?q?topicmodeling=20jgibbsllda=20lauff=C3=A4hig?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cleaning.py | 242 ++++++++++++++++++++++ cleaning_bsp.txt | 24 +++ config.ini | 6 +- corporization.py | 7 +- init.py | 16 +- java_LabledLDA/models/tickets/.others.gz | Bin 89 -> 79 bytes java_LabledLDA/models/tickets/.tassign.gz | Bin 4151 -> 98 bytes java_LabledLDA/models/tickets/.theta.gz | Bin 8068 -> 98 bytes java_LabledLDA/models/tickets/.twords.gz | Bin 19902 -> 351 bytes java_LabledLDA/models/tickets/.wordmap.gz | Bin 4376 -> 181 bytes java_LabledLDA/models/tickets/tickets.gz | Bin 6782 -> 227 bytes main.py | 20 +- miscellaneous.py | 150 +++++++++++++- nomen.txt | 2 + preprocessing.py | 57 +++-- testra.py | 81 +++++++- topicModeling.py | 227 +++++++++++++------- 17 files changed, 694 insertions(+), 138 deletions(-) create mode 100644 cleaning.py create mode 100644 cleaning_bsp.txt diff --git a/cleaning.py b/cleaning.py new file mode 100644 index 0000000..55fd812 --- /dev/null +++ b/cleaning.py @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- + +from datetime import datetime +import csv +import sys +from miscellaneous import * +from datetime import datetime +import time +import textacy +from scipy import * + +import os + +csv.field_size_limit(sys.maxsize) +FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" + + + +# load config +config_ini = FILEPATH + "config.ini" + +config = ConfigParser.ConfigParser() +with open(config_ini) as f: + config.read_file(f) + + +global REGEX_SPECIALCHAR + +global WORDS + + +REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|]' #+r',.' + +WORDS= {} + + +########################## Spellchecking ########################################## +# http://norvig.com/spell-correct.html +# http://wortschatz.uni-leipzig.de/en/download + +import re + + +def words(text): return re.findall(r'\w+', text.lower()) + +def P(word, N=sum(WORDS.values())): + "Probability of `word`." + return WORDS[word] / N + + +def correction(word): + "Most probable spelling correction for word." + return max(candidates(word), key=P) + + +def candidates(word): + "Generate possible spelling corrections for word." + return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word]) + + +def known(words): + "The subset of `words` that appear in the dictionary of WORDS." + return set(w for w in words if w in WORDS) + + +def edits1(word): + "All edits that are one edit away from `word`." + letters = 'abcdefghijklmnopqrstuvwxyz' + splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] + deletes = [L + R[1:] for L, R in splits if R] + transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] + replaces = [L + c + R[1:] for L, R in splits if R for c in letters] + inserts = [L + c + R for L, R in splits for c in letters] + return set(deletes + transposes + replaces + inserts) + + +def edits2(word): + "All edits that are two edits away from `word`." + return (e2 for e1 in edits1(word) for e2 in edits1(e1)) + + +def autocorrectWord(word): + try: + return correction(word) + except: + return word + + +############# stringcleaning + + + + + +def clean(stringstream,autocorrect=False): + + for string in stringstream: + # fixUnicode + string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC') + + # seperate_words_on_regex: + string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string)) #frage ,.?! + + #normalize whitespace + string = textacy.preprocess.normalize_whitespace(string) + + #remove linebreaks + string = re.sub(r'[\n]', " ", string) + + # replaceRockDots + string = re.sub(r'[ß]', "ss", string) + string = re.sub(r'[ö]', "oe", string) + string = re.sub(r'[ü]', "ue", string) + string = re.sub(r'[ä]', "ae", string) + + # frage autocorrect? + #idee http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf + if autocorrect: + string = " ".join([autocorrectWord(word) for word in string.split()]) + + yield string + + + +def processDictstream(dictstream, funcdict, parser): + """ + + :param dictstream: dict-gen + :param funcdict: + clean_in_meta = { + "Solution":funclist, + ... + } + + :param parser: spacy-parser + :return: dict-gen + """ + for dic in dictstream: + result = {} + for key, value in dic.items(): + + if key in funcdict: + + doc = parser(value) + tokens = [tok for tok in doc] + funclist = funcdict[key] + + tokens = filterTokens(tokens, funclist) + + result[key] = " ".join([tok.lower_ for tok in tokens]) + + + else: + result[key] = value + yield result + +def filterTokens(tokens, funclist): + # in:tokenlist, funclist + # out: tokenlist + for f in funclist: + tokens = list(filter(f, tokens)) + + return tokens + +def removePOS(pos_list): + return lambda tok: tok.pos_ not in pos_list + +################################################################################################## + + +path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file") + +corpus_de_path = FILEPATH + config.get("de_corpus", "path") + +corpus_en_path = FILEPATH + config.get("en_corpus", "path") + + + + +def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10): + + logprint("Clean {0}_corpus at {1}".format(lang, datetime.now())) + + rawCorpus_name = lang + "_raw_ticket" + cleanCorpus_name = lang + "_clean_ticket" + + #load raw corpus and create new one + raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path) + + clean_corpus = textacy.Corpus(parser) + + + ## process and add files to textacy-corpi, + clean_corpus.add_texts( + clean(corpus2Text(raw_corpus)), + processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser) + ) + + + # leere docs aus corpi kicken + clean_corpus.remove(lambda doc: len(doc) == 0) + + + for i in range(printrandom): + printRandomDoc(clean_corpus) + + + + #save corpus + save_corpus(corpus=clean_corpus, corpus_path=corpus_path, corpus_name=cleanCorpus_name) + + + + return clean_corpus + + + +def main(): + start = time.time() + + WORDS = load_obj(path2wordsdict) + + clean_in_content = [] #frage notwendig? + + + clean_in_meta = { + "Solution": [removePOS(["SPACE"])], + "Subject": [removePOS(["SPACE", "PUNCT"])], + "categoryName": [removePOS(["SPACE", "PUNCT"])] + } + + corpus = cleanCorpus(corpus_de_path, clean_in_content, clean_in_meta, "de",printrandom=5 ) + + end = time.time() + logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60)) + +if __name__ == "__main__": + main() + + + + diff --git a/cleaning_bsp.txt b/cleaning_bsp.txt new file mode 100644 index 0000000..2edcd64 --- /dev/null +++ b/cleaning_bsp.txt @@ -0,0 +1,24 @@ +Index: 0 + Text: lieber support, ich habe gerade versucht mich mit meiner unicard im firefox browser fuer das service portal zu authentifizieren. das hat vor einigen wochen noch tadelos geklappt und mittlerweile bekomme ich folgende fehlermeldung ich hoffe sie koennen mir weiterhelfen. vielen dank und viele gruesse sascha feldhorst dipl. inform. sascha feldhorst wiss. ang. technische universitaet dortmund maschinenbau lehrstuhl fuer foerder und lagerwesen logistikcampus joseph von fraunhofer str. 2 4 d 44227 dortmund tel. 49 231 755 40 73 fax 49 231 755 47 68 mailto sascha.feldhorst@tu dortmund.de sascha.feldhorst@tu dortmund.de http www.flw.mb.tu dortmund.de www.flw.mb.tu dortmund.de wichtiger hinweis die information in dieser e mail ist vertraulich. sie ist ausschliesslich fuer den adressaten bestimmt. sollten sie nicht der fuer diese e mail bestimmte adressat sein, unterrichten sie bitte den absender und vernichten sie diese mail. vielen dank. unbeschadet der korrespondenz per e mail, sind unsere erklaerungen ausschliesslich final rechtsverbindlich, wenn sie in herkoemmlicher schriftform mit eigenhaendiger unterschrift oder durch uebermittlung eines solchen schriftstuecks per telefax erfolgen. important note the information included in this e mail is confidential. it is solely intended for the recipient. if you are not the intended recipient of this e mail please contact the sender and delete this message. thank you. without prejudice of e mail correspondence, our statements are only legally binding when they are made in the conventional written form with personal signature or when such documents are sent by fax. + categoryName: betrieb + +Index: 0 + Text: support browser service portal mittlerweile + categoryName: betrieb + + + + + + + + + + + Index: 1 + Text: telefon umzug antragsteller astrid gramm astrid.gramm@tu dortmund.de terminvorschlag 14.08.2015 einrichtung dezernat 2 abteilung 2.5 psp element uniaccount mnichofm hofmann, nicole gebaeude dezernat 5 raum id 201 651430 telefondose neztwerkdose dt04.5 04.6 telefonnr. 4821 eintrag telefonbuch e mail astrid.gramm@tu dortmund.de voicemail ansprechpartner astrid gramm tel. ansprechpartner 5444 verantwortlicher nutzer type bemerkung frau hofmann wird am 14.08.2015 in die wd 2 umziehen. es ist der raum 201a im og nicht 201 eine bezeichnung der telefondose ist nicht vorhanden. + categoryName: elektronisches telefonbuch + + Index: 1 + Text: telefon umzug antragsteller gramm einrichtung dezernat abteilung element gebaeude dezernat raum id eintrag telefonbuch mail ansprechpartner gramm ansprechpartner verantwortlicher nutzer type bemerkung raum bezeichnung + categoryName: elektronisches telefonbuch \ No newline at end of file diff --git a/config.ini b/config.ini index 5e99a06..39dc5d6 100644 --- a/config.ini +++ b/config.ini @@ -41,10 +41,8 @@ filename=topicModelTickets.log [de_corpus] -#input=M42-Export/Tickets_med.csv -#input=M42-Export/Tickets_small.csv -#input=M42-Export/Tickets_mini.csv -input=M42-Export/de_tickets.csv +input=M42-Export/Tickets_small.csv +#input=M42-Export/de_tickets.csv path=corpi/ diff --git a/corporization.py b/corporization.py index 64e4c47..9b7c837 100644 --- a/corporization.py +++ b/corporization.py @@ -97,6 +97,7 @@ metaliste = [ content_collumn_name = config.get("tickets","content_collumn_name") metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(","))) + path2de_csv = FILEPATH + config.get("de_corpus","input") corpus_de_path = FILEPATH + config.get("de_corpus", "path") @@ -114,7 +115,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la path_csv_split = path2_csv.split("/") filename = path_csv_split[len(path_csv_split) - 1] - printlog("Corporization of {0} at {1}".format(filename,datetime.now())) + logprint("Corporization of {0} at {1}".format(filename, datetime.now())) raw_corpus = textacy.Corpus(lang) @@ -139,7 +140,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la # save corpus raw_name = lang + "_raw_ticket" save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name) - printlog("Done") + logprint("Done") def main(): @@ -152,7 +153,7 @@ def main(): end = time.time() - printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60)) + logprint("Time Elapsed Corporization:{0} min".format((end - start) / 60)) if __name__ == "__main__": diff --git a/init.py b/init.py index 71c28b2..4a23069 100644 --- a/init.py +++ b/init.py @@ -272,47 +272,47 @@ path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file") def main(): start = time.time() - printlog("Init: {0}".format(datetime.now())) + logprint("Init: {0}".format(datetime.now())) - printlog("create and save lemma_dict") + logprint("create and save lemma_dict") lemma_dict = create_lemma_dict(path2lemma_file) save_obj(lemma_dict, path2lemmadict) - printlog("Build and save Wordlist for Spellchecking") + logprint("Build and save Wordlist for Spellchecking") words = build_words_for_spellchecking(path2words_file) save_obj(words, path2wordlist) - printlog("Build and save Thesaurus") + logprint("Build and save Thesaurus") thesaurus = build_thesaurus_dict(path2wordnet) save_obj(thesaurus, path2thesaurus_dict) - printlog("Build and save stoppwortliste") + logprint("Build and save stoppwortliste") de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3) save_obj(de_stop_words, path2stopwordlist_de) save_obj(en_stop_words, path2stopwordlist_en) - printlog("Build and save nomenliste") + logprint("Build and save nomenliste") nouns = list_from_files(nouns1,nouns2) save_obj(nouns, path2nouns_list) - printlog("Build and save firstnameslist") + logprint("Build and save firstnameslist") vornamen = list_from_files(firstnames_txt) save_obj(vornamen, path2firstnameslist) end = time.time() - printlog("Time Elapsed Initialization:{0} min".format((end - start) / 60)) + logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60)) diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz index 48c3c1b0bebe2f656322266a3437a2e3e02d9404..d52733eb0b4f24b7e48c4228113848c4511efc40 100644 GIT binary patch literal 79 zcmb2|=3syT?=zYwJhqscTrz&@<)!Pn#q@%)!IM+kC-qPG=xsB7c*^@E`=w(sl}~@V gY98I@HZ8_-rs_)9qubip*w|tid`@lOU<)({0IFCYZU6uP literal 89 zcmb2|=3syT?=zYwJhqr#G2UcgWMJAd*)7I%rE2Fkwko!^m`c^3tTRs@-F9qROr_>e phqi@FHN)d7>jJ$tu5IlU4EIa(%n}T^a^OG%H$$}9;c5Or69J;VA>RN1 diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz index f815b2d1901c638c278ac88335ebb97da5c23ea6..92186a2995ca63b19db6d651579e41f9c1094198 100644 GIT binary patch literal 98 zcmb2|=3syT+un=34GJPh{>^L>+~Le}v)*96+usc5PnQ>DZ>`5?&GOWMEjCG&nh2Yj>a&ml9DS4`gcn)M+K1bMOY z;+KK&mxD}@6=dBY1q9Uwsx1=KU8uHDZK2viwS{WszDWydC2gcnH_C0KPxIyW-9owN zhmGVvsuNBweI;&;gd1sYv(x;jYMy5kaoXDyech=^ z%ABs8^&{t1j9+5>607WeY~U4%PHLnhXcRcPCMm~Zu=Zf>!Ev}r2S;OcQVWV-{83_^ zg8CMdI&3G0~!8##!51>g%%mJYaBxUxq zok?M$WoDk*nQCB4@=D2qCCN*Y*QPvgaxRv%>u3(8?96> z?SLACjua{))dU^zQQBjrs;;-_*vh&t{SvwQ6oKAr(h8(GgT5hH~>@fBx>wuH)YD(q=!VpvquY$Ih%;VVoNW12OTjWcsj zWx7DAyPYhSEwFlLJGHAF7-{3~V5*+{Vj1koEFYSE+=7xhQ+77|?*7S`d9@FLdfzis z-)ZWim9taEE?!SEK|*b)5KC7>5=BNjNRb02P*Mqm{R#zCXg8Qx04#*3q=r@^6u?t0 z^bTqb|ZT0m)q z;ez4>bih^7ofXL;aKRKT0+c172rA&39JU2K+c^61SrN^KO|D3`u$NL;_F3|v5q#t21F z1x?TeCKv)2Oo5~{O1WAqhrQEax28F1nd$-)41o)#V4W4ZCb(1b)jGFR%bDw4pqcAK zTT`@@DE4LevWz8QX_sJd9Dv}HIgIm{pkBoONkK=3@LtpU&NJ*i{VjYK=e~+^u)tE z9v>j99uaw!O(ykScxL{GfiX8_V=o#bw9cQ5q>R7HXQNC}e$3?ePW!e!t~Q4}GL7Zb zEKd&o6_qMdix8TQPy`hK-zFg1U|xc%g`Oh}fgiPda!di`_nY$JJXaaZiY|nZN>4VR zM=uE#fUo?(Q`T6)30tpUY24^dhUpTde(q9*O=_AWfy^QBqux_Nw)CBMs{CH=Kc;BG zky(Cc$v*UwEJ~5m@jzZbQWp$bZ440udMqBm&CCm9&?9pQTrdR-0G~GC!d^oz6l3RJ z|3HVDJW}-qY=Tqp9sV7m9s2oCg*AsN$F!PYyfR%dkIYFLvU3joD8sVRL-qcBXf^UcGa`de;zjO>OT2I|s9CzB^Nh7MYqF zzMc-Q7^LD-O1t=Z5iBppCSX8tM4O1s%*IQVxLU%+wSt zLg;>$DKox}nF*C;Py~Zx#!RJVSG}Q>9ToayCXDDO6AtTdscUPj>7SUOUV}b@{Duk+O3}E8kt%KX+BiJZgpQf+wLal0(-$Y>d7|PeQXVr(P8+5 z6nkPa8=E8Z%Gst&$s5wzSZ>d=&1_j#CL7yrmNQi~7Tiq#%$X@G{K6qCn<|Ihxqr)V zx#!!5(#>?anJzc;DUwBP<76N5k~6;19JL+ko=uH;)>0$d(wV0Sw|Q#y%#?VkvdhY8 zRrYwED(%|#yUu4NhoZ4E1q%8monoshtLgryi>83VYL$jsSj$!$DeoWZ-Vulz^$QUX zO0eQP>1CC2b|6eHh?sAd!`_$OHob#q78PsM;x?x^StOazTm` zi>YtMPF>hR2GG!}MY=NN$?lrPqH`S0L^<-7^It4VH z0-8M2i#}=B*AH%iVj9z9N6N-@SOW$A2-Xcp;*31&Fge<=ydP_Y3kdK)cObZ zA@HMCvTM97Emi&1ZVpYmsnY@mu3m1AIsn%-Hzx*VxR3!90juNk28je#yJmWnT;LBUXVZau z)>h#+2hGHY9S&$pH~_O5aJ?Mj(b7vhPc2HiKO-jo6WUBBEI(HP(P-_6i1vsmFnR?` zyyzRPlNu>5_$u%9M&k`&o20tZBWGu=j*C57Og<^UmMb#@8kqqIG z=^gpa8(b?yrdsQzmrqQ)b?3+J!rur-0(3oYT1lbCoBq8QMF0Wy#adEtJ)B)AW6TbTDSMr*=Yn zWuCQ%Y}mKVJInIyM$VFAx{+S|7N+kS>&TpEOqJR~wJz!ORK^K%S?$h?k_@LxYK#1= zGxXb6B)WJTehbQxny*_v=dB&xr+t@aUkP-(aFx}L`>mm}u~Xw<8M+XAQu9nza`u`x zl8*Neu&0aPw}Nj4MTEb2nzMMrzZi5Z1nM}Ub!^~Lta2+o+*qRJMN3RPAz1Io(_PCLn;;; zGYbGxnW}Q&RQL(j8f(LS7cLmJ1Qd7WS)(bQBD#Z(5m{6f1RUhZlOA~Z~ z36jqRQ$RP=^`s@^1J^qiiPB>huZkRQ$xxQ0S4zJe13Dt<0FLs&83)5uuMA-)V_o_Q zlf%j)hPB=p)^62wdE;^44SVdZk~UJF6&Gy**YUfjPEW0s*Y)Y}?6bc8IKl;*#de0s z`!HRG^5?1b{<;@Z^S%5oWrOGG>hC89uCufKY9kk7KaZt;?t^Y8T#bLb$T(kin2rH< zPu352Bi_@CDEW2t>Z@G+p)kPl%62~sdoHeg-$TeY=R{<1s(Xlhq~i9&N4TUU%a7mb zmetP|G&labfOr~zg$nY4809_3to_cde7YL-&bKi-$#aFHRjSu@H}1NB@Vt1Cc6J4l z8I)*-w^GjaDX(ek?K~sO4yO6dAZAxkER*r)el=oeXzYqcOZFl2^maiv?wR=(;<|wM zSD2lLn68U>PoGLhe(uMmf0_BHMXl~l9KA(zgfbBJ@mwIkcUlGV8X@rCkV zew(uLukIh0J0ApgKKMhhQuuy6==wY%EX;5|GsF`;z82IA!kcWkhi9hV6Xx?l&$ooP zpyX@9|DR8NKJlNQfd3nvBFH*F~b9?8d^}d_R zyW~8X?NUzHyzj$OO*tV`-p7=#n%QtVDpQwDQzj%+H6Ic!<*D~~`uczKIQ83&`nE5x z;G*x3QE#MpXUT)geqQD0)nktS%&V>2kGa_Dv7TmkmQ#afS>5-TX;|{}XC4IW5`XGi zY$`IGib|)Nn&$({S=4>Wp0bbWZd65B*Q~n0*@OoLlUJKePhIWsS0^=|9<&se+Ap6) zBlf7@3g zK4TmR3quAh&b_nbe@@?DT6bmY*_UFEhQfo|$gdWiDVt1KQns%pNUv9L%**t`udmi5!){QiIa`)*9{%=# z94#?M1{VZaChrV+G0r=A)@7`}t~g{6(Z%OZdO;(TypouPVnZa<2+BoyYrXCz)Qf{dgSlv{#Q#)X#iP-#B-0p)qw>;v3J5SRK#ov5rzKdnO3yn{n-Rj>nQq27t9j6Nh?>QO+6^CVxCv7d530i{#+9G; zJZN68dsM4(`aM+-idSvF{I&84inkV%(X0xrY#cXS+qiW9{=IxUZ}rK#%D$-QC+H(- zy;plzZM^+eKSog7|DAcR?g6cb8Z7ltMG&kZGhtBPemVrR2g(?YWgGo|Y{hq^wo?3F zwo9R*;~vj+BBl$2#35M>@pXRRE0s6@Hba93Ub9~Kr?8ZLCM?vsuuR66Y#75C*%Mat z4PjuUskGR@!NFUC=DH-i1$Fsy@A3&l2$goCc_jYz$w4oSJjlU40q&jzjguPEqnptQSh*0`3OTh&bG3`$=zUq#2iGQ zBD*nHI)ck5B$!BAbA>Vw_1}D(qS2xrsP;aq(>HuJ6CDrcN(YRYqHK1Otfoo_j2n0I zru!18{19?qu^_BpD^;>Jpl;89EcYb=j~LW90t$y5(oZ|#o&lJ=&g!I^F6xCKDCsypvba!YL7Q4@%Dk& z>P5M(KNo!l1PxR`&>_v=Nj9J9PtYQxOd>5BxPd033d~xv$zpP(h}H#*JtfF8&Td({ z+aK;|U-jtxT1QHh%~Q%w$>5z~v@V_Ys*3>0elA^gdnpZNZ)+bVoi|kK__SMaa0orj zk*@Nlo`HtFs=q!8jiBWTo6ph0+3q1^LUOpRV~(9=N%){4$&dgYf^x&{!)xcC&zOdB zB&Y0h5phDciZ3Y@o`Ql|(ePFSQ&O)Jrs2J%{V%#*>daWIL9;YpUd=FnZkOk=_I>ie z*->fAfb?woDg1g~Bu#8Wipu%nMceTTqI+yaqc8*j=JKZGk~pUwVY-{OZfkU?4|s{O zQu@G1IugwvV{)Ub3{~Gk&ApYVgq}|@$D%8gC2eIS<&}z(*Rbh-`H>n7BpDUVDlZIX z5|cv==xf-`eIUbVR^B|KQG6N#F3u2@?4`sW)!M6x)s6hmWWC@qJa*j1Q45nwf7}t7 zN(}L_W^h^1fEgIQ(VN|CPF+ofh5dIiO?`%H>c*Y~6^JGzUdBsP(wpNTJs^lys!u<# zQSm?}pQ!+MpdgEk647eJwDJ|1ZcoDP-4Tqp^c|gsrHY%J&&+@Xutw!4TuRDL^#JZmZ_6A_j4R!~Z+REz_~6t*?Tf;R z*M`euMY#tMt6aSJy@v{o`LRQZVw3$lR<-N4Tjx9Ed+l)EAugQP?R1xeX}4PKn4N(s zn228gtnb}m!PkXmg*yt57hWm^F-DX1(;Uj&2QMh_<(!xG6w2HwYx98e?EA5G(OF0% z!DxSpjU)VleQdgLS<`eBkk05;8wfuBN(Ia-$OrB3QWcyRJjb7?S!e?Bs7(p&WC70 zmO%_4t5dI@`LSh+XA~8npdFB5?!4Uf9Q*|w))%vP7I+k`voxCMBf93lWd_|kzcSd@WnX(0Rzw_P!1+CQzn}ndvY< z@$G-#X6u}8u5PMrs%xrmdffDsM0=g!w-DKQ@|@qAZKup>ovA)D^gr|Y$Z<*B`$5Tqr-LW=vEgZcPx_RUp~kGY4%;cV z3S07gS{cGHz#zU1&8?PBGH-^Jzgd>0(W-t-t)kxZG;RR@&YC5c z`Q+qaeTo+pZH5HxZ`CYU2N$iHEH_YNDCZItN_JRF9Ab~lCNer>l2Q?Xs<_|u^s5d^T|QM(YWjy zI4^)1_#)i?3Ov%U*!lVa82!7dFGiU6Bhf|jCZ&1gX_wKOGWu0vxb;yDPuGGoZn-+( zBF;_8g^JMTvsWmDw=&9ttWGY{AA|pi8}6Gq3N4wqc!951$FHsZHq#O|c_kfR)+7of z^Rayw_$PmmK<-wU)t!$B;$MpwKgvXvtlEGtN^TI^(Ca2bYv5&h&)DiM4y+ZnC2%POMI)}Zq#m` zzd9e|HqTh$z#U`E?+oXAt9Z9F<*b`tqAH7(pN*+O>g6W>y&1kTP~|J8zrZ#CL=IHrxa9^0gK1+%Dq@Y= z*NRYcLY$P5lriL?R#{lkQZUVkGMg=YYGJuF>DUJ zdEYk88=qrjn93JQ`Y`d{w$&M4@v@Z|?sM{%7zXz{OP>)tv~h+G0bLw5QJuHpDcS(` z#KsNvCd&y3IY(oqk#Vvlz}a(pK%5yAq84gfJyHy4(`tHp;$+QEoJdalKwG&(tRtLP z?Y+b^o{?+KY0R}gs#vyGFAy*-L|ETFReyNWt+NvyP$j@xOA(f1^p3gY5up1nmXdI) z2{3wRNj^Ugaoo)fF^>#wx6fHSIc_guxs;gXB$iNOtVE^%mfv(Kp;C?#J=w`?DGh-@ z?Uy_C9E}?5%aO)x+&fWG4umVPz_m;rkxgod$KLbwLBctEkAig@MrnTl%dDw76YU(!FOO z!M3CAk3ccJyn>sYBz67Qd+5hw%M?Jq^%55r9cwQw_F=<`A@i%2)pu=d&ku44gs~Uy!tkp;jEhN>Oz`~AA=d$1xX`ObAlDUE-byM#_$9{_ zyd1+B#H)XU z{z+!>gyTXPrm&4pQltix!Dd`-mzuX14+%`si5;raid)tkwxVV6!%GR)VuqvaQ`jGAgNc(R>cl^ z@h4<;z3mkWV`3&NARfXThd&qw9t=1X;2(epIIOTAR54JY6BUwhWA_84FjV>xP0#@R zUIsLPEJ-=Z;J2)Qv3$}f07eP9{*K8vM{^b(4>h+XrDej3}vei^PhMXDF)o|tj4RBKpRaF{td z5}f0=CiYXIfi-{?+!p}^kgCSQRjpO!y!b`a4W@@c!SxV;|6qtNA~0ak^4_H-IuBzX zQ@u@cQcHNQE4`#YCf6l}uzzV5Rtf*IJjWTZ3X2eNshB%{tL2pp`!8>$?FS;6HDn6Q z%BQVAh2E%6p7H#GhvgR#fh&F++7zMSR@7i^%>L1~f~_W#sm)str3MviKHr~LWxCNl zq5navx6XLmb-tj~bmLv3IE&?*b17!zL90z#ag3ZH@p%SHJL4aX<4+Jqr9{tL)E0%y`=@rYqvTFkR z{ns_RLGqrhb$5`-AvLT`a;fuM?9i8~3`yQgUG^~PdJ)q#MA}&i`A5SfMnr4yWVA)e z#-oECe^Hs6x9WyzfrWg}=h3u(EM+((t04@GqjB#+E}GTP^A*E`VsMaG`k(!Va=`j= zg&{(Bb&LO5f#tRg4FCCt^`P~|j(;w)poYtx0}s=d?L2~i#Ci91v5u*w=aH}$zw_g5 zNghlq|8Z>Ym%kaTh2GpaU#g>+?A2SfhClX?I@!`vz1pp_yJ*3>zOqF}b=5d`7ySB}hmdub77h*~DEUbQzfFVg;ued6ij^!ambrce%-6%qQl&9Vu< z)LIx**x)9~7d$R-V%NL=4#%VO0HZc$q8g0Odd-Jt@@M^~hB?nYki_8wYHqI=mr{72{k7I znqv9cgO(iFUOA*D@I$Z_27riZRqG<|{+#E*-VltDSFd|NUzECJ1X)@(7~tGpMHuXM z7wwWZ(a0nH+$a>XHgo8*xSe;`o4c}wWO6FhHS8H6P0@?V`IUM*)L}t;5wmWHFVh$5 ze@-;K@e+58po@%uyApD20I^M0vVq$dK7&l4PzrME(bS)nY}1piiqYh7epm3s2OrvH z$aKL_mf)%FNOlf|PK*B7Aj?JoYyP=~rSK=aU;+*-P(y|{?b$1by{Pvy{Q#b!_t(>D zP#8erKneJ;nCwvQFAQ#a72e!_^RfZ4$o})jfAaM zISE^B*Db$6#|x`A^n+xjO~)${v=M93B_ChICa{SPPf^u&*T}S!D))G7VHX?B*@avOMGaCS%DJVGmjV- zo{`frgCfDLh$Otqe!|DTG+0^er=k_sFKWn1L2UN1wSP8^|B~)Y+Hqbv6!7&d6zxjE zmqU{%>fqia5!AdE(-!=Ug-8cCXS7xJQvc3E1azOH68o!uro8LE`L6A#G~aEk^X)*l z*&C4FmnKn1a;sm2%b4N-rbl-S`A@r*N8*ItM+MjBE6kEvNcNb4i*ok?_@?V5DG!?rlr^@?f%O9qYajPcNDt;h<{oi~9~ z+tU=N(F$>&_dE}!jYqwyecLKIX^c&XiWwOb`(b@3S2K;VNt_(6p7W>fJ8z) z`99aTkTs7 xOk33OX0U}Tdo5gp+-QC>@8r&hc26tz1*Iei*IIiuZk zW@dZ3r=LR-1BCeB--X{q66HwUX4X4G<|E<6lbJ^#FQq6{Iwurhc~w)_V61xjG+UEP zX;R*(AXRbg8m@6yIt3?-C9zxbW;zn&TBz`4%p4kT7}i~oVsH0km-uq`WADXc?N9dk zAC>@TB9*;e!N24_>5KLTJi2PlP0vLv6-FPX87wRltpeRh6Yj*g;`dHOrStawDNbO4 zv*w!18h-Ps;LdEyckcK=PyhZN)YHtE;|;l<=HvFl%hz`L=z7N2LJee=`AbhOM`^6m z8?T%ifFB`A_)RkjydCx7ZnFSJ^pOdF9S=)Qi*tA67J6{}9{KRIw_!C68!Z3ui^m_y zzOxyY%DVv$pPP`3q=TrLReOItmgN&&o`ff<#Uq2w7Qguxm!|aHM^K`TivM@iqXVy2 zWfi>bc+<$c{H^o{B|Q!M&*}C{z;dIH-xkljxyo5$X7?sJGkfmdl&GjJ6YS$Jxlb88 zFM`4#8qmc^X^_X6i3YOSwM*9pjgS(X=co9I$O0mQ@crWX3zxoYljrp!xPKfNwrnAZI}%!xEK|UD+*`?uD7Zyo8~IiAJaH zf>*1GA18Mr-&AnrD7zK;Ou37#z;IrySU~;9NYu&Y*GX%VAiOpI+2=7EVHDH4?zi{&82#<|BPoR4 znScPwdyw&hNmmV}S154ZQ~@8H&Gt_~!heXCBHeXQ%!Q3!V=@@}9T|1s#^o zw;`Nw0>5Zrpp$3uhHNraptCDRZd}sH6Y6+hnnSAS_bYvL>9}@m9Njg6+Y{W1%lK~VI&3M z^b-$*Y%iGNz(fz&%g2t#!^F7qrf8sFmbO)gz_jhO&V>`9#aW#1HOo4>^CcSnsuR=V zo&MmSrujOfpV2_#uW7HOBEGLH7uw&V3?IeetD}-*GPPxLt_LLmoxU^ONZ!i+0HmIT z6n3JLfIrV(43%76tbACkEjh!@(?zQvg`ja8hO#P3q~<*XTE#5_O94yO7O_5QOxe=* z>*|N|OH3DmAMHRpCVePdg7mbdC8bW<~|7|%j;Xwr~tkQ{8>VI z=UDxEdGA^@Z_@ZJnEy|%pVF6tA0!~5+}-Uqty}yVIW7^V@z_EY9h*lCm|&uL2SMVw5RO)J>7@$i0)^+4y;j-|cBDs+cq0GRHYEs4X{% z0b^^l^jkH0`Kod2?bh|o6>B9k^_>pO$Y{YHZz1`s@1!_(ehK~b2NxaX(`O~8(bZ!W zjJAQwaW;afgqY8t_)44*_)fy+N5&9A{Bdffv1TiDdw5#yyy1gd@^F=s1k!#|gr8}J zayYxnckPD6nFv2i$@5Zk@bltFX!Oa^(6Uq_@D5acD1CWcaAf!T@@5GtNp*a5sq!&f zDBr?c%;FjLh#F$|<{8#*%4RK{Z6N>25*!EX}aKi|pJ5WQCUeHDIiXX{ zx;5RsozmpQVn%&r(S&^r{;@!9USIA>(HCn81o=X)DH?y5v7<7Ux2H5#Tm=>f5MTze za&6Osf_j^UfAnBB8>lN%RY9}U6gc;W+u1>vpyH< zs1?g74N#J%ta~Wm$~_^QhX+!H-ZSQDJsGS<^&E)t8X&#Sy0Jy~Z<>~KtuYPT z)#`x&==N-#uq;6`j(aY{s%WWL111^zOIo_C=1RE_7E9WHK2hURP$$s83L)WxZi2eT z<3Gpr^0qUO&%P3e&Ycqb0<3)!E~|$x)sL zl@tvmUA8I9VhO*c-7g|if>>{KF(v7tO(_KOSl`|BgcFIGTb!Tn1@j(xxT`AO@UIIK zs7)g}(y5H%M2y@L=nrU+K7}DQutwiXOcro5catA~mTi$*`Yl+U7?nUVJQV*Eq-*R+ z&EjvHuyml>DpEVN2_>G$p3rKfN%$?B0OUqF_6D~Xw0S_a9U<|5zTW)y{Vy}tYH5XPC<{xeNl8_Aj!N>c zZ`+onb-;5490r?0bqGB?1?xIr{I>}_Tdilsk{d!2iD0oc3FcDZX`iq9K`$0x|uI>bk zqWvu)EBOP33_TE)4Dv^_Y57w{+6tC>m?$IRN#w@~FW7KZ!wn-BJRJof3>XK@4+ zECTOu{%sd|I<5uoDejt|@9j0+l|BRWKwZ3ngSi4Go+}sAJg#Ik(lae*CzU^NMAL6b z)4mHaP(HE~p`3TMjI4(Vd1LsTR@9OL8TX5@5R%alJZzygKB&?JE56hvxo^S0HK9|N zcMxDgFJ(ciI<>&;C_BPs@DUE?Yh9|woMr&bSj2U@KDCeqwnTp-)#c)X829X1+=zB2 z=!efYNxS-?^5Nj=OSl?YC8qqnd&BT}psqTO+9%^XG3-+4PNgo9KwT=vVQ5c&BCu>t=AYE6qw)lUF_Q!XJxBG%&#X=bAlAJGYxn?}Ha?s+fww6i!hOM~JT% z?>Kn+w;6QjipX!u+yA+%6*zSjDSy%9kw5mVWZTQ2**@BW{Z^LT3C7rCV zQM|VHs2GPl<>x}B$oD4-1NDM@L0R|g-tEXg5djLxE6^)Q^w4`^I`uNrWFk_IBth@D z_b*p7EOT<#O{&~~y;Agot?%}hi&jX9y=Fr3wi=rL{;}~U20m_jdL>ZL*7&aw89CCd zv9*~vcxbhp*zl1(@=nn;#@^lXPR?5-ASr}pHvif$9V=dvnubnJ=~qvhQ|%0|4iY|` z91`9-lrq9n$~93Cnm>e(QaghGs)B9<*VwLNTyce|!|)E+_@OnN zV}%*KNKFc;Q0wKt)P)H(x5YtYH^D^%;KPP$gwvqO4$;((SO-FR^~S3j`zlIcT|=*x zm*qN^IA+x|mY0v;HQQwqTVei;skaT|-$cY;KDzjdq1kr7jpau}WwfmsWLs*-KWn#o zue5#S_kD4l5x{m?N=6$^k2~%|4f@VJ5n}tuc9=uWhGj)(on20QroRG+tv!t4_jrLo zU{MeQ!RjKIdY-XF)8+Kgh8tr*U87`Mk%sr%L9r2g7mruczqT_VVrQ<MGY>|4T(%+5 zvV5ZYWUxi|Kq~0n5Iri!Ea>=CL@<+sTGjg13a{AoILP#@+3GasL%B*VtC+gup3`R#D!tAJqn1 zPg}c4l#srTxsSX2i05s=jrx_he%9)13va|^QCqknkQd~^b3a7&~(-(7Dm&SL*(Zp*mtp#A0 z&9?B?L(NUeag(AQe@f4o>eBiZ6$KeN^m5f>QZJwOM3;Ye0`=R4 zpIwj}kX(&-F1*twoiafMy;=t*8TbPmaksVULVrk$+|51luL_WYO%N&W0q%_^As!HN zJ1fwNd{f_t=O^=;S@IeEFnltFMqvHYZ%_exW>X2zszzjxe;+4T4lDvlBWs@wL(2=8 z)B_o2xc~Ss-FcE7HSo{GbXKsJ(e&SW%3|e=NJL)?5-0K6r#uW2CD(mxh4usbz`de9 zq1=;IYxo8$y$$xp5a29-%T*s0@J0}g;Gv5Xo=W5Cn zYRtXRfE4xVKZgd|n1!-^v5NznY6*>3%OPjw%;nu@q0X}wetFm?3+if(>}9T-SWs1A zi}jjk5J5Ep5IX*>1QGoC^=fty>ZP&l9>yti+M~=jPgjnmQHe2rri3hjKNGzPB7{@Z z%EQJTkioaTm;qQN6GuHUI3owng%=JHl$?t(I%|J#jJ7inpiu8j(8`%F7G%aw@A7z@ z@Kk)DA02W~+FUXaKPOMNJ!CzY;SSEJ=;cE}@d>exY~ zL;6$G0=fJQaZEmKt_%_n??p|wcdSCx+7A+b;lyQw)AmL73GwwY!F)X;5kcE`7L1$c zIIG(!kiq{nWmJ~B^V^`{a`OP{7u)r zmo!nL#?S_6l4rgM)N+Bho0$hzx*J1Xsw-Hz8H?AR_JS;OQ6~8>PkKYmE1)si3!pIp z;IzhRFF>82)dzIiyANpuX9-evH>Tf9K$$$;JnjW;f>3e{x#&k5qbIwejifBi)acn$ zle?oQL#xyi;WM-n@kxg1*^|YexWE%tdf>69H0c5Duaqi?uCMWf{jzsx{;{4O+trPK z@&GEhoq<#_`s5mx{%2)r$b>dgQ{oo25+e%rT!|V@#rEh*LMLN~GA+B*{AdO1ZAk}m z)+xIQBes&ZrF-_bE;qO<+%jkr*4E=M>Gp<;U91ypT86}^(cxH1EdJvY-KXnc-qjp) z1=$>?cp*-H5kv5p13Vw8nsbftW1ieOa*X{_l-nyza%>u0D^af*fceDrg=UqJ3xj^GqLYH02?4snSC zd^NmHeC~^iY0shNERI5`Xby5$Zw|r;t!xvpgZ3=XF|3`7-%R*((&@Hx7=%AXKM6Fz zHVysahiB3;=)!A{x7*E*v^p~%W|9uAiKjN0#agHy%?edZ45DrN1VSr<8Hm7;XDvi^ zHy&+($Ht3cse~L@^*~_VHz@r}traatR`EwX4JFlx7oq6Q6dt#{TUGYc6t0OQU3bD+ z!WQC>QG+SIDjqV2ZFTPgpvp<-aZv1djMPF9LW|Dskb3k~$meh9$2J|R(z+J7-=@my zql6}G>OH?Vsg3VrD+(6>PPsa$-rrf0wP-grB&FG8yE3DthhDbU=^y65PQLLZ)QzWm zz=i>pd48SiQ4;0CG^Nf1+2_;O!UnJXI@zuX7hRkn2=yXkOmD6@Q_jS$ZdE|KyuPYl z(MOUS@*GUq`3?uwC9jPcJ{=Jf#a5I{bNm?WqUh|CxXXvv0Swt2C)SS{9YsQL-`22C z)f69sqZcfJmmeh!t_?;iycUl)Vd;EjyN0RuMH25iWdlA!Wux1D9U+xX^(TeqE_+7* zgcB!*RpNkTw}#p{%+Or*DdeZiE^wU+XJRo*td)cZC37B;Fv0+Zd<0#JkU#FH#yKyP(=~75 z!t5T9ZFldwK=Pfp5EXiqOooPhw9aXOY~QKb0Ey3>8CNn-hmN|)4v)|LnKh0uu3uxX z@0Nve7+UHlyrj&EUgB}WnJO)R3fniXqtk23&>KAq&-_3Y&0R56yXb3UNRRwE8s@$b z3%qPN?LfF7!#_l{MRs2Jj1fzx0+LQ^zXcM8+%9qlEsy1|c94CmI-$3xDHYl;r~N>g zIk-KiD7xqQ$0)1OS70|=_)oOJHDWEyx9g*z(+g60JqI|Gk~NfbBMmz^b}n2{^#aOP ziV_v!XtP3enOiOqFU>l;fahrSI7*<>$@F=bqs%B4q}RHp?q*snlAZukN0F)u%>{jw zGu=ujV=!Ca&AVWC#v2XWDSacZ*X&c~b4~djw{1}jFPj}p(_li!6Q;zp946Vg5RCse zzXUmymCaBBtLdt*#eK(2x%PEm{1P}Pzq z_O59_=139Sjp?(=LYC0L&{yI1L5G`pZVApTOco1#-U?neIorlbfWW0cdCVYH}RjN$h8w=epV-jDUgP2i)zf{f?_& zE8`jj&)Pdr%c(1}`D=m}V{M>D@X3KhR+0U#IemtnaU9Q3N4ye0!BqPnIp>hc=I$p! zW}ESchR1P8nh;vm+H5sZPRFh;HP@WG@7Zt z)pbGW8PaS-HOFWZ{4zd{@rpaF)q2}BDKy`*uIY~q!g_GcLctkQTz~6yEeO%TN>%aG zGJgM-ORiY8L^TJk%;neVwrLEgC#Ge5tlxCY_}<2Q`~%{rqEw3K`~xmv_28OI_RBR_ z4Bw(lLGoF58Q+}DhL47a^GFk#a~L?bT{G5D&d8!5MbDAto@&(HeSBNV43N{)a@X?W zn~Y{W%c7om?HrG(tXkuWX9|3uxK~zmT{zI`sP(kzdmR21G^934dE z6B~pzJHb$3`V3Zn$XIo_AP%J@MkL+G)jamF7H6 z?)HBmVr^Ug_{#R+zPPWs{(}D(;#hkjO07ncS=1nn_ZOp&rBVuHFIOUrceYgundi(a zMi}q!0-YIF(Og74rP;%A9MUNIjzP*G6V&To2pJ9@^qtz)S6VjSpOvmisY?`kW%t(R z^yh;*e>yZ15YvV(i6A~I+OXjB?`8mK@{bRmJ|#JXhte{3wys4*x>)^JrtIlOW%USl zPys>&Wds8vO6uxUDsy5gvZ|K#o=Tyr9dm5HJjyberFuW#ZWWVN|5##-57J~lO8Lh4 z2K~Jl80=1qr}vw=;TODa(qTR)2;e_KfcgH{B&o9~Ug;-FrP2O38`-f=>wpwj#&Azx zOH9@~cMRCtl_M199t6{KlU2l+XfB7_;?hBl!b-FRo9uS+z>0x>@kpEfwzI+ZJKC8) z;WtiQltb;t6@5Q7A-eKbLG<>ZOC_E*fy;erYtOTE#~yg z3I3y#T$l#%?rr*+(K&nz#15Q{gP!+AIWy-?WqkxgJ)R^El~ttJ-mc3{N}J}IC$?Q| zhZj(6Kbxg8FRjP{mNP=REc;WoFsXQr^eT7NEDR>)js3kp!vRBY~0X#1@RFN{S+V`s{N1ZCs-*f-T95(1*6sedSZR1MNH!CB95=eMvRwX({vMz z9s><443ovn{JVf1DMD%)DJr(O@=Q_86DGC*l{C3My7+!S};e$i(|x_D)_!Z_tIiTh)$`8S!;(4v!BI!uS-SZq}>4D48Fqa22wZpg8}gR?JCZmJk>g1_@A70W`B zSWAMFWdCxD$%U4+8FPxs*?rb}#4uT>8q#2I4xg06OBoEe3r@-?w46UnvYC}}3#DjeNZGWj)|E9r&%EtoN>OunpT#jSw2G~y)`Tl9zV(pMTym$2$-!-69LCTJ z1vwubN0QYjLy8vNA_c;x+rx%(V=oibM5%=Sifnl~fJDAE%*S_ZDn9`(e_^rE8Q zx3b}F_vw)o$4@Q$XC-}2v>{F#Q&#zyC_l6$nzRKO0t-DN_fBCo+NqQQ*-A{ixDy^l z7_A}*>pX*mA`JK&k4_TCv=}WjCNX&L7n^!5rklk(9v!F#oID_vMo`X+P3)g=Fe|>E zum@p@FXPdLt}Q_Svqy$h$Ts4?i?@=9z;qA}s*pwV`Xi{7J<|bpp6MjZ_Da!Af0yn^ zM1eCT!zyBN%7ZEX^pIvlI@wl zy__z;0o8{Je1REkv|`Pyt@jGu*il78oa{@b!K8NqqgZ1}S~D)CNFnsLml$`!in6>e zzO3RK1WoXuL{AS8j2rk6r#biTZuh2wz$Oqp`;|CT0=g?~AY}~MoK2-IP$28^ZpT4o zXW3}ju*vg03M+ZBmOCzviM91u&Vi^V-z|>!x2$QCJ!0>3m@z#RY2%&ttjv}tSJJ+> z$Q>hZx6cjg--;xyqzI<%WG4qcRz&vVwvv+rAHIf?uA~6P63WrUrOgpSNjWP~gJ~7L z&{i6ahycpc8iW9~pf%FS$yC^&yc)>z;`gnn6zn zHfa;lBuSQlL4>)q=)pg5tI?pW*0F&>lLx0Elv>eaw0eR7c3&LWE4rptD)z!n$lziOpLjU3LlWZ8b32C)MDymVutRm(jwUY@30Km7f$v|AM<= zkzW}_Wj-C;`t z=jgwKA5G7lQ+8JYgjy;%y`A)g=&$_3#=R&^GaT-8B~>mq6o-GjgCIo3lw2b9Y?+5Q zsGI`r&2Av0I^a-QSxkxJM`El20Cu)s_dI6iVZ2zoHiMNrLt`P)uuBRN32uusE@sYJ zxPl+Z#fT&DOTI$TXkj7)CA0 zLq@F?UHiIz1df+ga zGh5Y(tC`uWkIoe+A;^NJNEH?Yr0QnsOD?0UD^3|1Ts7-1p&(0Lc!LxXnu+x z8P@R~_mKKFt`ynXyQc1P+T$}L(xDiq7%M6{FvJTj40jURXZMFrFpE>4Fz_MR5A7W0 zjdZ10M?AqLAQ_Utibj0knoQE9Kf-w4{~67&&i)y#5NcNY{jmB_pHW>LuLnB@0OL!K120q?CDjjg zF8V`-FedarFBbPls3?m({5UN^j|8A7Ry>%oxF`W5J2ytWpWyye7y#_Q%C!F+^8=+5 zwve<);&9M@6>na9^Nfv_5dk2$v@(4SUM0ci5=&|R3E!`%hPl#13xUYagzL);f!I47 zZo}grUt%-PjGJZ0z`0gVYkyi+p2&uks>r|ipt%TmI*O^JTd%ClE}SYav^Gx}UCm){ z(fquqsi&#AXk!^U@`A zvXMbR6W&RvE}rcqBoqGd#R1|ga=!li{IWeWzxtECUZqv@`gZJXgjhyZ1@0bmOvx2L zTZrF$Xf+zd_F)w1+`hN4(e!VrEQ}E|Dy)Hb-aTb|E~Tx0}6)S~zF7OhhnQ0=XUzSHH+RbTry zycSb5=O;XWEiS{Wi418X!DW`4diwMSSrcACPu}JAPEUyuGjVe>C_@Bgta(j!Li5)G zEyfQ@qHl21RhfndVckBl;KIULebLQSfK20((tZ4G)W*^#!31!)EXSZ{Fv7d6rzW5D z4noJ<&lq&vDt4EE*1u!v$FY0mgm^te+2Lx&E2yBrEjUa3ByN8z?gy7KJ$=e5`J4qp znu?Y)1&~reL*6FWN~X{JT~OBf`0dP>gDYn4=tcgi?T27-nxaKj5#Ger?dMSb<)PuD*AQg|M2hy zoyecS5M`!wELza^w0`uAeOcbtvk|@wH1w;B=Y{l6p0-K6jisS|x}q})e*eRx>8qz6 zZfxb*7I&9*1-EwPP)k!@+X!ULYl+J^Hi*lT)4C|!*}Yz6bJ?cliREl57bdr!cHE*6u zSpKspCKLH#Hxv02)n2VL$H;8?)ceZfH{fH&D4zB$t>ZRU zVIAT(#?~{<<%rt0Co;yobmKQpOnqc7-59##am$|d!L_*g@f)j0=p=1s@_zeqZ0`HY zuPuBoG>TI&MRm|CC6&q)z;$N2lB6$OnaS5bLy%XC4@{%8lqRG~q`uE#w`X<@#I4(g$u5hOKvdK@;d1##2wbtg*Zc+2goI(i)C=)t9i;L zgRf4D$F5-TipPMDNz@c8luFVGTE#i@mB#O!_%$osHtdB^ z;O?0USx6%V$bC%n$f2`u>-+@5`$8gAIdS}98rPzrtpT}2utWxH_j zY;5&{hkkvbSYMn>y*&cn95>0-$wS~bOs;wsHZ1-HlPa^JmwZ(R- z=aM0=>r#BOm}JI)^Tj2D9$cAOw?E2TOeMoF=4H~fkLSR|H&<^pmN+(_Mrg)qKUSA+ ztAJ9plr-?Od7t&a%~H3|+Wo~PlrNV`ru`f&*wl8+gx+M#1Ol8^|KftDmPY6m3Ff`k zvJaH`tPls!V?LI+%AH1N!fZB1r44pS?8nfnJv(?z_|2Bs!xDq#?iSN;CFpaQeA&qc2 zg-YhNrEVVx-Wt}7e9d$WW-+xFNv8BRV~Kq5OnkEDcGS98AGeHdRq1=Hk^`w0S_kfW z+J0(UUOGkHcJ1}$(arRyg?6qz(e8jM?H7u{FhrSzZzBPSRhgj_vNAI3Z#Q}ilCd&9 zsCWK+XP@|-MV9n1^C@bM_CP-53sYxq2~^EwoyhC4K)WV)hz!*MV4}P3Na@)<`atEWOko{jxPj!^tJ{(4}a78KM(-Iiep1SJlpmh7-tZ8;XIK zigdvYdeO$g@z1@fkyN#g%IFvtDs@ZRA<@-F)@tbWqs8QI!zHFgiPG5rV560VVA!Z> ztWP*L7&fY_b~s;xeG%AP1+mN-EvxkJQ7;W%EsEsc0ZpFhWpi<;ZkdGQS6nDLm|v&sEHrXN1;mKx%(P^6~^ z`ejGUeYVg*XSLU4la869r?$Mb{e`Dx)0fj+nbCSwg6&0KTlEu7!0Dx-srn}&7+ZSI z5X=u96Gdk)rz&Tz#K&6P?7FgQ3#Q%-63sO;2`A z-E!MwdHT`Shd6J{ja^bbSt~o_#cRLG^V!>UXn44B%^n7;Ei0Kp1bd~Pjup-O?yH)y ze>j%Ir!H-3Ht8TdzCE8cJHDyoX?H1V|N55dH~?||SjlN)Rsj+G-T(cuh2%rB*NZH0 zarf+-ed?lAI&SW`W?(ge$tRVN5kNOoJYCL*viI)7w_B*9F0%!?D?E>OfpG9qPF5%O z3$g+gr>J`?jxJ?QIm@ugquI1zF0PC`mna!OH5VlD^}HAy_|o#r-PJOnVe(ny&lIex z;``AaZ9@xvwr$I<=p zk3;{FmmKlaBksiSAjgSwaSD8zm-A)X7iD4|vPXNl{Kx>!eE|d6jBIF%RQUyx1G>kR zC4bJXu%&6Oz%LKI@+X4hT~>_`#jMHF{SLosY4^O%{1wO~i`8NL-=+f@W z@rC@tWc6L9GSy7JD#c9kWFL$tP0e8ihqct6|g6WQrdTgTFwaa+eC z8NgK~x-W~_YAo-lcxy+~Dy%X{CG_gB9lY`W(MV4Zt97rhuE;K7C}&mE2DNLDvE6S* zhl2_k4c1|agY!DYMj(VWm78NUrYRLt%+_2~bsZ0F?dE_D2R2k28oHsNfyK#6 z=ne|Qt_o3>5`_3HP|rkKz{y0K&xrUAc)%tflkJM#~ho;t8V+jyCHw-weTS@E1*v}gVQ6(nT+DJf6*g43EzV?Dh z=fK#5iKR$VsA$DNHZsyfMj&HQW1-?8sEH&)a^VpAkk&@1xS7jLFhOZ$4dr=C(xsR& zLZFS6i=<++S8JJrU?t8QcaN>Fbqy(FxoJ$aJ7mQ)Fm%m0FrgmTX_@;F%F>Ab7E~wt zsP*I248;J0ky7RSMk7=jfL~rm7U^xF1QaA#3+`piM+TNi|2h)93#v? zi2khOqd((IJW08$WV!3E%5G-X%jV(!AhN*1d6b}Aa~$Vqd98Q+nqGiS%I{ltdRBt@ z$0nv^CZ@T53~O&&PIxurS4na#x=1#oKT0tu7T$hXXW?5mEBEln%xi+IEwV0bTp? zHyo%Fyn<0JmwD4X(i9n1gw^eQ;us~;0Khux9Z&MgyNFy6m%cuR>GkRQEq^W5dv04J zT>iHI!=Gg%zc=WkraOhNx;^b`#}RVnap`AAJuE6wizP9|QgD#VNJ&tW7UQqw=Y#3s z?@Kg^tP&(S#bD(m(JT`ru?%75RuEZ2D1Cz_NTXE2iQq1zBp^3FC3Zd5U`ysn*zwfm?DM87oH@AmoZ_NHGRRgOb8`k z0CQW>!>1%$Ej_0)O1lS`x9y=u%+~x zn`$!=f|fK7aFZZctYhXOCM(pTel-mO0u~mYU;$e{J_QcrpQypTi=%gmv1S zbwdJ9v%N6-kHV(UZH#l1+t~()(`N##8oY(#`hgn!v}ez1sh%&3Vzh875$pYF#yIrz zd!)9p4Ia}j{!fkBHOcQ;@n>GJCv^;X^eKztzLNi7O$s|V6p~_Nsw>a_&CpO{BE&I13t_cfrlV4wwN#NHjEO?3 z$sW;1{+>O?DEtq!_D)_HYFtGgO@KNc`P-(^T@wCoRlKkmx;cq9bXmO+t+`2(tW|+2 zw}Ki0{@tZ8!=lT(uys#NOidO;8!ODbyqH$fNl8t%JSJLvR0De`o@zY0AFjW>Fk==o zOezX#FjeZNvtzC>V^iOLcA;EN%(5cp_Z2B4OZ158X$%G*J`Ubc|B?n?*p42RG#g3b zry?DM&E@^vD|+Z_B4AD%DTc%{P^- z!UliU8WnfPh_OhCzUNn5;F*;WY9l@W*fU*VA%ugidtUZ3)yL1eg86^y6@%&)h2V*TV*h;K>|t-(c2FX={w*!Z2f| zmzq>k>=UFcdI@?UrQs*Y4NR6;2>%EL3J4Q$Vz7%73XR7GKw9EZQ6H3j7J`N{4-~~n zmWF7#EJX(c6O||+#zev0kLG|ov&WoJ{}!1OD%4nsB9H(TTxip10|4HwazYhDbH_r2 zE`#|sgZcfp$Y2@p@0?KJMF#iV`)?2!rRag?^sx|3|fbezQ$YqG>QzFh<}5-e>% zDsEOyw+OWRuuX&9fTs7F(}lgR)`eYJg9HB~_NVc2Av?3e@7)fYVi)cHlwtvq zvseusoQ0h57d);83EfjJ6E-nmqDccvrh1#m=7)yiN1yS$v{nF}J#9_K6 zL*@zRg0p(ifLtBrNn?Vr_#Y)T!ctwkwjzB$_P{fz_1VOpoJJcZ9ck7Nn&wE<7`qiL zFE}ECbd_;RS(o@La**{0xvhk*4}RrOcHTJZDFe{=Y4fF#voq3tb##JFcyP=~j!NV!VKBLttC*d1wzmvkRw2fOeeWQn8N34Yd*B>L$a^+#Pt9(?0qNC)s%rlSq3 zoPkluTBMSaEP9X?!%NITlxOE5oDR9caX^RVvrHkO zc%GRFDL978)#eHg{6ifpi&QQt7%X*htX30JspIe<$~*$tr$N_&oo~E6NTEVJNQ3`T zECH~G=D@cBwBSYL z5CT+i*{3RkaMthc0|_WPV1C+Qsv>|527f}x2kv;i$(msS=6~tA1D0Gz0GmX|gCtoC zetrcv1QEcBq&W@r!in=BS>A^dgpZ1zhxgrGZm?!-<`7Ufebq?N|^`9DW9Vp&^)N{b=Bq-isy^e|n^Z8HJ*AawA^6?W*BSWjr zAcy?OaOXf1Yic*i@NIIyn{-`6PF@qwB?!0sr~uDLpnP51zepG;ZGJa!rY9iGa`oV zsZ9?i^BKcZs#mr6qd;8Eh(DR-%d;kWb|u3Dj0UYZ^mUX^*JA{GXm--~XN{6%{)(gs zzACsJvy2t=P)2+t5&r+o*6EE=lEii{;g?e1x4%rzjm=i$=YFkE3XY=NX;O=bi`_F5 z<>Y+Jrt~agVo24}av-@hJlr;-bz9n>aHzq~bj+omrbG>KG;tiHwtYKV0_M?Fq7;KwIDhWd<7CHG6w8Ps-0Q3mH z4ZZ^OKc7MeGr=*F@_H>nhvHB12ZGhXU$}ybH@JeOw~&A8;0kWH`wWh~^yo#*^L$ZE zW4tT`;(T5#W$i}@+TLssDitArr11TbsJ~LHYwdCW7GdlAbSAX!$cxKP!*nV%Rl>u* zs0RuWpA*(_e8>;IRVXRtweN3imwXBoZfmJjP_b=I5Q2}6y`)OCw|rN~=(+TW3V!8jb7 z_^E&J&21wI908|gW9lN8*>C-#OU=S~lTV{CA+%L+uqMNh&~LaeMx-q+QAeFDo-CS= z*t;WPE-`g3w@=-fRQGGj!+fbfRBby7PuGU^nfiWRtZpVo^)*>4>G8PFYe^aK_-#Fj zk_>Z-Eq;`5(>>E^}AEH|0h8*0r#s%aavPCQY%8&q=?fVQwK3gV5 z_}mN2eW6|3$I#4V0*2Kh6G$FLuYh5-^xXe!b$)IVHHdrx zt4(Z0Pt$>PxqzMLpRLZ|mJGJlHQR%rgf&lLg=)*(Lty?38?>$=r-8!;`dtJ7Zy3jX zP(8L`E#Vi1Ewnxx+bvO?c_~MX?1BpK5T5I%^M6Zo($Dk62EP4!pW`vF3t8ha|6kzX zeqy_qrFW^P?>iiQU=<@; z$U4+{omQGxko8^lF2axT-^cQ|kWEVeuZeSyYU23fI3yt^khBN&1*n%%ms7TSH z@|$3P=luTIvoq)3y|Z`D&g`81erD@OiKcnJ-c@U&;Tx7mZY$SE-1@ND`DWG%pO!|W zl2)TU&oko*e)Xv(-MhE!{e@JvwHRvIMKwqK!U8xs!TaJX@I)UpP4JQ-4w^cPBtW_@OOjmJHfp{q zPIxe8`Q~YYc31+#;c(HZymW-&l+1*P4uwQkyIrG@j|)bt4F%xPe)ctBxn8r7VF6#w zN)P@Dr8eBTTqe2T3sX=rFr{<=c+-V^0Hc7)*Jw3(PovaVV%mj#z*oZ*z*o!i7WryW zo`)df$u^9F_udSv*uGQ%V$J!(l)p9`%dlD&r3tAMLqF3XA`l2*N(yWwlPo?2P7tvh zVt#E45rtfslHiU(9c30t*iDbH#k{%iLErUbi>W=BV%6uJ!h_lTrcux(N$QO?yqF_FPCh$T`;&gO zy+)t~hR2vMZzkHNzluul&ahRL%=Xx5u{dvy!`5FD#adzs(pSRZCPhzBf;J4~5cT3k5RqF|DF1f?uQbz5EJ>0xh z&aB_VoCZT>v=pZFTB$JC*`oDDY?qX=2-4rEP$@hxWJykB;w8Ck5w(WcuS?r5@p@vV zGQMp^Wmhxtfi-Ib|*CA%#qT$-KdEElti8$8Hme7U$a=%FRK zo{4v>XNzjU(kqKCni;gB_PmH7_3sA_8wbiUK6p6>ealLf({H89sP(wzI$))mz0P)V zDjc?CU8BmyJq*V6Dtxv3|t9-$|?tW+~eib4YFEWoz z4{vgLBx%)X6I*0~iy(bL9wSLs?V2rQIVSzC71e&%s|i3nq64g*>hHMx$Qriky3?nz zi9k0p4Ya`HdiglFlXbUSaJ8>vF2c0g;n%GkicLs|^6S zBh!EtibtX2?1zLo(%FjUdE5HL$UXd^T2}zl5mTx8yy; zb&0ueBQ&r%w=cXEbDeJerurpzOf1uKH_x8pFn8eT0-s==T z<0X?QW>YOMSCr7@Yp&65moB{@f6pYiWc4011-=%uS_JKeN~dcv?cAgQ(U?I{9xP4? z5G|eDk9WJ$Mao*4d+#(s6NYy?BwIuFl2Hn-@V5#fz^I^jJM>7n)%N+@$R}Ob8cKkO zJi6AfbG~pJxna4bp+tJn2p8&h6)&FY2EO7dym+a=2)8;S|3@2e$=J^*-mM}oil-T0 zDTGQhfWZng}ZNo{fOcqZ^WZMQ8jck*q<bR*`epDvoV0?J`ntQh`rZl) z)}Iw4GK8a~ANNsu*cA=4x$r<#*5>+U?VeMeSG<*UOH1b!S3a21UsjIDTq;J+zwnzn zKr_8KBC`g@0xxBk)A?bxR?G8Knnw*r_1BgsO1sno$56)Q#H;z?^~=iqr#dhAE9u%v z-AhToesKh7dBg1HNziaeWn!rHxx`Q%Q1-q!LiQ3kCTD=yk2y;G_dfMF&Nbr;yvXVC2^ZNum~T-K1Cl*Hh;D!Yj#zIW-vscseVftR3n35<1?0p)P8jz4o zZNK}nS>UCBSw=y^?U*qW@{X}!p|Ewj3IVh!4Z^*vLRM;&sB}0^54+v^Wg$Uxvtt#b z1(m+8$Aqk@JEn)N*39V>N>Loxut14I|8cGwL9`q|H3A=PMWF+~0+G6|a$u-zhz8A% z17DX1AZ|VI*FFv)bj*T2A2Jm zys!0HfCEU-!|Klun2=|Y34OvB4uA$NasZ&^=fL0oHuoJ8d#DxViYW&v1hDoAJt8Ep z^$CCjph1fqKsEAckprkkvO7SA+5d6?H&EysI!E-d$G&+E`lJrTI{?!Eypsd3)N%k- z$S8a0k+*P>1K1}N#?`i>(tnw(Lg1Z|prP^(u28szSB)4ti?^Bw38|9@h?eefkL>j) z0R^X|cE|DTeS_CrlI$ff9l6gy7{&ZF{86=zkCQrWwygmBmtH3Wx!^#ACRu30!_4n= z3g?a@yTE96BtpmSiSFu3KH_h~AVPj&6xcqLl6}}+ofa?pyG=lW9?_ovx_6F+woe{r zz*!15dM-AS0uK~o$%6{rMUDrYIO;U0+=1wx-yZ9o77|W@t$7q!rvNMS27X=&-Qoz?|bVIaA)4#bSCerrhT%5Ce~@uDbE*LQKYRxm^$w}iQopI;r>(E&r`0?-^?C&e ztd;#b8L+&z|1o}X3Ep&MtVpcTT~+2}T-VaC}M+_8QK;yNYxw+!CL n+T-#Y@of0JGua#cJ#xDok1BR$_uD+rSaAGU>V6UPR#)f0V?9EV diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz index 6051dd9865c31ea70fd335a99454dce98daa368d..c05f690fe1febc5a9c36b982ec1823f4804e056e 100644 GIT binary patch literal 181 zcmV;m080NKiwFP!0000005y)mj>IqsL+}1el6uo<5*ksMYUb;gth5ItFyQAq*1F6Q z+N;=sV%KxikoE4B*h?9h-F?s=_CYuYe=bGLZ#0OyNxM_b12mI4!LHWp>7fk=zcdwx z6wmi9<^QDqL8aC!vTZEFI0%!N=*PC$zW=Zi^~97ul6Nm-J(rkk65q6p@MO-B$P_d@ jiSW-NN`maQnI&W7UUxra`L*yGLfi8%R9s7M1pxp6{QFr! literal 4376 zcmV+z5$En7iwFP!000000F7F^cHGL9&FeFq>jw5LPfU(@C0nWWSpUvMY-kr^c32x1e92f^<@?zv zja@ZZHICdsV|*~#_IOB1(eQTPswtSpIBa;-+;3y#8lvJm` z^jOto6-(#Kgy*Yd;+@@^Z1kkgQqt|~!}kXJn`~B>Z`CL5+Q#f`%y@|IBt$RcPioMO zQ7?6Ab~=_RVuN?q#B5RucVSZJrgz3>Ja$q(Y=?8xn!+Ea-QgQ9j+|i`psPkNoZU8&#%#>5&g)L?>LZp{PFUX;QBuo}LkJZ$~6A zT0Pnn#;mcEsLm$stofz%2;>mbZ1?KvEzIpHwYqe@>7r#S4N`ab)^&I+Q8Sg@5$PZt8qb{r;10wBPi?o&wdCEu>Qo4zHJ6eHUR8G%1%xZMnM8+5?p74WC zkHuwX4CyT$zd@L~6=>Q!jWv3cg2z*E*H4dw@i+?49D7;Ewkw1m9aKGYQYA-yW4WLL z0(}YyQor1C!Nd@s>YV}d6+UG(00DSjae~RHS|)Wu^)b6rj;f3GAkiCTzRV_CmYjUG zGfg6vP)zk^Y{59}N!{6GQH4M}KUv>}IO%LhZo4a{XSkw94r9md;!Yj{TS`W?I^#kU zP!@u0K0&A1jJZ}2@aVKx1t*O1Go{B|WJ4$@xiE#cxVrlKn_e@YVNbfqCHj(7h`63@ zO{)e_P`bM0z?~-6Ox;SO{iwy2L}e?djyZ+}ny1_apVn&+ zcSGPH#DfzobF$Z@BEgi7%{R2naK#>ZL=+P;?hacMOc155y4gbT5a$S0lmIBJZDVL@)qz=k#!-iS18#|f+P#~>nf3?Kn0 z2A6CHv@mOkaNTBMID%X@jcp*dlbKnEhWv$vNM?=2&KK+}kO;tr2vm|SlT_rglsCg9 zk*kf4pwj3AY)TDI0h)+Q^Qr(hpw}JiOZAPi)~cQ6Njx)OV>(#!Q7#$$AasJjB3bf# z>~}q;go}k^`!zc*B&E08P*8|xs0+}*Gn_vU_eszvoTeU;+KJQDiPZ45@S4!t85^rE z3=*IV%Hs2%BUzCN&ovSixZcg-`*y69g67X}j9<30wCmO;BKsrXFr>(1ZVh<)7^8xTz&TKg4H^ih9d3oh^zx*3w|t=T9eO%*r=O z=r`fU*MW6OL}k#G*8mTCH+6~&CW-Kh!cJKv0JfwXa6sJSz`7Y&W5{=Jcz2fOfa4K| zy}@4VI#TkB@KYq{#O90Z={`}LAY5dlz{58>F8Jcw2ya5cVA%n+p%yYu#8oz5ko*su z>l=lJ5y^gwMp-r36nxgK-B=GJ!AtiZq#pdj`sm%JC)h)+k&%!h60zD_?NN#(zL&rb zik&yt90EmblDW3T2`wrj+;=!4`bhk&K`p7mB8=+e;hP&pL_%lbdSNVBHT=3{7jO?M zkN{YI^o#DvT$v&Lu(?o-C0a4>=*i|D-B~suogK|)*%ss6n~}XIALVM5pq;28kuDnC z=dfK7LAomLlelfs0+EE8tttpSvBn-y9){`R-|Vcvmcv&-PCy#f&tU4T0lkD+HcA>M zyO}`YGrLznCB?EuWE9kH9tu0;Y66(4G81j6@+3lgRjBgU4h=|sL=ti52rbWt+o(mG zBhVeO(ivU=H_Z8H8kYL6!4mcMjCg%uN}fo!V=*s99LA?*j9mV6mHI%q{h-@U94Bhj zp}Vjpx*qZn3PZgbf|9;RAM%?9a_LUw<&bqFDhl`?9eeF|z_xTD19a*)Vu%>N_e{O5tVM^T zHr901Wu0Y5R}~sgi|pJEfm%2)zKk%jf+Tj8Y6cx!*eHr?0N1%HUeNZf8N@PK7G6D~ zn$ta=MokD33wyi#DQ*jSp|Zn`rpswj);PST41@VoIiRJod8I|20i}h6vLYW0TM-~Y z#DiLNA5j`!eje+D zrG~a_`Z(%OaFFa}s}^TKAVn3lLCiBmFpWbS14Iv@M%?vg7Y6(m`Xt+qs+CqHL_Az- zatG?JO;403IV~LH&!8*@9JZF;ZmV^Lg>`6b`W^nRAo0pB38>7(5Kt zc#WqxVU)gPlo7tE(;Ja-^3-WA>C)ekV#Ka`O+&i4rqvBeeF41`0Y(T4=fwH28ze0* zm1xSEeQ49!fk3oRXbMX%!S7v`#z4VU>ZpLuJ3VrUV2!km24pk@dV@XP<+}buC@DKF zLI)uhXH8cDbyNSzrVS>zPa2S{#RqEf^VVpH_Pr6MG5AdC9Wq;y}mPYnkpQ#$EK$wtU3R662Nl|%zQLM?~zPy(*A$=op-)o4{U^ppty=WqXw zB@;~(P9cLU#T`ohKE0Ve1ZCTD3{Ng#%^102&CW2X_yfJet52*HuxW^98Gc)H_%pr- zR-9e<0IXJrOFX-1(?xyNbilrH-m#1hC@1fc0>lbXw7$+H#giV#x#Gi6o~q0YD0Ttq zY8-1yI<}!Ut9p>Xrh7w3E5X6f?IpOkBoG0uNopIwHA0^;e7UcjQmD+2CJ;`g3FyQ$ zzGr?%T}_!v^!>c025ViTHy>I78Q3rZk!1T{sEvXDpnf}#vOL?a{@CPm#5kB5K28Y5?Ty;)i2p~+gE*Kvdd zUekxa1V@(z3`KBU_tcogcj>2R7+RmiexMH4SqJxJ}GnZ)DAIyFnj`rU-t zq5rB4^^z!9_@X$=I|5%{c>Mn?J~J z(15Dfx)bw3r%z=M)|$i*3>1;|O;+_bb-SHIhj)C$GMjcDBF^{2(D; zg$N>D%*@=>;MYIvV2H|eFfP<-GvB}+U9FEhmD%j_;#y>2Zrw8Rl$k466YK!Sb4|N$ zWP(7LD9VJ@WpO0OpHvlByDP=KcoC3LG)Rf^Qb0D8>qPsQv-68LmseMp=!|i! z2Mdk#DtQc~k=@f60y!h2E4>nqO%%=-97=J3u-ITvs#>)7YJ)y;ic>|BtS|9`tyKo< z$M!M?1+{_i&|C({XSFLlIa-u%lOfm3^p{<&;g|!Rf|Mg~m+(X>$VKpwLec0NzCf6c zW4DI`B?&!3BJ*xR58W#r9w~w&@1=UN!=MiT>AJ)B8mET*Tl;HMat`G6qSrUWdRk{r zjTyz0;5W$X;k(p5QgnAteD_&81WcfMzwG3(vq1tQF@wIe1Rt^hG zL)Ct%Dwo0=Esigk5`h7CZPeR8TwQ*+eevpIb8~rdeo<|P_?RCkXJ-u=HX#%hGY2b>xkh*g?M&L>C!cD0lJ;g2sj|gA=Ve@%Opu1UtzGn z%XoNgfd63X|IE028C9HK82t8kKmF_Pe>=Owx5-{j)7!|6@w)7btn0P`inLaARk<9l`X_WbPfmYb%k zV2PbE`M1`{bPUv96{;Xqw13CfPq1lPMB!s~A#a>i6TTR@e=RVfwnyOKQjEViU#8{f zp&h7YGkV&?obYI$XzN_<1J5YJYDLANvdT@}UHrs1c_`lf;p^bCI=}g_*<8Q4y10bc zfA{M09p6dXz9XP{mJOVV_Vz^ob`390jIPU;2vG{_v=71~}LlW%h?%gQiX-Y&ZO zqR}tI_y^!${x9saxBLDycl8o~J diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz index 0516f9a759fffc97a81dee4f60221eb93ef5637c..3a08de8a3609c94eb9e4146199a2b786930c12b9 100644 GIT binary patch literal 227 zcmV<90381xiwFq6K=4@t|8!|%Yh`qE0CbN*Zp0uAMfW+yKSJB7%0<6%P*c7Cu587P>Kn8^QEmZz9vY{RdK1M%4TH?A})@;+xx z!OI#(JWweY7oS}@9-Y+WNBx)HS-s)qs}yhKK!&*i_SG4|d+2RedfI}!JtPj6^iwFp1apzeA|8!|%Yh`qE0PI}LlHAC3-SaCD+H*oAMBiakqm)Kw8bMDPdB&w>1)F@<84n;87 z4Iq)oyw7tV*{^>2y6A0ftF|dj(|fb%a9LYl3{7df-mTYtVO`_hY}pUZqL|yA^^NHZ zQ}))?yfyxYiPpC1i-j$X9p<*!m}^(pnA@9SUAQ@?;z!~)%VD-G*2dL^_1nQV9lnBf zbQ^DH%Z>4UgOwa!Ut5o*Zd>2iSQTzGLytuj{kx4V4Axs>6@JA9#jTaLX>IQo{kYko z^OmnS^J3lF($=?L-AIcm0aj+Z|I*4#Mb{5=hi%q&>FqHsX8m7_3WhV*L)R6RdAq=S z7E2kbW>{mytygzm7oGLjZf1LH*2P)Znfasr#}Z#zyS})1Qv7J@d1<%K_n633LWc+b zt72(NTRbpBxix-Rdl32BS>M~PYp>gNao;X#>v~MPE}C|>yl>}r;LF{1{{h3iY`KV0>TIJ zGaVkfb|@C!tk=aukI(fWW8+rfj(29|x^~qYEX`*;JzJaF+20Ne$j7qnHf~oO4kj~4 zUYem=*vdQd&Vo1l4^@Q^kT<<8T(dOT;N$&=j@!R(Jvp*}zp%x*YgR27(aatAHdNqS z4|#&bUU@THIT1MzdVKx5sLiHtH^r(p*59Cv6r)?OesxHTo8|c@v=?Hv#-ctcwSShV zo)**9Q1=v5=+;p2A>^WpUp7r`K)HqO+6J`2sIN|JdBXlF-+z+P(_P0+PxxD$U~nNE z{@!s#1?)-FwV>D{3^}Ki(%J|!f3VQ<0(bhqc~&i<*Z6&?#p?Ntb;7Z^KVVvzZw|Zj zO|m=qxrVb54UfOb{)@`yly2eRUA zUb&`f{o3?yl`Rtctp>Z`p1c9wd_@N09nu^O>TEGLJ#LgOHY?VTbD3^qSG}$aHmf#Z z9(eBMwZg3WL6!zG$&IwDbvqwwT~OFem||hDf^k{%c1QN6_xiy%LszsOLwl#nZ(vj4 zywVsm^ewm7(FAA2koA|QS;bkyWW&<$KsRs|+oHu0U&d|L?ZS!A6kauEO-J5!lnP4L z&en}hkMX4%9-wh}&*7Uoh5D94DPFu?*TrUDg)KvGH}F9aNik%$x%KMTR07y%*Bf7w znI1g!z7Wa6y69p-NJmCJ8ES`JyT;lMo~bLIV)YFS<_0U~q8|#Y0zda~M;3D0 zwBC3MCEZh)Kgy`nSE45$83EZ{^}Y?3W0aw`>EgY7(NqF(bXkjDRNnp$ioS~@@r7}~ z(F!})G}kS~hW=o{3-X;}>FTNz{|J1r?T%hd)QiMAbX3ovEc^GMk@X8lQ9d9hPz513UR0q77iL`d|ef>!M07S1ckXf}CTUbustZY3Ok3`c`7cgZYCY2@EjJg+x z<*p$o!+fC@KCNbSXu_D)U?HMV5lS`nR_=oHo#}U=lJ{WF04cLen#$F{(U5@IYK^sU zJ~&zcJwO~lJD7Q(m5GRPXJGbh2|Ma6_1Vn(jQcdFRBqYcIE=dhgV%OLnsd_n=E{5NS-E`QtUr4^bT;}($TT?L!YD&GE3hC_Cj>z8$HJYj76Fz zQy_e!32}=!OAwyA?FuuSwL{Z?3YXpNm>~%;$tQr=X8EpzFii2+ryaET=VIcZNB>+* z9JKgnx@TCGJK6fr==RJ&2Ez!z32|-B&qf*t-ue?eXu0m^`;W7g>JnTshuY>(#~gRQ z9`_XDfw}7-Fz+z5qGyXwap`8R4^kPgsPdfFPhzGi-X7z}DMb=N7#fUpDRROr&9eE^ z_Aw(weAQKvML6{rE)nn~+FC5x5Dz%QY$uj3#;jF_$#NP{4?!7M7n? z{|~|*nZt#}rga}_p7>7mhV$rs!<-TpDl9DsRyqbKUB0Ku4g=7X;pt}guIpvVbNbqr ztSgNV)Zp46Rvk<%_P)e)X>9?RbRd9|J3i>`+rBt^arN|h^iZ&8!3+f&0;G9DgRqb2 z-Pdi?o6I+S6MX|5NFIBTC?X;%{PMJW*8nWFO+Z-ywRS-OGfuK@)m*P=K4^8YnJ1=! zlc3Adslx3CSO6SZAaBqI%mr7Q=%z0O8?Mh3kU}l^s8{R@S94@A)YKPC3d{fBxbm>(fJh74<6tpbn$LmG+~Umg0ojc>{@U3|Dt z55+}>4~0eK566Hm%{hvg4*#uLESR$p$ODmmadkcprfj<$tm116dNmU6H{$IeG)hoG z03*r2)fF%7ieR=+`6A8bR1&YN^8>c2K~xFTjRt08c4-cn4Y5|?SBKls_7HFkxn$(O z6VU%|*uRJ25u@dc?|{g90|V4SPRhWw5Y(j25tj!q4lx&czacbk-)R^)_rnalD<17X z_RC=%G8V*j@yQ6$Vw6c`88#9ZZfz&Iv6z^V-?l)t#Y+f3My?Pt+GPBvnDs~&bqgmT zn04MHO@KZ^)T%y+crcZpjFE3hReHEGKS_?nuvvvX1aqHDxLTlUadI<$1HtjZ4rphd_e*`TU zfaf5TW7H6WFbImNw@$$Su}8;JyO+>Ar@sBNW!}vCc@Wb_?>23}eCKToN3`010K5i8 z>UseRBQz`?jd=-$8Rw&9P?({*SwfSa1ORrsq4*9z#9%Q@`dR?L`t$Z0sj2>tCe!F>xL*tuOulqasew;D7%4-_Bp2|8Vic!_N#}j>0+J(utt% zv-ma*NC*KM%q(-fqST6w9fNV~#~L4f43m!YL1fm$nuO2Ar{E z4z+T-kVoa}#AGo)-K5#lJR#2%D4S!;9YCDUk?z*6S>1Fsi443oB$Z?S^WcTJ$9REr)Tk)sq_z%k&d>%{uobgU&EDK0No^vFZ3YIvh zfV{JSbGCFdQQ77wbCr_gAs#9fRrELHys1M;$g+?EujY>w9b|XLb(&g$y#q?2yv+CS z*P99~dt;gbi$1hV!8xWn!ofuN#3#QQE+-p0Uh zzpt1QB#VL#PL&5SA+XenL;P-VowyJsc=N>d`;VnsY(4?JV+FaPAhcBlOVUBe?HYbN zUWVF6Np-<%vlYE^%DB>*n_kB%KPn&Jk?8g~j!gWEcS@_y4+8(#HE_lh_3w#Snh>>M zb)k+SS&>W05#4?ig(xdp4PEp><8PVf0&lLO;Aj|)DY6<5g7GfKK54-r6bk&VG?$xa zb)+&Jt0j%d%NUTREG&J8SeKaSMI0Dm`@E%GcD+e?+_?aC5JDg&(IX^pELE7&v!oVD z4pwk#Tl^E`h>5HHdkOLX_=hb#>iwrrAJFf>(g9xUJ0S-^F`EI59@f?&UB7xgDZFX+ zD6umWc-g*bJo7Mkbbn};4ok%q&^0N}@{O8V#GGUvR{IF+sYTZoLk}B>Ct8G;qz`_~ z%z86)7tJ+cfRuY0!-5XBip72NI_9^65D5Me5F?%hh6lnTB@&h&S*DAmOO(Zep#wfR zAv*-`HZn0NS21@RsTSdTO;-`#4MAH3Q1j}Y{;#x+?xSm~UQm8W@k|OWbiad5`6VUw z+_0?HO$JUbbZ~ORG@a0~KC{tD6eWwG;{_3QGj9QJmTe!BSa!$!L!?%2HVP-TLxyoO zVItWf7Lz0b)7JeiraTU^XQzZGNTAeRy#SZW{H{Py(MoMix=F@#TwotjpC(N?c++lT zjxqRL#Zb}ZbYkLc`iW9MZX_*11U!BAV zK}Pn!_ptJ!B_i-^%a|WV`Xgoa+4GC1&o8d_zrVV?yn6B7xi%V6JI$~d=)T!g1iLiJ zDau8b?J*i$DhAU<2+dPHK9&|rJFt$83CX*fY=teWnX%ELOIS2w4qAdObG-VF&Kd%i zJTD=5LaXE}PbfGFmpIu-f0fZ3b^G=WfxYO4mA(!*ptcnMBaV&kY!L||eOXY)r14`d zKock-yQ$JQR%SvA%faCCE2U61*rnHOrDD5U8YB?Ih6`v1?IgQYA{fY;R)SSHoPCT5 zdS;fm*_stCzAnXcXpiR;LS~5DPKYS1k5(tCc4#&mZF|HbaU?T!MUDR@Jj{&86s3sO zb^fq@CscFEnuuxp=(4ha03n}VMV)yJKTSU*rej(R0qt8V;v)zSQ;V?!sjnzb>JBu0 zc8GER`@jAZr0G~FO6YGAaSYPzGNQ<%tUNg-*d|50CqgSiOh6Nc;U#T0oxGJh&TcK$ z62Cy3n&L-^_!u>b^!_j*=m}dOm9O9;n!c%YW_}}zpv)! ztVTegdLo1|A&ad>9bN=xMOz%kxIQ$KOiEADxEy2{zVr4StbP~FK7{mJd=0MMHQH(@ z{wX!-X0cXy#F8t3bTMpbZl_tFXdN_%^&_BcXKO-G^==WmOZsfg1Ed7n2qUv!!)FoI zNG*VCq!EliVV5OXD_|aXVxcc-cm)sGu9&q5bV!c$7zXz3JB?!o28hYHX7$2!s~gG| z(Jea-&YB%lmz%1N*fEC#VU>xxlej|2zh$Wvw1_`p;7g_yZePxpTX;f$Br|7q7ePo* z(c`KXUP5gZ0F7P|9t95&x)LXWL&WzAOd02TDhl%TF%k7H#V6Oa>keD9aj*8D{FEd3+Q&BP({{a8X)Y%W}+ zwvW7w8*94>(g97I{eQGIYopyMZQTtjRiI=r522qmMxB?ld$-TNj&X8Izs8gnbOSq< zoPhOy!mt?(~veCOIjG9?F$DzG2xkjuqphwM@+g_Fg7;FZnav*bVF3WEOB%q zJ!}KMbCb@Tn0RHMVL8ASH7+1nZ6H`f)(xW8Nmj?PM^u&e1sThXGO6DPxGgSX;g2{} zGIXFn3v-$~VctZavjIu7gMpB02`8OD9HZp`hDALUJ1L%+zI%6HXU(Zf{PB8F!DHyr zCB!=7>^OTSx`{&}`o3y6H4syAHRN0iojWAI1ox}`2QTdhf!!%NmOyt%P_g4WwG_E5 z**$*pxbyJys&r$C=j!bF)#I}lH~OB;rS2e&(9uT{==ik`+vQ>Jb(On|K6{Sh(Q-RKx4`+Yvz)+PEA|QmTsf} z-8T%V@*bl^)`PNvcwhy_vuRAIxgJ}W+)(kzR5FbQP$_S8=k;N08Lh}m2&P%Dr98T1 z?Ll=%E6UoIbC>~YdH$fClR(l4eJFIt>EG1nw19drALn9StTGZ#4f9}{4x&ERnZ(#m z^=d07dF;UtIXjt0rYn1&LH;P={htk!-yB!8?q$no|=8>;FB$ok4YSV6&sWE{}AGdh0YRO8?~yYl*TC9 z-f$yK|yyT z#l|AUG(j~@LQS9L>SM3_dWv6_eXy+eUS}pb7;Et5im4i`Y64KloA+^(rJwv9fSBpj zYI&+TKsKAnN?W_$ttPE0l3N?k8pT%mlg$e^bhl(3Eu-IqFOcxUNppMF{~{>hCk`Ck z`<$bsW5*X{HRR z!yx?gDiyn9M5K|B3huNuK?_yP{2TzA=&H15IikFzkDX;tXKGRa-M541Q6){@_vy|U z8iWq+B(#EJl0&_J{{6+{#~0%l*2b??&)ZTGj&X#Zmy&fg<1vkhhQnYT6c>@cnNq>n zp%}0oQ{Ul7FkutPOlMEVLnt~byaU3l=xiii8yd=zB{?DKX__(2i`g=>Z1PCNz6Bi$ z_5xx`&AUmVb1aqdbIp31@JsEYA!LY*`;iT^Qdo;;Yr<)SSYwC@!g!4!PFcCkdQqUo zg+-v^77{uGf5A`LVtO>9Ujpq*O=zH~r~6B5<{i_^tsH-0jU?<@6%HsC~;C zYciNduvzjz*gBchJROnSS@B{aAiW0bWt-wnS(RXOiY85yY3Qg~Y^PCN51C*;db?hK#f<;Z+vEO|yq`bX%cEl_r#oCyEUPD0@`4g`5KQJxTjrD5)P7q4_t2#7 z#-f%K@x(tx`yc10s;J->J0n&vT0aH20Ui>(vIrEOrBvP8^xnPI%niE-;?uuG+8Uf#OB4*aCfp2^OJvkeDUJ)_QUh5=NISq&$H5i z{3h)#M^WI`*$bBFp#ah!BL;AiM^VI@{MiJBWp(qZ6Vwj4$sRSEkyUI|Er_k5EtA|_ zmfq`g{H9ekEsoCET)`7ORa;A+VQBSr?4KE*9?ckQrX$b+3Fa)|RHYg6uW5;CzIF}U zBTV0VJ(e3fwMS;h^N|7k#9Cw&KL&1c%qZ3QLJ>8lX^)Cir%#%VUY-jZ&6-HUpLKZu z3pj-^#_S`QRnQcikG(hHT=Xw+F}|UTeLTK17DdG{Uzl_BU38 zoR_+G3oVZabfcm8n1$7AHB8GfN1S^@RxFpty^%);K$^jCO;`Mu2VfqaJ-L5&aY=b( zt%OMOj8wXGR!)cP-+S#s5G*7OL~|F9aI|b7h^TcMs%t7bWK<;9AG`es4N~-<^~;u? z{ieq~W;upW!w=h1N7puX4C*UKwVH&=q!X!Z-`Y8(Lfko!?qkeiU&9+$5t{|!OI`nX;@4C@Jp;OPkM?+dj3R@(AeR7Vkg?CDlM#h!>Snh zANl7VK&IXP{jv`81ewnpVI`glqILiIi>s@rw;!KhTwcBS;o-#(hlP)W9;b*o<6i+8 zQ~GV=78Q5jM3_ElgHq5Q1yBFnfO2YD4#j)NT|QTUXQp}Au@{|xdc*p*HpSJFhr@_F z&YQUu(F7WjAhC?(*yXxy>ekG8EO_S_QN`#Yo#G80E(hJH$t>#O(>!Q9{8Jl;*BT)o zYscZZ)&rWP=AF?Ddv%KxskNZRNSebA^AiUy@BOSplc#c0@r2h6Ug7`hj}A=(kXXwd zlTbV*!t4py-F3{GG`j<(8_#sepX0cpMLa)GatH_)8ssZKnr^>3#% zVM7^)))=FCV_?$0MC /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &" start = time.time() +import init init.main() -printlog("") +logprint("") corporization.main() -printlog("") +logprint("") + +cleaning.main() +logprint("") preprocessing.main() -printlog("") +logprint("") -topicModeling.main() -printlog("") +topicModeling.main(use_raw=False) +logprint("") +topicModeling.main(use_raw=True) +logprint("") end = time.time() -printlog("Total Time Elapsed: {0} min".format((end - start) / 60)) +logprint("Total Time Elapsed: {0} min".format((end - start) / 60)) diff --git a/miscellaneous.py b/miscellaneous.py index d1a3fa6..ab4357f 100644 --- a/miscellaneous.py +++ b/miscellaneous.py @@ -12,6 +12,10 @@ import spacy import textacy from scipy import * import os +import glob, os +from textacy.fileio import open_sesame +import json +from spacy.tokens.doc import Doc as SpacyDoc csv.field_size_limit(sys.maxsize) FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" @@ -40,7 +44,7 @@ logging.basicConfig(filename=filename, level=level) -def printlog(string, level="INFO"): +def logprint(string, level="INFO"): """log and prints""" print(string) if level == "INFO": @@ -91,6 +95,7 @@ def load_obj(path): with open(path, 'rb') as f: return pickle.load(f) + def replaceRockDots(): return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", @@ -117,7 +122,19 @@ def list_from_files(*paths): return list(map(textacy.preprocess.normalize_whitespace, liste)) +def deprecated(func): + """This is a decorator which can be used to mark functions + as deprecated. It will result in a warning being emmitted + when the function is used.""" + @functools.wraps(func) + def new_func(*args, **kwargs): + warnings.simplefilter('always', DeprecationWarning) #turn off filter + warnings.warn("Call to deprecated function {}.".format(func.__name__), category=DeprecationWarning, stacklevel=2) + warnings.simplefilter('default', DeprecationWarning) #reset filter + return func(*args, **kwargs) + + return new_func def printRandomDoc(textacyCorpus): @@ -127,17 +144,26 @@ def printRandomDoc(textacyCorpus): """ print() if len(textacyCorpus) == 0: - printlog("NO DOCS IN CORPUS") + logprint("NO DOCS IN CORPUS") else: - printlog("len(textacyCorpus) = %i" % len(textacyCorpus)) + #printlog("len(textacyCorpus) = %i" % len(textacyCorpus)) randIndex = int((len(textacyCorpus) - 1) * random.random()) - printlog("Index: {0} \n Text: {1} \n Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text, - textacyCorpus[randIndex].metadata['categoryName'])) + logprint("Index: {0} \n Text: {1} \n categoryName: {2}\n".format(randIndex, textacyCorpus[randIndex].text, + textacyCorpus[randIndex].metadata['categoryName'])) print() +def corpus2Text(corpus): + for doc in corpus: + yield doc.text +def corpus2Meta(corpus): + for doc in corpus: + yield doc.metadata + +def saveplaincorpustext(corpus,path): + textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path ) @@ -163,10 +189,16 @@ def save_corpus(corpus, corpus_path, corpus_name): parserpath = corpus_path + str(parser.lang) + '_parser' parser.save_to_directory(parserpath) + ## + # save content contentpath = corpus_path + corpus_name + "_content.bin" textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath) + #save plain content + plainpath = corpus_path + corpus_name + "_content.json" + textacy.fileio.write_json_lines(({"index" : doc.corpus_index, "content" : doc.text} for doc in corpus), plainpath) + # save meta metapath = corpus_path + corpus_name + "_meta.json" textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath) @@ -175,6 +207,7 @@ def save_corpus(corpus, corpus_path, corpus_name): + def load_corpus(corpus_path, corpus_name, lang="de"): """ Load textacy-Corpus including spacy-parser out from file @@ -207,16 +240,115 @@ def load_corpus(corpus_path, corpus_name, lang="de"): contentpath = corpus_path + corpus_name + "_content.bin" + plainpath = corpus_path + corpus_name + "_content.json" metapath = corpus_path + corpus_name + "_meta.json" + try: + spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath) + metadata_stream = textacy.fileio.read_json_lines(metapath) - metadata_stream = textacy.fileio.read_json_lines(metapath) - spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath) - for spacy_doc, metadata in zip(spacy_docs, metadata_stream): - corpus.add_doc( + for spacy_doc, metadata in zip(spacy_docs, metadata_stream): + corpus.add_doc( textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata)) + except: + # neu init!! + corpus = textacy.Corpus(parser) + + plain_stream = textacy.fileio.read_json_lines(plainpath) # yields {int : str} + metadata_stream = textacy.fileio.read_json_lines(metapath) + + for plain, metadata in zip(plain_stream, metadata_stream): + corpus.add_doc( + textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata)) + + return corpus, corpus.spacy_lang + +def save_corpusV2(corpus, corpus_path, corpus_name): + """ + saves a textacy-corpus including spacy-parser + :param corpus: textacy-Corpus + :param corpus_path: str + :param corpus_name: str (should content the language like "_de_") + """ + + # save parser + parser = corpus.spacy_lang + parserpath = corpus_path + str(parser.lang) + '_parser' + parser.save_to_directory(parserpath) + + + contentpath = corpus_path +corpus_name + "_docs/" + if not os.path.exists(contentpath): + os.makedirs(contentpath) + + for doc in corpus: + with open(contentpath + str(doc.corpus_index) + "_doc.bin", 'w') as f: + f.write(doc.spacy_doc.to_bytes()) + with open(contentpath + str(doc.corpus_index) + "_meta.json", 'w') as file: + file.write(json.dumps(doc.metadata)) + +def load_corpusV2(corpus_path, corpus_name, lang="de"): + """ + Load textacy-Corpus including spacy-parser out from file + :param corpus_path: str + :param corpus_name: str (should content the language like "_de_") + :param lang: str (language code) ir spacy.Language + :return: texracy.Corpus, spacy.language + """ + + # ckeck for language + if "de_" in corpus_name: + lang = "de" + elif "en_" in corpus_name: + lang = "en" + + # load parser + parser = spacy.load(lang) + + stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json' + with open(stringstorepath) as file: + parser.vocab.strings.load(file) + + vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin') + parser.vocab.load_lexemes(vocabpath) + + # load corpus + corpus = textacy.Corpus(parser) + + contentpath = corpus_path + corpus_name + "_docs/" + docs = yield_fromdir(contentpath,spacy_vocab=corpus.spacy_vocab,type="doc") + metas = yield_fromdir(contentpath,type="meta") + + for doc,meta in zip(docs,metas): + corpus.add_doc( + textacy.Doc(doc, lang=corpus.spacy_lang, metadata=meta)) + + + return corpus, corpus.spacy_lang + +def yield_fromdir(path,spacy_vocab=None,type=".pkl"): + os.chdir(path) + filelist = [name for name in os.listdir('.') if os.path.isfile(name)] + filelist = [filename for filename in filelist if type in filename] + filelist.sort(key = lambda elem : elem.split("_")[0]) + + + if type =='doc': + for filename in filelist: + with open(path+filename,'r') as f: + for bytes_string in SpacyDoc.read_bytes(f): + yield SpacyDoc(spacy_vocab).from_bytes(bytes_string) + elif type == 'meta': + for filename in filelist: + with open(path+filename,'r') as f: + yield json.load(f) + else: + for filename in filelist: + yield load_obj(path+filename) + + diff --git a/nomen.txt b/nomen.txt index 3793ade..e3c6433 100644 --- a/nomen.txt +++ b/nomen.txt @@ -1,3 +1,5 @@ +unicard +uniaccount kernspaltung kernfission atomspaltung diff --git a/preprocessing.py b/preprocessing.py index 26e755e..77232ee 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -27,12 +27,6 @@ with open(config_ini) as f: global REGEX_SPECIALCHAR global REGEX_TOPLVL -REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|]' -REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?' - - - - global THESAURUS global WORDS global LEMMAS @@ -41,6 +35,10 @@ global VORNAMEN global DE_STOP_WORDS global EN_STOP_WORDS +REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|]' #+r',.' +REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?' + + THESAURUS = {} WORDS= {} LEMMAS= {} @@ -185,7 +183,7 @@ def autocorrectWord(word): ############# stringcleaning - +@deprecated def stringcleaning(stringstream): @@ -225,7 +223,6 @@ def stringcleaning(stringstream): - def filterTokens(tokens, funclist): # in:tokenlist, funclist # out: tokenlist @@ -257,20 +254,6 @@ def processContentstream2(textstream, parser, token_filterlist=None): def preparse(stringstream): for string in stringstream: - # fixUnicode - string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC') - - # seperate_words_on_regex: - string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string)) - - #normalize whitespace - string = textacy.preprocess.normalize_whitespace(string) - - # replaceRockDots - string = re.sub(r'[ß]', "ss", string) - string = re.sub(r'[ö]', "oe", string) - string = re.sub(r'[ü]', "ue", string) - string = re.sub(r'[ä]', "ae", string) # cut_after # todo addressen enfernen --> postal.parser idee zu metadaten hinzufügen @@ -312,6 +295,7 @@ def corpus2Meta(corpus): for doc in corpus: yield doc.metadata +@deprecated def processContentstream(textstream, parser, token_filterlist=None): """ :param textstream: string-gen @@ -398,21 +382,22 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path") def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10): - printlog("Preprocess {0}_corpus at {1}".format(lang,datetime.now())) + logprint("Preprocess {0}_corpus at {1}".format(lang, datetime.now())) - rawCorpus_name = lang + "_raw_ticket" + cleanCorpus_name = lang + "_clean_ticket" preCorpus_name = lang + "_pre_ticket" + logprint("Load {0}_raw".format(lang)) #load raw corpus and create new one - raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path) + clean_corpus, parser = load_corpus(corpus_name=cleanCorpus_name, corpus_path=corpus_path) corpus = textacy.Corpus(parser) ## process and add files to textacy-corpi, corpus.add_texts( - processContentstream2(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser), - processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser) + processContentstream2(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser), + processDictstream(corpus2Meta(clean_corpus), clean_in_meta,parser=parser) ) @@ -429,6 +414,16 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name) + #save corpus as labled, plain text + plainpath = FILEPATH + config.get("de_corpus", "path") + "labled_lines.txt" + textacy.fileio.write_file_lines(labledCorpiLines(corpus),filepath=plainpath ) + + return corpus + +def labledCorpiLines(corpus): + for doc in corpus: + # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi + yield "[" + doc.metadata["categoryName"] + "] " + doc.text def main(): @@ -468,12 +463,16 @@ def main(): "categoryName": [removePOS(["SPACE", "PUNCT"])] } - preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" ) + corpus = preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de",printrandom=5) + + #from topicModeling import jgibbsLLDA + + #jgibbsLLDA(corpus) #preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" ) end = time.time() - printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60)) + logprint("Time Elapsed Preprocessing:{0} min".format((end - start) / 60)) if __name__ == "__main__": main() diff --git a/testra.py b/testra.py index d1fc357..62007e9 100644 --- a/testra.py +++ b/testra.py @@ -15,11 +15,26 @@ start = time.time() import enchant from datetime import datetime - +import os import xml.etree.ElementTree as ET +FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" +from miscellaneous import * + +# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &" + +parser = spacy.load("de") + """ +# load config +config_ini = FILEPATH + "config.ini" + +config = ConfigParser.ConfigParser() +with open(config_ini) as f: + config.read_file(f) + + PARSER=spacy.load("de") @@ -48,13 +63,74 @@ def makemeta( testmetda): yield metdata +def corpus2Text(corpus): + for doc in corpus: + yield doc.text + corpi.add_texts( makecontent(testcontetn), makemeta(testmetda) ) - +corpus_de_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/test/" +rawCorpus_name = "de_test_ticket" print(corpi) + +#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name) + +#textacy.fileio.write_file_lines(corpus2Text(corpi), filepath=corpus_de_path+"plain.txt") + + +dict = {"unicard redaktionsteam": 189, "kms": 131, "itmc_st\u00f6rungen": 17, "benutzerverwaltung_probleme": 168, "mailverteiler exchange": 130, "beamer": 70, "cws_confluence": 190, "benutzerverwaltung": 26, "sos": 166, "virtuelle server": 116, "sap": 7, "wlan": 21, "lsf": 6, "gastaufenthalt": 8, "umzug": 5, "firewall betreuung": 129, "ausleihe": 39, "fiona": 10, "kursplanung": 195, "schulungsraum verwaltung": 200, "plagiatserkennung": 32, "designentwicklung": 100, "ub basis it": 184, "tsm": 51, "backup tsm": 110, "raumkalender": 174, "veeam": 149, "linux bs": 42, "hochleistungsrechnen": 90, "e learning": 37, "h\u00f6rsaal\u00fcbertragung": 52, "sophos": 88, "service portal redaktion": 182, "verkauf": 93, "fk 16": 30, "campus app": 54, "dns": 71, "kurse": 196, "itmc schulungsr\u00e4ume": 96, "leitung": 91, "telefon": 14, "housing": 135, "softwarelizenzen": 35, "hcm stammdaten": 68, "semesterticket": 197, "exchange nutzung": 33, "mediendienste": 167, "sam spider": 172, "pvp": 27, "webserver": 29, "werkvertr\u00e4ge": 158, "ibz raumbuchung": 177, "webmailer": 126, "unicard sperrung": 64, "cd dvd produktion": 114, "lizenzserver": 92, "pr\u00fcfungsmanagement": 38, "blogs wikis foren": 87, "unicard ausgabe": 161, "pools": 157, "desktop & basisdienste": 144, "antrag auf rechnungserstellung": 193, "mailalias": 121, "evaexam": 133, "neuanschluss": 0, "mobilfunkvertr\u00e4ge": 69, "ftp server": 191, "haustechnik": 77, "raumbuchungssysteme": 186, "confluence": 181, "uniaccount zugangsdaten": 47, "itmc medienr\u00e4ume ef50": 171, "dokoll support": 128, "elektronisches telefonbuch": 3, "softwareverteilung": 153, "overhead projektor": 104, "sicherheit": 145, "itmc_als": 48, "itmc pools": 160, "zhb": 60, "serversupport": 101, "veranstaltungen": 61, "fk12 webauftritt": 138, "hardware": 142, "unicard produktion": 156, "telefonkonferenzen": 170, "dhcp": 188, "zertifikate server dfn": 139, "lan": 1, "datanet": 49, "neuausstattung": 173, "moodle": 16, "abmeldung": 13, "uni mail": 15, "medienr\u00e4ume ef50": 117, "verschiedene aufgaben": 40, "zentrale webserver": 75, "vorlesungsaufzeichnung": 152, "grafik": 132, "campus management": 72, "hacker angriff": 46, "pos": 23, "zugangsdaten": 41, "serviceportal": 63, "ews": 24, "voicemail box": 150, "service desk itmc": 74, "test": 180, "beschaffung": 57, "bestellung": 185, "vpn": 55, "app feedback": 66, "allgemein": 134, "rundmail": 105, "telefonabrechnung": 199, "limesurvey": 31, "unicard": 28, "eldorado": 140, "uniaccount": 12, "plotter": 125, "mdm mobile device management": 120, "namens\u00e4nderung": 43, "sd": 84, "basis applikationen": 103, "\u00e4nderung": 194, "fileserver einrichtung": 187, "fk14_test": 154, "werkst\u00e4tte": 179, "itmc_aufgaben": 45, "formulare antr\u00e4ge": 81, "facility": 192, "web": 169, "asknet": 136, "server storage": 113, "mail groupware": 20, "rektorat -b\u00fcro": 178, "office": 50, "werkstoffe lehrstuhl bauwesen": 59, "telefonzentrale": 115, "verwaltung": 4, "netze": 22, "beantragung": 82, "d.3 dms": 148, "redmine projektverwaltung": 141, "wsus": 106, "lido": 118, "rechnerr\u00e4ume": 143, "matrix42_hilfe": 18, "boss service desk": 44, "konteneinsicht": 62, "spam phishing": 53, "forensic": 164, "fk 12": 11, "benutzungsverwaltung": 198, "redmine": 79, "basis app": 85, "viren": 95, "fk12 migration": 155, "raumbuchung": 109, "virtuelle desktops citrix": 176, "outlook_einrichtung": 123, "kundenserver": 137, "nrw ticket": 80, "weiterentwicklung": 127, "siport zugangskontrolle": 98, "e mail dienste": 99, "vorlagenerstellung": 36, "video": 19, "studierendensekretariat": 111, "it sicherheit sic": 86, "boss": 25, "technik": 58, "dokoll pvp": 112, "betrieb": 2, "v2 campus app feedback": 151, "mailverteiler": 108, "videoschnitt": 119, "fk raumplanung 09": 9, "sap urlaub": 73, "keine r\u00fcckantwort": 124, "prozess- und projektmanagement": 67, "dienstreise": 34, "webgestaltung": 78, "schulung": 175, "software": 89, "medientechnik": 76, "servicedesk": 107, "service portal": 94, "software entwicklung": 165, "uniflow": 159, "ub_st\u00f6rungen": 162, "fk15": 183, "uhren": 83, "entwicklung": 163, "videokonferenzen": 97, "itmc webauftritt": 102, "joomla itmc website": 147, "changes": 122, "visitenkartenproduktion": 65, "lizenzmanagement": 146, "tonerb\u00f6rse": 201, "arbeitsplatzsupport": 56} + +list = [(key,value) for key,value in dict.items()] + +list.sort(key=lambda tup : tup[1]) """ +""" +from spacy.tokens.doc import Doc as SpacyDoc + +filepath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/de_clean_ticket_content.bin" + +# load parser +parser = spacy.load("de") + +corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/" + +stringstorepath = corpus_path + 'de_parser/vocab/strings.json' +with open(stringstorepath) as file: + parser.vocab.strings.load(file) + +vocabpath = Path(corpus_path + 'de_parser/vocab/lexemes.bin') +parser.vocab.load_lexemes(vocabpath) + +spacy_vocab = parser.vocab + +def readCorpus(filepath): + with open_sesame(filepath, mode='rb') as f: + for bytes_string in SpacyDoc.read_bytes(f): + yield SpacyDoc(spacy_vocab).from_bytes(bytes_string).text + + +textacy.fileio.write_file_lines(readCorpus(filepath),"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/result.txt") +""" + + + +# load raw corpus and create new one +#raw_corpus, parser = load_corpusV2(corpus_name=rawCorpus_name, corpus_path=corpus_de_path) + +#printRandomDoc(raw_corpus) + + +""" +spacy_doc = PARSER("test") +save_obj(spacy_doc, "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl") + +spacy_doc2 = load_obj("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl") + +print("Doc: {0}".format(spacy_doc2)) + + jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/" @@ -63,6 +139,7 @@ laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_h with open(LLDA_filepath, 'w') as file: file.write(json.dumps(laveldict)) """ +""" def load_corpus(corpus_path, corpus_name, lang="de"): from pathlib import Path diff --git a/topicModeling.py b/topicModeling.py index cd35eac..5c2e01b 100644 --- a/topicModeling.py +++ b/topicModeling.py @@ -31,13 +31,21 @@ with open(config_ini) as f: config.read_file(f) +def label2ID(label, labeldict): + return labeldict.get(label, len(labeldict)) -def printvecotorization(de_corpus,ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True): - printlog(str("ngrams: {0}".format(ngrams))) - printlog(str("min_df: {0}".format(min_df))) - printlog(str("max_df: {0}".format(max_df))) - printlog(str("named_entities: {0}".format(named_entities))) +def generate_labled_lines(textacyCorpus, labeldict): + for doc in textacyCorpus: + # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi + yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text + + +def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True): + logprint(str("ngrams: {0}".format(ngrams))) + logprint(str("min_df: {0}".format(min_df))) + logprint(str("max_df: {0}".format(max_df))) + logprint(str("named_entities: {0}".format(named_entities))) # printlog("vectorize corpi...") vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) @@ -48,19 +56,20 @@ def printvecotorization(de_corpus,ngrams=1, min_df=1, max_df=1.0, weighting='tf' for t in terms_list: print(t) - printlog("doc_term_matrix: {0}".format(doc_term_matrix)) - printlog("id2term: {0}".format(id2term)) + logprint("doc_term_matrix: {0}".format(doc_term_matrix)) + logprint("id2term: {0}".format(id2term)) -def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda',named_entities=False): - printlog( + +def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda', named_entities=False): + logprint( "############################################ Topic Modeling {0} #############################################".format( topicModel)) print("\n\n") - printlog(str("ngrams: {0}".format(ngrams))) - printlog(str("min_df: {0}".format(min_df))) - printlog(str("max_df: {0}".format(max_df))) - printlog(str("n_topics: {0}".format(n_topics))) - printlog(str("named_entities: {0}".format(named_entities))) + logprint(str("ngrams: {0}".format(ngrams))) + logprint(str("min_df: {0}".format(min_df))) + logprint(str("max_df: {0}".format(max_df))) + logprint(str("n_topics: {0}".format(n_topics))) + logprint(str("named_entities: {0}".format(named_entities))) start = time.time() @@ -98,13 +107,13 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel=' print() for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words): - printlog('topic {0}: {1}'.format(topic_idx, " ".join(top_terms))) + logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms))) print() for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic): - printlog(topic_idx) + logprint(topic_idx) for j in top_docs: - printlog(corpus[j].metadata['categoryName']) + logprint(corpus[j].metadata['categoryName']) print() ##################################################################################################################### @@ -112,100 +121,142 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel=' print() end = time.time() - printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel)) + logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel)) -def jgibbsLLDA(de_corpus, top_topic_words): +def jgibbsLLDA(corpus, path2save_results, top_topic_words=15, add_default_topic=False): ##################### LLDA Topic Modeling via JGibbsLabledLDA ############################################## start = time.time() - def label2ID(label, labeldict): - return labeldict.get(label, len(labeldict)) - - def generate_labled_lines(textacyCorpus,labeldict): - for doc in textacyCorpus: - # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi - yield "[" + str(label2ID(doc.metadata["categoryName"],labeldict)) + "] " + doc.text - # build citionary of ticketcategories labelist = [] - for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): + for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist): labelist.append(texdoc.metadata["categoryName"]) labeldict = {k: v for v, k in enumerate(labelist)} - n_topics = len(labeldict) + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic + if add_default_topic: + n_topics = len(labeldict) + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic + else: + n_topics = len(labeldict) # + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic - jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/" + jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/" LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root) - - #printlog(str("LABELDICT: {0}".format(labeldict))) - printlog(str("LABELDICT-length: {0}".format(len(labeldict)))) + # printlog(str("LABELDICT: {0}".format(labeldict))) + #logprint(str("LABELDICT-length: {0}".format(len(labeldict)))) with open(dict_path, 'w') as file: file.write(json.dumps(labeldict)) - #for line in generate_labled_lines(de_corpus,labeldict): + # for line in generate_labled_lines(de_corpus,labeldict): # print(line) # create file - textacy.fileio.write_file_lines(generate_labled_lines(de_corpus,labeldict), filepath=LLDA_filepath) + textacy.fileio.write_file_lines(generate_labled_lines(corpus, labeldict), filepath=LLDA_filepath) # wait for file to exist while not os.path.exists(LLDA_filepath): time.sleep(1) - """ - printlog("") - printlog("start LLDA:") + + logprint("") + logprint("start LLDA:") # run JGibsslda file FNULL = open(os.devnull, 'w') # supress output - subprocess.call(["java", - "-cp", - "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format( - jgibbsLLDA_root), - "jgibblda.LDA", - "-est", - "-dir", "{0}models/tickets".format(jgibbsLLDA_root), - "-dfile", "tickets.gz", - "-twords", str(top_topic_words), - "-ntopics", str(n_topics)], stdout=FNULL) + cmd_jgibbs_java = ["java", "-cp", + "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format( + jgibbsLLDA_root), + "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile", + "tickets.gz", + "-twords", str(top_topic_words), "-ntopics", str(n_topics)] + subprocess.call(cmd_jgibbs_java, stdout=FNULL) # ANMERKUNG: Dateien sind versteckt. zu finden in models/ # twords + """ subprocess.call(["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]) - ##################################################################################################################### - printlog("") """ + + cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)] + """ + proc = subprocess.Popen(cmd_gzip, stdout=subprocess.PIPE) + + process = subprocess.Popen(cmd_gzip, shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + # wait for the process to terminate + out, err = process.communicate() + errcode = process.returncode + + result = subprocess.check_output(cmd_gzip) + + #result = proc.stdout.read() + result = proc.communicate() + out=[] + for line in result: + out.append(line) + """ + + output = subprocess.check_output(cmd_gzip).decode("utf-8") + + reverse_labeldict = {v: k for k, v in labeldict.items()} + result = [] + regex = re.compile(r'Topic [0-9]') + for line in output.splitlines(): + + findall = regex.findall(line) + if len(findall) != 0: + try: + index = int(findall[0].split()[1]) + result.append("Topic {} {}:".format(index, reverse_labeldict[index])) + + except: + result.append(line) + + else: + result.append(line) + + textacy.fileio.write_file_lines(result, path2save_results) + ##################################################################################################################### + logprint("") + end = time.time() - printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60)) + logprint("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60)) - -def main(): - - printlog("Topic Modeling: {0}".format(datetime.now())) +def main(use_raw=False): + logprint("Topic Modeling: {0}".format(datetime.now())) corpus_de_path = FILEPATH + config.get("de_corpus", "path") corpus_en_path = FILEPATH + config.get("en_corpus", "path") + if use_raw: + preCorpus_name = "de" + "_raw_ticket" + else: + preCorpus_name = "de" + "_pre_ticket" - preCorpus_name = "de" + "_pre_ticket" - - #load raw corpus and create new one + # load raw corpus and create new one de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path) - printlog("Corpus loaded: {0}".format(de_corpus.lang)) + logprint("Corpus loaded: {0}".format(de_corpus.lang)) - #idee http://bigartm.org/ - #idee http://wiki.languagetool.org/tips-and-tricks + # idee http://bigartm.org/ + # idee http://wiki.languagetool.org/tips-and-tricks + # idee https://en.wikipedia.org/wiki/Noisy_text_analytics + # idee https://gate.ac.uk/family/ # todo gescheites tf(-idf) maß finden + # todo pro model: gelabelten corpus, ergebnisse und labeldict speichern + # todo topics zusammenfassen + # frage wieviele tickets pro topic? + + ngrams = 1 min_df = 1 max_df = 1.0 @@ -213,19 +264,44 @@ def main(): # weighting ='tfidf' named_entities = False - """ printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting) printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting) printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting) - + printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting) printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting) printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting) """ + if use_raw: + resultspath = FILEPATH + "results/raw" + else: + resultspath = FILEPATH + "results/pre" - jgibbsLLDA(de_corpus,15) + top_topic_words = 5 + add_default_topic = False + path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic) + jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, + add_default_topic=add_default_topic) + + top_topic_words = 5 + add_default_topic = True + path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic) + jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, + add_default_topic=add_default_topic) + + top_topic_words = 10 + add_default_topic = False + path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic) + jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, + add_default_topic=add_default_topic) + + top_topic_words = 10 + add_default_topic = True + path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic) + jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words, + add_default_topic=add_default_topic) # no_below = 20 # no_above = 0.5 @@ -242,45 +318,44 @@ def main(): topicModel = 'lda', n_topics = len(LABELDICT), corpi=de_corpus) - + topicModeling(ngrams = 1, min_df = 0.1, max_df = 0.6, topicModel = 'lda', n_topics = len(LABELDICT), corpi=de_corpus) - + topicModeling(ngrams = (1,2), min_df = 1, max_df = 1.0, topicModel = 'lda', n_topics = len(LABELDICT), corpi=de_corpus) - + topicModeling(ngrams = (1,2), min_df = 0.1, max_df = 0.6, topicModel = 'lda', n_topics = len(LABELDICT), corpi=de_corpus) - + topicModeling(ngrams = (1,2), min_df = 0.2, max_df = 0.8, topicModel = 'lda', n_topics = 20, corpi=de_corpus) - - - - - - - - """ + + + + + """ + + if __name__ == "__main__": main()