From fff1e5d0fd508094c63634e8a656e21ae3dca4ea Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Tue, 12 Sep 2017 14:56:11 +0200 Subject: [PATCH] pipe effizienter gemacht --- java_LabledLDA/models/tickets/.others.gz | Bin 76 -> 76 bytes java_LabledLDA/models/tickets/.tassign.gz | Bin 254 -> 238 bytes java_LabledLDA/models/tickets/.theta.gz | Bin 137 -> 127 bytes java_LabledLDA/models/tickets/.twords.gz | Bin 2277 -> 2124 bytes java_LabledLDA/models/tickets/.wordmap.gz | Bin 697 -> 650 bytes java_LabledLDA/models/tickets/tickets.gz | Bin 780 -> 700 bytes preprocessing.py | 3 + test.py | 229 +++++++++++++++++++--- 8 files changed, 201 insertions(+), 31 deletions(-) diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz index bd57ad4c34fb969218cc1370b99f93a982c8858d..3dd33a7e1b8f3cd2526c2f981b0da1d7ba6aa66b 100644 GIT binary patch delta 27 icmebAnP4U+7GpV6b)~Drvc^V6aR&MQ+NbRp7#IL{(+A!F delta 27 jcmebAnP4WiEXH!C>PlCKWsQxD;tVe5FV@&GFfafBha?F4 diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz index 07bf22c53ebfcdc99f920dc8e1f5bd50bba199db..f272c908263a22ff60ad90633104173942ed25b0 100644 GIT binary patch literal 238 zcmb2|=3syT)$_Y29(CYg4U7-eXxYhQzxNMw^eodCQq9hu&&%V^-Z(S&J}V21+>km=!~Bs8(ZMMX)l6UgKW>~IoUB+#niq6lQVnJg0Q z&2Ukcy4aDv@B63!-)nc(fBR;>bI(3!m!6nMDjt)RcJo}ejLyxi-fr;JF8TNWb-Me? f9={QN3xyjm?_RO@Q+;4etgrObiSFA|quV literal 254 zcmb2|=3sz;(EigKTMT#(ybE#U-1ykv=YQ+o6T1_?$*y5c;*>bQBennC%<7!}_nq&b z%-g#BhIB)2V(blZmV*Mgz?8G=&6o)`h9x z+N&~k$t15JPp!_`J5CmTGkSL@LV`OvG3NBVoiF?EX*)ErW-Vr$AYSE{&18MP%Rx|s y>8du18l$AVSMco3nZExmd(+IqH=jG+@#n~r>r0a_87h|jW%TRbaBM3R0|NlYrflT^ diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz index 8dc11ae31b7722545aa4ac956f6261d544895888..2a09ec3d979aad0ce759a8640066a9d8c4b28f2a 100644 GIT binary patch literal 127 zcmb2|=3sz;+zH-X2Mjn^zuUU{2mG;Jy!ZdXTUQ*IIS=WuNxJNM$+TRQy9(woIdTlqUXFZF;%eQXYYOABGw$kq1Rt~QY fG3i%=-EU(ZF5XbFOPjkpM~VcDR7^3yU4uyVV(?1y|#ox*Q&VN`7OfTLAz75LZ5~ diff --git a/java_LabledLDA/models/tickets/.twords.gz b/java_LabledLDA/models/tickets/.twords.gz index 608a547c7a746095503da32eea64044445ca5994..b2ba4722e98f1df568bf35e03283d8818b04f79f 100644 GIT binary patch literal 2124 zcmV-S2($MeiwFP!000000F7EplIl1RT=zT0EN3%>4d&PTxxu@D3<5OxLr8LW1ecjj zpJ7(L<6+D%r%#ZL&+&+g-BV#lPP2 z$A78JuuQ7LYH`LwbCc?Dk@}V`PK%?GSj$SCtKy^7=5Wc_R7ZK2x)8*|ROV7&tAY$J zD(gmLnJCOn%fsa+v@MvD%-jM6S=?EkN-dRj>&t@qn8{OF6x4`1h{kDU%!HfxP+3-z z`3qS&M_l|WV}ceOOGCC{f&V_s^zl_4$smzhsWKY)v#xG8S)8ikY8A8H269Wg{IPWU zk5V0EE)*HxxtvyqOHhG@)ZRZ`R`G~wQRT!isoj=-mfB{`0bp6%CT1aW`LPIZMO05@4u36B4b5X2f?V~RO(#Rp>&CcM|{fExLjlto~29o$Crx;mTPe&5@TH! z6qYV*RlSi|b}aAp@xj8pJXV=VFORR{tj^ckc(#VKp>RgrGN(&D)Q|7SzjTU|xK^2c zd@*ZkH&wla2pG|71dMPxCnn33q1)5U?Xn6>uP%s%GKP(oIuljuG}!nu&sC;uAbo}u zyAp+)F3(!Hd1PI$3uSFvwuriL-+(KNwM*^#e24OaSs{$6j~5Xiv^*+%7Z)~ua;|7F zj>4s4Dhd{htURl5mS|b$YdDe4!qqpDtj%6Rvr$h6{~y6@jl0>zfNY4stjOU{3Ix;aNqkMA=#;_;o0YDQg1ez?ejuXD<3DX4U;Jv0m4jxdS(*Q2Dc0ym(=NKn*5BtCJAyPt(4K!V~CAK}qHJc1e?I)ZWv zj-Ymjjj*hPMm!IQ1wAl=QV$osK0Pc5BPzms*MOoxQgoOolr{|{!g<*uB4)kmZkcpu z4-esT;K89E6c}>6TQCG=6D)*XY@yJr_>lRhK!~}};~*&aU=Wn|D2V4KN%$5^@M6H6 z9uW|hcL0cID=qFF=S7C~=m$qp@noA|ACd+mDTP*Nr>g@#Bn}qY1f`vwJ6q&f)HfZkHU=XHl+R`DTJ%)L?A6 z*d@#LGKn{FvR<#Y(Z&a3MB0NfB7Op6L~jnph}r6eO!ajD%#RqNO2u z1#Xz2P#*AF*pZP$?$MDyIZ-?)BPGf9wIL;gY17e?;nZq~NtVGaHOW0Ofjaw4PqKpH z5tKhFL-g>sBqc3(9!<#{ZZQ^?wl|^G`{c&6inp6MiZ;vLeiiNFWWrZJ3p;i$TE$7S zStrqM7p?0b{*yEy^xmWaDSDI!)WEAWpyfYH1LDcMG$0>2Kp+G!(~z^{(=;G`ewzke z^&Y1|LuRkjfR6t>4LLe{pT@1|pIZTq1^M59qK3@(->9MC&yUowc|6_xXKHX4ymxBc z_5Pt6*PzjzK2-xM{aZC;9eAt;b9vf(Wi#>SD;VK-U*XPr?N{j4k)3kC0gD2v_=b)x zSX6mP4;I@nZNdVbNuQPY;Wb)A8i7@YRx6MVy;e3iO`J)$6&(z}-HOJ~`mI1^+Hgg6 zC(v<4wP0wu;)2^Zz1YSR*|e|v>SR!FC!JNaRefufroq@-<(bkqS5Y<&-Bs*^;q?*r zSJ8=|HdtR)5q5ru7Aq`O(;h4Gbl``=fnrNMwOO$vUY`{xOdG90!RxfLjHHF7Cz4q? zpw3Uatw>>Px3V1O<(xyhdB+t6R0R@n-g6Cn!l(uG_b51Q+>E}5Gei2GlzV=tC-gn?jnMZ{b)fH`rT+uvzy$>pGXMY> Cd-+@d literal 2277 zcmVB?#9z5p9Wb>PM_4dspPx&qvZ@ifodr=%O z;xJtKQ4j~g+c($ik|${;_O&_m!o?z31%5bR#mi{b|GUi#P6Mt*ag==XbY!v=T+*OB zUNKQ@p3ZM|pDNt$XmPpuEE%FwhL4+}7BHKE!w&VSd{ z=7|P;&yx+$Sdzm3XW&OJnJ9ncvP_jbQAqBp@`P;H8F1q}DsdgW1vnk)nHJ3c8bBTins1&9Z>g&oxU(hr^#&`Td!iTl1Tj*bB6(TiisTwbmr zdnWb;!kqt8DPPnBMTi?-unbz?OTJCYlgcVM4dHCQNmpl{ylO2KNtY0=tT8+SLz18B z^8i>MD=8}I%({8?Gw%Oysd_I}(m4|9dmRtw($R50zX<4>#;I)mq%iwrf2bEKUTtbb zA}Mf^svhel;_k7@(M(`{25(vJ4O6Q?o)H8_iLkUlg-Jv38|YETVjEm*ZX)!auo;Gb1eNCv}p!?D*wcRrRvna;OA>ss2-bJkbh-xF-7E}#nKNJ zUKmD-%EeP2rXce;h!;_~h?ZgC$IDgNQz_aw@C(}B9wfA}Jx6HcaEt(Z%R!aZ4_0$; zUSI6uC|Jxp=15sO+DO?Q6v$(8SG=M0-^-diwO-RgJBx9KmeGifhgW1JrKZJMG&qN zLlcBYMg(B`I0N&kA;F8pL6JXYNlS`Aw zi8S0zE~JumH*q)-PK3IfICoWG)pwKm)i`EqhhNUal>*MAXtnf~QBCtlA)`TE2xw~; z0owZb4H(p-+1Km;(2@cI(%&=YYH~pk%;S|hqJpL0<~wO=vYoU{a-Bq>^d!MLz1+Rg zW;3-&0GU{{FPgL>O;l2mz-xy`{yhJb#rY*g|K8`@4J%481huuLycll4#26JhfY$fw zN})(61H(2Vgj)Ejsb!fIn&(Ur59eQ-Lqiuq6CdOZxxOgXrd^^C)*%M@ajTfJWMw76 zza{Fkqh<41IQQoZKaOH=q1cUDv<$8jgO)+JTeJ)fyOqkIxs?cAem7)7)J!X_p~-3)5m-&_4J{H&enaha z3LKh?7UA#|c6b1m6bCSd8V3-AV(Ri^r`X|;bfO*HGPbaTLApcxVWZxm1*8Q$=jgSD z=Z0t*4`2)>5ALarm5v{gfO2j=HYN9KX8$t0^(Sj#~DJ!BFG|D1v)OVi6mA)c->RuE_ zp`uOxD)fRdSXi&s)R(32tBXeL)vz#JPM?xdn8{-@3Pe07qhO;4Wfa7GQbrEPd}2rT zF}ymX%;vi@%5J_qqwL+=GujK*`#uLmKu;P^)ExhJkJKoL^-PV9=;W0l=xPT0AYQbQ z3Dyh4o$6w*zenDw~t@Ak;5M9i?n&Dt3toqwN(n% zD+pvCL$Q+dL|>SttfOisxq`+8O|l)Mrb$k;FKdD+?C=1_#Z3xh*EdNYnl)(XN23M` z;$@UU>HTC2JFtqOLn|(icW4j9nY0H{7l~N?Gj+DcG%C&G&Uu zm?y7VOVeuU1&_dx^iLDWJ+*(EfST_gCs2TKK^9Ew!qTzJve2nSWtLoRW^oquXxC?9 zCU+&;%h7_z#;8zB#-OX!Qbwa(OCL*i#g^XxaM2b{-Kg7An7(vN56Z6I(zIqM;JVs? zw8Sjqk|XRYxsIf>i@7vk=<2!TmfV+g!A3Jwb;&)y70+q1!{CWX3Fyu<#kGY@D-RGw ztMTL-94ZyGXZlhF8M;o*cTOH>j-}|kxY}6LM^1zj0lGfEwgBt@IX!Dw&ocl3rbJ&+ diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz index 4df13f88e34049fda3154d66062b000633318c9a..dbc5fe61f197d0654ac079820884fb4d4157b5fc 100644 GIT binary patch literal 650 zcmV;50(Jc#iwFP!00000098~$lA|yTy!$14nm-8;#EUIcvYb@$nZ4usJ#&Nu{Zs}|Onj!~d%~9?}>v&5@n0RLuvM_LFU7IXH znjscQ>ZUy2Hc}IOb3%q#z}tb9qal|CfQjcK;t#xJMIa%4qkBxLNDyv-LbX=wl|7Zp z;VUdW9M3j!puspMFTQBoHkP@`YO$Q~PDa7P3!-`)Ia5p*)X>J;bw(}QAE0sKE43W% ztmx}e)Y2&gcy@k5nfq-uN{(Hl%tbSKT@ZWlP{ESXC4@MhD5ErmYG_!@nMZGKD!Z$a z%hVDLx8o^GTvmT5o-0V+5~9G-<8}N&c&4=_JDzreMNc64IiJb+nmMEhIr|N`tSBV0 zf*Nb0HsBoAl5ajb8o2jI`TIDa}2tf{mLlh8C~Y;enG zOx0(J3VQKQ29m#>3_#}NwRrOJ8oaJkYd78DL#N0crt1nY&v>9EL|E^bsm{;MCalr< zK|_CYk0+6$C~u(bb*?c&CKQpDu|0sw44E-xRAW6ce(j-QTJ#*?N^RbKA;lr0`5Oodgwj!lP4d^+iiuyJTwtuB?wy(6&9(}?bcJq55WAy-@ zRZMQPuwnz0pk)XH)!a^4c|EqvFrWbv{dAR?BP~QlD0Sjwblhp(O09>0pegFUf literal 697 zcmV;q0!IBGiwFP!0000009}(!a@#Nrh4(y3SGj}p+f06%Nt|R2S+LBC0yQ9+%;+p# z`V3uVr;GK0vfHF9Q3MHm4|rU2;B6Fl&xIsycA=!?R+5#vlsN=ew`Lu@yhXJzb?(B8 z`pqsE>D)O_tU}qXn#?}f)EowQf1|Z+0VTmP!;(+zTs7evg_1Xs~-RCa+8o8b$l`eT-@T&O&kUve2#*jYuQE}DEr=B zQeQ%z_j#!RFlliD5Zn2qGm($fx+fdSJA2@tQ$~E1Mk}`5kui`Q*Bwch-24j~lik9o z5}|8n%B{~6xUJ(gQnf6AUsWf61gyB9GQ0CQ+5FcoXFre z7hcLyv%^p4yWX0FzTlSllxl}f`r&Yu*^u=Ak-1Y!KD8G8Hn}x&1oynt3E5x_m>837 zjNw6Jr-Y`nSzCP0i{+Qh)NJcmZ)4{L7p2-uXfQ{oWXAiu>n=1k{tY0N5~`pEXl@yr|K$cYavxss@qzR-(&Wu%2lpiy5>lk60RJ5cR)=@`Y!9ci zmuFLQ-{;?&=6&^gY&Vp0vR4#1E$VL~`f7_a7!BhgxB3KZ23p3FJrLdF{ITh4_r;y- f32;)zV}zMU^DiwFpimA6>}|8!|%Yh`qE08~^x)}t^C>~jj2TjlM~aq?0K5a7i&JXucO z<1*Xi3}2Owi|xpS&89Q5JkpFtT)tl&MRjreYB)IdNreWN7xwMQvV?Pg?mRnXi9X$C z$?X`B(DBNWNL|B`Wh!ZKsVHKOQeK4HaUs!!Zw|;v>hQKMSRD7Wl-Hio(-DlBIg*aV)T)}ncm?-j|k(GksXP^36B3W&ayK|Gl zl+h`~*d8b))VZu^I;pYu)?hcUrc9i>5~vxsM;iPv*~4VH3CT-96gaXyw?7E?G?k#+ zLo1lK1WNC7O~zNxE_g_~-+<4ALdj>)V+m9moZU3X=!4hV@;?HTTVx)DaK|?7z)j4d0Oem5{#0CK>GGrFl zQ1-cEd|OYKU)Q(5hY3!XpI2AlBdPYzRQlD{H5s_q*uL#DZeNnON2nSICXO-Ifr#y6 z;Jm9k%_yBF3NY8=0=CqZ^7c9?EJVL<&2a3aZ zJ?m90ORkXwRsNP7_Zv8I^N*#+UM~y_b-MMoI&$2wPv(S+ev?Z!SM+3^ zL$zzl%~ZM!k*Bp6&923#ogy#S<}TPJH)I`<#a4K^Zb_0Ku({c%%{vzI={ZPpz=@5^ zoB-3zn`HI@NoqwmF`vFSEf3MHo2_Q0Wpg6AZP~A~uaiZ;8N8-|=)Xfcr+gWe8~!5MKWmlo1ogfcre_q2A+u=M6i>=UI{ z!f&pVx#uFH5{lZW*l=-$ptI04*4Y$1eop-=NJGii=9ud2Qol)K1)%gLB7kPqzgDWo zHI?Q@yZX!)Tn;JWd>&gZHs0~ABR{Px@>Mkb9}tduYeIa&k)ZN{aA{Kq_}ZKyV8?Wb z907lnBiae@0RLU3clZxOr^c&A37An^G zsccoNq7V_0-p0uP?-sJCfZHsCSOxj|A3kQEqK&1Ve(B5t$=!s13;T909g4!0m@QbR zP|#NrDj_tIb>h5DX`IBd4i&4xD(9iH9vA)KZBLDkSB&cV-L5$jN4WqQb8eAjV(-x7 zEv3ucG-|&=cx_)R{bsq|d-(BaZm-Yx9=<=C)5XOBoKB5~0g`f~aY|1=wColVOHBQ1 z>XgXvJ3JMU@~YGE^16QputIV7Jz>-Yx{t}u8H&7B+(qp(DsP{kX=ZC`sk3U3t?!XM zfUvL0!8udP@m<@_+TvCxy3!{?8*hI0FB1FBFl(V)QcO`t+Ho9}It%E5E3ZuI+U3rIfZNRaKPBDl0$pp7NL~YYckvW3jhHB{{sL# KmWVT}1pok9-HHwX diff --git a/preprocessing.py b/preprocessing.py index 7dda81c..26d8261 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -199,6 +199,9 @@ def getHauptform(syn_block, word, default_return_first_Syn=False): return w return word # zur Not, das ursrpüngliche Wort zurückgeben + + + def label2ID(label): return { 'Neuanschluss' : 0, diff --git a/test.py b/test.py index 2fae1c7..8c89e07 100644 --- a/test.py +++ b/test.py @@ -1,17 +1,42 @@ # -*- coding: utf-8 -*- +import csv import functools +import os.path import re +import subprocess +import time import xml.etree.ElementTree as ET - +import sys import spacy import textacy +from scipy import * +from textacy import Vectorizer -path2xml = "ticketSamples.xml" +csv.field_size_limit(sys.maxsize) + + + +path2xml = "ticket.xml" import de_core_news_md PARSER = de_core_news_md.load() corpus = textacy.Corpus(PARSER) +thesauruspath = "openthesaurus.csv" +THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";")) + + + + +def printRandomDoc(textacyCorpus): + import random + print() + + print("len(textacyCorpus) = %i" % len(textacyCorpus)) + randIndex = int((len(textacyCorpus) - 1) * random.random()) + print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) + + print() @@ -31,16 +56,19 @@ def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'): if field.tag == main_textfield: yield field.text +def generateMetadatafromTicketXML(path2xml, leave_out=['Beschreibung']): + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + root = tree.getroot() -def printRandomDoc(textacyCorpus): - import random - print() + for ticket in root: + metadata = {} + for field in ticket: + if field.tag not in leave_out: - print("len(textacyCorpus) = %i" % len(textacyCorpus)) - randIndex = int((len(textacyCorpus) - 1) * random.random()) - print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) + metadata[field.tag] = field.text + + yield metadata - print() @@ -51,20 +79,48 @@ def processTextstream(textstream, funclist, parser=PARSER): for doc in pipe: tokens = [tok for tok in doc] for f in funclist: - tokens = filter(f,tokens) - #tokens = map(funclist,tokens) + if 'bool' in str(f.__annotations__): + tokens = list(filter(f,tokens)) + + elif 'str' in str(f.__annotations__): + x=0 + tokens = list(map(f, tokens)) + #tokens = [f(tok.lower_) for tok in tokens] #purer text + doc = parser(" ".join(tokens)) #geparsed + tokens = [tok for tok in doc] #nur tokens + + elif 'spacy.tokens.Doc' in str(f.__annotations__): + tokens = [tok for tok in f(tokens)] + + + yield " ".join([tok.lower_ for tok in tokens]) +def processDictstream(dictstream, funcdict, parser=PARSER): #todo das selbe wie mit textstream idee: processDoc(doc,funcs) + for dic in dictstream: + result = {} + for key, value in dic.items(): + if key in funcdict: + result[key] = funcdict[key](parser(value)) + else: + result[key] = value + yield result -def keepPOS(pos_list): - return lambda tok : tok.pos_ in pos_list +def keepPOS(pos_list) -> bool: + ret = lambda tok : tok.pos_ in pos_list -def removePOS(pos_list): - return lambda tok : tok.pos_ not in pos_list + ret.__annotations__ = keepPOS.__annotations__ + return ret -def removeWords(words, keep=None): +def removePOS(pos_list)-> bool: + ret = lambda tok : tok.pos_ not in pos_list + + ret.__annotations__ = removePOS.__annotations__ + return ret + +def removeWords(words, keep=None)-> bool: #todo in:str oder str-list if hasattr(keep, '__iter__'): for k in keep: @@ -72,22 +128,143 @@ def removeWords(words, keep=None): words.remove(k) except ValueError: pass - return lambda tok : tok.lower_ not in words + ret = lambda tok : tok.lower_ not in words + + ret.__annotations__ = removeWords.__annotations__ + return ret + +def keepENT(ent_list) -> bool: + ret = lambda tok : tok.ent_type_ in ent_list + + ret.__annotations__ = keepENT.__annotations__ + return ret + +def removeENT(ent_list) -> bool: + ret = lambda tok: tok.ent_type_ not in ent_list + + ret.__annotations__ = removeENT.__annotations__ + return ret + + + +def keepUniqueTokens() -> spacy.tokens.Doc: + ret = lambda doc: (set([tok.lower_ for tok in doc])) + + ret.__annotations__ = keepUniqueTokens.__annotations__ + return ret + + +def lemmatize() -> str: + ret = lambda tok: tok.lemma_ + + ret.__annotations__ = lemmatize.__annotations__ + return ret + + + + +mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) +urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + +def replaceEmails(replace_with="EMAIL") -> str: + ret = lambda tok : emailFinder.sub(replace_with, tok.lower_) + + ret.__annotations__ = replaceEmails.__annotations__ + return ret + +def replaceURLs(replace_with="URL") -> str: + ret = lambda tok: textacy.preprocess.replace_urls(tok.lower_,replace_with=replace_with) + #ret = lambda tok: urlFinder.sub(replace_with,tok.lower_) + + ret.__annotations__ = replaceURLs.__annotations__ + return ret + +def replaceTwitterMentions(replace_with="TWITTER_MENTION") -> str: + ret = lambda tok : mentionFinder.sub(replace_with,tok.lower_) + + ret.__annotations__ = replaceTwitterMentions.__annotations__ + return ret + +def replaceNumbers(replace_with="NUMBER") -> str: + ret = lambda tok: textacy.preprocess.replace_numbers(tok.lower_, replace_with=replace_with) + + ret.__annotations__ = replaceNumbers.__annotations__ + return ret + +def replacePhonenumbers(replace_with="PHONENUMBER",parser=PARSER): + ret = lambda tok: textacy.preprocess.replace_phone_numbers(tok.lower_, replace_with=replace_with) + + ret.__annotations__ = replacePhonenumbers.__annotations__ + return ret -def replaceEmails(replace_with="EMAIL"): - return lambda tok : emailFinder.sub(replace_with, tok.lower_) +def resolveAbbreviations(): + pass #todo + + + + + +def normalizeSynonyms(default_return_first_Syn=False) -> str: + ret = lambda tok : getFirstSynonym(tok.lower_, default_return_first_Syn=default_return_first_Syn) + + ret.__annotations__ = normalizeSynonyms.__annotations__ + return ret + +def getFirstSynonym(word, thesaurus=THESAURUS, default_return_first_Syn=False): + if not isinstance(word, str): + return str(word) + + word = word.lower() + + # durch den thesaurrus iterieren + for syn_block in thesaurus: # syn_block ist eine liste mit Synonymen + + for syn in syn_block: + syn = syn.lower() + if re.match(r'\A[\w-]+\Z', syn): # falls syn einzelwort ist + if word == syn: + return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)) + else: # falls es ein satz ist + if word in syn: + return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)) + return str(word) # zur Not, das ursrpüngliche Wort zurückgeben + +def getHauptform(syn_block, word, default_return_first_Syn=False): + for syn in syn_block: + syn = syn.lower() + + if "hauptform" in syn and len(syn.split(" ")) <= 2: + # nicht ausgeben, falls es in Klammern steht#todo gibts macnmal?? klammern aus + for w in syn.split(" "): + if not re.match(r'\([^)]+\)', w): + return w + + if default_return_first_Syn: + # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht + for w in syn_block: + if not re.match(r'\([^)]+\)', w): + return w + return word # zur Not, das ursrpüngliche Wort zurückgeben + stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + + clean_in_content=[ removePOS(["SPACE"]), + removeWords(["dezernat"]), removePOS(["PUNCT"]), - removeWords(stop_words,keep=["und"]), - replaceEmails + replaceURLs(), + removePOS(["NUM"]), + lemmatize(), + removeWords(stop_words), + keepUniqueTokens(), + normalizeSynonyms() ] @@ -100,16 +277,6 @@ corpus.add_texts( printRandomDoc(corpus) -#todo https://stackoverflow.com/questions/15200048/how-to-get-the-parameters-type-and-return-type-of-a-function - - - - - - - - -