From ab4036591f2721e406f1d69cbf7b42469ff21c56 Mon Sep 17 00:00:00 2001 From: Davide Grilli Date: Thu, 30 Apr 2026 15:26:52 +0200 Subject: [PATCH] temp --- .gitignore | 1 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 560 bytes .../__pycache__/checker.cpython-312.pyc | Bin 0 -> 3284 bytes .../__pycache__/converter.cpython-312.pyc | Bin 0 -> 2868 bytes .../__pycache__/deps.cpython-312.pyc | Bin 0 -> 1145 bytes .../__pycache__/report.cpython-312.pyc | Bin 0 -> 7130 bytes .../__pycache__/runner.cpython-312.pyc | Bin 0 -> 6970 bytes .../__pycache__/structure.cpython-312.pyc | Bin 0 -> 8601 bytes .../__pycache__/transforms.cpython-312.pyc | Bin 0 -> 50874 bytes .../__pycache__/validator.cpython-312.pyc | Bin 0 -> 7800 bytes conversione/_pipeline/runner.py | 1 + conversione/_pipeline/transforms.py | 974 ------------------ conversione/_pipeline/transforms/__init__.py | 4 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 323 bytes .../__pycache__/_apply.cpython-312.pyc | Bin 0 -> 4848 bytes .../__pycache__/_artifacts.cpython-312.pyc | Bin 0 -> 5898 bytes .../__pycache__/_constants.cpython-312.pyc | Bin 0 -> 7640 bytes .../__pycache__/_encoding.cpython-312.pyc | Bin 0 -> 2995 bytes .../__pycache__/_finish.cpython-312.pyc | Bin 0 -> 6655 bytes .../__pycache__/_headers.cpython-312.pyc | Bin 0 -> 7344 bytes .../__pycache__/_helpers.cpython-312.pyc | Bin 0 -> 7693 bytes .../__pycache__/_structure.cpython-312.pyc | Bin 0 -> 9659 bytes .../__pycache__/_text.cpython-312.pyc | Bin 0 -> 6072 bytes conversione/_pipeline/transforms/_apply.py | 96 ++ .../_pipeline/transforms/_artifacts.py | 106 ++ .../_pipeline/transforms/_constants.py | 161 +++ conversione/_pipeline/transforms/_encoding.py | 45 + conversione/_pipeline/transforms/_finish.py | 116 +++ conversione/_pipeline/transforms/_headers.py | 110 ++ conversione/_pipeline/transforms/_helpers.py | 153 +++ .../_pipeline/transforms/_structure.py | 184 ++++ conversione/_pipeline/transforms/_text.py | 109 ++ 
.../2026-04-30-pipeline-ottimizzazione.md | 560 ++++++++++ ...26-04-30-pipeline-ottimizzazione-design.md | 80 ++ 34 files changed, 1726 insertions(+), 974 deletions(-) create mode 100644 conversione/_pipeline/__pycache__/__init__.cpython-312.pyc create mode 100644 conversione/_pipeline/__pycache__/checker.cpython-312.pyc create mode 100644 conversione/_pipeline/__pycache__/converter.cpython-312.pyc create mode 100644 conversione/_pipeline/__pycache__/deps.cpython-312.pyc create mode 100644 conversione/_pipeline/__pycache__/report.cpython-312.pyc create mode 100644 conversione/_pipeline/__pycache__/runner.cpython-312.pyc create mode 100644 conversione/_pipeline/__pycache__/structure.cpython-312.pyc create mode 100644 conversione/_pipeline/__pycache__/transforms.cpython-312.pyc create mode 100644 conversione/_pipeline/__pycache__/validator.cpython-312.pyc delete mode 100644 conversione/_pipeline/transforms.py create mode 100644 conversione/_pipeline/transforms/__init__.py create mode 100644 conversione/_pipeline/transforms/__pycache__/__init__.cpython-312.pyc create mode 100644 conversione/_pipeline/transforms/__pycache__/_apply.cpython-312.pyc create mode 100644 conversione/_pipeline/transforms/__pycache__/_artifacts.cpython-312.pyc create mode 100644 conversione/_pipeline/transforms/__pycache__/_constants.cpython-312.pyc create mode 100644 conversione/_pipeline/transforms/__pycache__/_encoding.cpython-312.pyc create mode 100644 conversione/_pipeline/transforms/__pycache__/_finish.cpython-312.pyc create mode 100644 conversione/_pipeline/transforms/__pycache__/_headers.cpython-312.pyc create mode 100644 conversione/_pipeline/transforms/__pycache__/_helpers.cpython-312.pyc create mode 100644 conversione/_pipeline/transforms/__pycache__/_structure.cpython-312.pyc create mode 100644 conversione/_pipeline/transforms/__pycache__/_text.cpython-312.pyc create mode 100644 conversione/_pipeline/transforms/_apply.py create mode 100644 
conversione/_pipeline/transforms/_artifacts.py create mode 100644 conversione/_pipeline/transforms/_constants.py create mode 100644 conversione/_pipeline/transforms/_encoding.py create mode 100644 conversione/_pipeline/transforms/_finish.py create mode 100644 conversione/_pipeline/transforms/_headers.py create mode 100644 conversione/_pipeline/transforms/_helpers.py create mode 100644 conversione/_pipeline/transforms/_structure.py create mode 100644 conversione/_pipeline/transforms/_text.py create mode 100644 docs/superpowers/plans/2026-04-30-pipeline-ottimizzazione.md create mode 100644 docs/superpowers/specs/2026-04-30-pipeline-ottimizzazione-design.md diff --git a/.gitignore b/.gitignore index 4ff0772..0334ca9 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ Thumbs.db # Output conversione/ — generati da conversione/pipeline.py conversione/*/ !conversione/_pipeline/ +!conversione/_pipeline/transforms !conversione/_pipeline/** # Output chunks/ — generati da chunks/chunker.py e chunks/verify_chunks.py diff --git a/conversione/_pipeline/__pycache__/__init__.cpython-312.pyc b/conversione/_pipeline/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..add25a1ae104f359f221440b9e79e4528c586f87 GIT binary patch literal 560 zcmYL_u};G<5Qgn0ZIXsk7!U&k2rAK;i+sSf+Nt;+#ZIQmJgLyZ}2J3opVO zM9Rd(hUmt`*(t)q@9(#Lcfb5TnT!dpr>nJoqlA1o^S1^YSOdZ{kwj7;hB8VW4`5&d z7I+RIG($G@9Ky&%Eb=^r*d#3RJc6-FS?W20Ei+*g&oOM9DVtJqOJwpw+0JS_OFuCS znb9`o^E)-a=Temw+N5`-T)4Y=kuR04d2e?_S;C6fR^-*9u*SWQL@r?Upb)oi8V!9WaYv54 Xo_6H0BPSg>z?1JVNe7RS-NT2!!)=w= literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/__pycache__/checker.cpython-312.pyc b/conversione/_pipeline/__pycache__/checker.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11cf3f04b4536dfe014ce14490900180a3897548 GIT binary patch literal 3284 zcmdT{Z%i9k5`XJm+iM#z;9$(3!V*(rCn>u~YAT)3;(d^X^{)GN 
z4cNC_a=l80rcxsH?&2tl>V7&$QPq6R$Gf(vPP#7!?u2iTlTN)a$2T`bN}7JTdB&R> z8s+<~<@e^z%$u1vZ|1l5U#qLV2%c|8H|3K&LjR!)`*D?t&AmWeM-q}41;v?n83suX z%t$PDU>0+ejwX;#auW9e_pUSLjJh`H#wcTPXGPhs$VAhX^0L~U@~pchMsk?wCc5RAbeyBwjUVrTjhB6VPg zgWdKaD>hWv_EfNU*qJURZo7PeF&W7@69M~7$_2E02BIT8_~K3GCXzh2SQ@nlNlwYD zxE36yW7@yn23?uxPB{mC=p(ODXKO5Zr|WI{$Jy3hU`@8{-3uPP^mC`29R{U*MuWXa zsxnzyvcMUQHhpv_run$EJncPebxRM>WzE|Z%%0XaCd z!;62r7t>|O^9(<3(+z&8m4H-hvXI9*lY_DT78GC1vGb(opirfN=>m;iG~dtC_-THm z+Dp`O!&Y5|Q7_V&IR}F-AO=1k*UqyS&>Ry+MzfupPDEHLPX&$E3SR#LZ?xNMlee?= zyqa9_NR6gv7U`T6qBC(`fcsnEf6b?w;4IB1zg3elhs}pGmgIBH9NLO3+L9asuWyTr zEQx7ZQ?WqA1ki&5)@9wmYFgYcD0pg0o{$yU_$s?!FtBcDAU23dnYeo_T|Wl1plYgM z5N%d8v_T=wbwdj$X`P+{{#Q96sS0OLj13BeR#((9miCr*%_cQNlM#ndx}n4X6N!W- zB;;63Q#2ux>;QdXB4yyH@a^T_2??T2h~PMmRRaq#jS$T+v@_IVEjSQ#8nloQr(_jx z(4O7~9h7kpzMB~se$TuP*F@ptm%|7xFve35S#mrJB20$X`13Ahp3&j@B{b~@#U0Pi zR*;xdd2*8D56pGI$B#!ilwv<;=a}1^g*IM>?He?pGq7Y8tXfVu6xOXOJP)~#8RG^J zmJ=>QPTXcKNB04Xku4U|u2XOngv(EQdoF5m+#`vzvV?nxIMox=)LBe)deuGS2|0lk zNK;SjB96^q(w#^>@f@DQDxOb}p>%7BJq`xD4=Y+sRP>?lr&8Mg`X~&K|BD`9Mh_6X zyQUCqTn_{@?5Ca^o@~dBsa;SGTbZ{j!aF?t#9$Riaw%T}nH8Aq0iIu>qhb*eA z|Bi)?-)~Y#{q3-O*vo9tn)&~`GeTjXRMUaNt$hU~NI5iF&rfF?-Loy7kzk>%eZ8eE z%l^^xnJ3rrS=E{_uq+I$3O`#BhVPNJ_EXF4r&ilvTWxv$FB2;*XCa+U&5+JW@Ou&h zb8jVSAmvUdw~idgLi)5on-1HpT<%#xR&8B98H-_EmuImMNoYXOK8IFIU}DIsD9G`6 zslB4N|3NZu?K!th`K(f z5y|pkHAYei154CMh~fs|ih%)11c{=@mR@?->G+Qc5aDD54+_y*%Z29w9(Bv5CnqQ6 zdCRG2bC_71D#o$J>2L(gtsCRIoW_nl zZs7wDAc0j0OI9^d6#@o>cYTV`7rgF+L3sKcD^;!KdD{PkLMHXCKL$jCcK^38aP{cY z(ZBgx3;tdC(B9muxxM*dU*6xB=lcq7|A&`;b7{@py6kRUb+=`^R@@QL%lLEr7uC0` zvzPM0{=C0G&-WL=&XJ`fSB|dvb}##OUp`TA`xf7OKUMJWrj+#luS)h`%1ODm^9SF` z2hZpI=kxseA{Tb{{*!MiR5xB7UmDN6vs&F%L`|OhTzEa$lnH+ty%GIHT|8Ov2QwWX znT5s!kK7_;|45!6dDKTG)ZPyd5232Y2Z(W= zVE*ABS@-RFfH-IGdQIK6jw_Q_l_lkhy2uuM4Vl2QuQlt->C3{)>w#BF3E!Xld2S>h zJdpPv_&a~#hoT2|KG8*}fIs&&jMkwqy{)58_RB7w(mlibVDWXc2Nqv!x&_XRL+2-^$amvtumCe&cn~v6cbOH$f+iXQD34O@I0;@&utTGRtWs zX=!s4nx+u?zoJx*x6vk$s8p&?_``8cN-B7W3;>V5HT6yyiY&u0|3aantB2{$a76@D 
Vwjpce&Sg`}9la0fn!E~%{{yCgE{gyF literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/__pycache__/converter.cpython-312.pyc b/conversione/_pipeline/__pycache__/converter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..249b9532cd06b8b030e2f160f256972bd288fbc7 GIT binary patch literal 2868 zcmZuzO>7&-6`tiTe?*F;^+VaRRcWI-t(ll2ilnLAS_WJ>l8scVV;N|j%VM=Nq()lq zFtfAz0|lsQ(aJypr%izMY!eXSshnl!H3VqT`|xMI~+Ey0NhxaVk{C6tN7qr{H7>+e!3(082p}p6sE}?ZDMsywaM73yy zh8p@T`Zf|_f>iHm#B~kbK&$#PTG4NyWvyt;@EG*S+^pb%Bd&8ot}!MiiwS8|s92K* zs}oP^zTh&WqIYazyR0hXj>~)^yIx;$NL^6o6%B3x5(g5H)G>2pLfF+R!MWA!aq}e&0Oy-dXxT1-@1G|yPNyYrrzv%;qL55)}O827hl`yd2KV%Ol59$-R$~Z zrkO8nX8*gF>HiPX;)O5s{Vg5r6TcS_-?^w=G|*>8*TrM{XL$|YGRXpAVagqqsiz^3+I`cL3JnBO_pF_Afp!iTGPqk^0WN;iOa1x#r?!sxD zsm2;=)$V(lw#y&~t$`K}#1I97&~1dXcVcTWZrx}Y3o5cBC6bh17Wt?PB2`ozDL5DT z6WR56Bkt!!Uo;Ts8`^DjCl>X;qc!4K8-(s~^+Gi6(Bfmb=ab$jdke=8d!kz8-+DqQ zps+L&(F(Y45i-wDJmo|G^Tt6Q3U~Bq7Wie%qT)dCIZRF~{Z52p6~v^jdG*qyNj-Dg z=8Kpud#2!ajY5~&EjZjFHYU6Tf*lLz&N`efg)=MEC8kGQ*ET!4GKtBFFQ^Ep?}TIO zggcD;Y%Cp4zG?$H5uBRP?+R){znWYRgn8}aREY^eYqm>%6pl8{lDSAo-BNqWn&ZS? 
z^v5S{*C!CeSHJm#DPS>XR@{Jif}AxyX3pCjmzKF*S8*MHz)VcNs>L{jGCsPp>I7m` zB|?FKf1JSfHlY{RF>3)hR&tojJkK`I{NhGwA`maXTs%PYZI24NMh;w2NUA8bIwXMb zC0nJo?H=Q_N<9eb002U)6005d_$aGXLSH%n9`XGE7N#{ArTl>yxh;fkdJjH^^i~Kb z-WmmF4)BCM=MoF@I?wRdzW10XoA z1fz@f}()P5(OuKba1CyrOX6`3Y@SW1GStH)LShA|E|3Xl>TgD>HswG zi3K6RnQRF-l-qz2C+_54OSIYSjjE&-@%*Eno325uaAQHVNTDKXbhp zlgYXb*$}==krnC-i!I7jt)t{g84rn;I>aH9owP_pdAgRKrx+8j#GGm|Bh%Ca1-9?Q zbek7-Ov*%s@tQ5Whbg_w1mPTZ-`5z>16>-R-5NNq==7!Gf{{kmWplFoTHwLy zKweid$u32JCCG}9aZnenOWh@2G139_0(&wN>f%qA?9yieb5Q+~!HZs4R{^j)6>FxfF=`W12z3i#2%e&bZ zHzpq&1CNa1hsN+;-@v;o?~N<7{NDLyey}+>+#L8;vuB`LIN2-=H-|>={&eTm_nIe$ z;lghnNAZzwP*Yy|*shyzoWl)Lvn5>$@MG z{ow4~rQMTbyM^**`pf*_UhnYE3*+~z_iyZsOzrkw-s!&F>>b*A^Y^9A)d>q=;kD~v5c)>r5ru_?@XeClwVe5Jefw_zKm1zx>wZ5&>Ed*w} OKQw&{;N7!cy`UkHEZjVCYlMTri!)2t7WLvw_PsZ6zxUoZZ)QHF(@7ArGjd?9 zL;$|YMJQq&$U%oN>wp2H5=cXtw#-Z`2sg4Fa$@KNUt-Q5m{{huq;J zusi}T*#v?kn}Uy&6H%ohnmGOsFsp`M-Qs8+ISDVP7hy@6gheEJ{^syk&!&1#1CwM! z^gnu*A?%sc#+lq`g2%H%rvSg!MZ3v2Igaw`^C$N+zfI$|e0+9Lovg z|47@HL-M1u=7L#Kdlv|EEox8XB7SqxbhyVzfqLBWxF6+YiB~R%1n!Fjy|3?D(3Ug0 zAbRSqSJEF0Pu$B_dTPzbYJ_xb>NA`>?vP=am^$3@=JPh;My|=vp4?19Y{u8d<{ip- z(9MrI%(A(!xMkOmk|m4#N!NSiFj^$8>nqH&{UjrTtX-s-_?c0wM8+u}rJjw03b_cA zC+x==5fr&AXQ}H)d1Hpo69bzI7A6KWr;Q@DYa<{=K@5(REL#AGFJrSG(#-@u14C7XO6ud0yB ze$_fYYaJWti`KQ5?cPdtR~1NI_;Gj|Qf=Qs5e3xrR`bxddMj|QZ>s})s`grap{`|K zY1M>4{5(7hsdIJkz_vOVfNyN8H~$15xw;^^8XbgZ#&93J?;p-9A9|70ef@WPl#f@C z(Bja=faDm*a|%m~3?|=iu+z72AY5Qsp^nts^Avj}a)-5vrMy6{d=ga!A+!sbT}U3r Kv#6Dw5y4;Fi3&^r literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/__pycache__/report.cpython-312.pyc b/conversione/_pipeline/__pycache__/report.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1eabfccc2ce81ee401796e7933ed55b014c558ff GIT binary patch literal 7130 zcmbU_ZEzGvc0D_@v)><*cD2$<7=Z+6S4ctv49I*KOBMnf*&M#Bijdf>c1F^O{i>dk z1?$-=`Rub-#1+8c1D};UUD>{fzz<=Q80YIKr>^R%QnhGr?Ly4SRgU6Rg+Ix1ICU|< z@_MwhUnZBUOx3)8{a$y^>o@PK`=4z#GlJ*zv4_Hf8=-$C9qqB_k%tq2Tt*z?s4yB@ 
zYpEd$`ns?#LJ!dtq0wO`!VEDg&V=<5c8CRB&#_@c#5iOG+`yYSBhPXsXwAHtv%qWR zEu4*K$LyT_b!NzVhVnV?lg|$ND88MiNI)#JEJh7#77iEY)T4n0#?kSFdvKtB>uFd7^HwG1tn22#Bpp) z2=j^w^W2oWgu-H8n!?c#OXSEuR%mH>A8s}v^k(gwo;T|*BUv{|q^Ni}>RO(9woW2* z)JN8Fx@)>MJju#zA3A3^XAIHfD5M*4A~X!Vq#QRa_=#UJ2&ZtS4@0%+6X05NT z<#_yBoMld*w5U6@7IyeTeuq}rVU;#-vRcE6EoYUj+E?(7Qf*9vc~(Bx@&s)!ob43o z5!cC9*$5_S_o7Kp%z}>ce+>Q>?B$$2#HjzP zb8R>3KZ7|*J7C=6`gJ{5rM8Vx9fhLxAd6j%KG1}oe4TTvdvw97POu?ZmFwkdC9k$q&Qr7&u$-cOHEJV> zQpVU+-5AByy>3fZmHfjev@yk&tJnU=Saonr;W`ATNha6V!S+PI4p=P%>>E4XcC+_nnt)-vwa3T}HDx4nYfQO514;C7aA zJ1e-`%DCGqxLsx3t_p6qoYw*DeLLXljsf2RxVk^UcLJ{N2Jj~VS7ir$7vQR-fbRxe zl?(7afU8nmqySUDSSCYHsSGee?Q>7f5gXV$XGpqairbg4Kn8Kk?#VXICW@`puHlm% z8dhv&7q_1~aQ*3P`E11lc@&b}$00{iC`3aIfs-$8|nV^UIt@+wfjoI{I2H<YP=Z1*s^~%QfoPcpk-x865CW@thq{c$QBqg5pxHuT>_a2r ze3GP-OwpDm8Cap>k&(=3mLO7Ztt%N_NRP!S90Xi#G^XC#0fk|~@`|Hm8mtn!GBXLT zTKQ;!pH$LbDCXu0woIXtvaLXssZ+yX1=L%oz(sY(5u-w%elWqDY;J2(4Hj>NuUyqB z3>Lp|22aD!|I5a^tD?!5s<8eO9DSA=DLLAqTIV|==gGpRb zmPVzi2xl+yu%iXu@A7<^OH3~nGp^#L&yj3yfX7ds}*oiI)$#`51YXxUMf zVpcp&^WLmy>+cX_^3EPg9ZX3pj;eEKX3t#l%(r~u*m6_%iSdTm(8tEp?NsUNPs@%HK) zmsZdIaP{nKtEsE2slQ6!NPm?6IDIqy*Xh4Wf0DkH{xto!>Ay?=efl5L|Cs*eotZnY z-g)iL>vw*1=Z!mW-bvk=Ov5rp6CyG4Gi1Oa1cYK?6@`P#%PJzN$V&i&n3wy7 zC?~%N9bP^@5u0w3CW4aO8;iuJBwp@?=~H3&^K#$XqGZMocr1(wAwYe@qoG3N1(g&$ z>(7a2(W+;a#O7JE7CXVSKF)iVE=ia8%40kWe1+D0CCSJPSnzuA2=c9p2M$bmX_ES4 zBrg^^9DNE(Sn#(4MeeNWalTcYqpip>#Cwu#(X(calZ69KQIPM>Od4cXq-6uh{fbOY z77jBdb99_u^>ccX#HEU-#Lrpqb3>V*7ovt-I7G@Q@8=(ZpHr%z1N0dO6ICZ$n_ubP zx;^O|7Ta1o+I%hdNvgWf0l-AAVhq3}|NG1s4lW39BcL0Aq94bxsWm583}aw9!Ejh% zMq{xs?jnP>6R-n-&!T!iCjN?_Bw!Z+g^G~TaMU(V1jPUZ5QrZH<4-G200FSt@o^rj zaX_)F0|Mfih}y@yVFi%{LN#WPurZSl9By?$U@V3sV6B0*3?e4l{y-q99DIl@{wo0A z4J_6|U~yJoHeN6;)weH5S?7+_!Fx5XWoI3v)62Uq?7Gr*u_x8H;vk9j%$45XIMxGi z#pAuQ|87H9rlD)GCEKto>)CzlV8*jAHL&8YU-J6zdb=`S!t(CQx_77g?>Rg_dGh}m zr5ZrcoVOUI9#Maqu(TfN8SZHEy|R64GT}biIw47M@jy>!r#wA9J>20NKHipaZ5Tdr 
z;^gqih<|uwxb?(IA0WO2vv(BlPc*k4*t_q@kx!V{i8TEy#;uYkHiJVSx}!wF@qL0+@UE`BRC5>)hM1y)oKiPxrQ~er&7^* z20jMz`DmrOMD3+4rMBW1b)Vda`RbsSiH8LVlN6&CG^BQc50eUC(T{~=K}juf)q1uV9o4`` zWQm7^Azq;qLR_I?8XRVX5L_XKAgPj@L-XN7gD-sV+1`W4pT#8KDReY8tr%hC7}0>D zhZqY5m3bNqV9e_EBSSW%=((v#T=dne>cYgKFcBD&n25=_2$S17SSf$CsZarYkWdV8 z(31mOwG-&0~49YoCb>rQt4VkJ9^DWt` zEi*^HcC;>d7msBfyJiNKeO-4Q-S3WH3C_DN(dYZ#F)cZ|XAWf?-8qKQKXqvbT#W6> zAwWNHtIhT84@vKV9ksMw+J1ie(%wbtUXAC9VcwOk+5A|n*{=`bYUo`L#7ekLt0nM&J2FTqB`&8#DzpoN11kg zZ8V>=&DtIsm^yZeaXqj)V5Q?Z1kKHso6-&c?+N;#1+CjS-}2sb_Zm0McVrv8pk3P7 zlWlxTMfPMHdkAOCdxL6!AlrDD^gFYS-6~;MwsAKhY|S=y-fQg2Hg135wm8_CL%Bv& z>s@iLyF7eh_~OaC?yVX3)}{7+S@-^&j`oK$*yaEpoebGi0Ye` z>sr2U-aLQi{rxM@B;G$j8t5%IZ(VL~2l&8daGP^Zx68b2t-ap7FupjM={T76Ka-<@ z_LbG6(oSVMp2_-qb9$iVSY)eKsr=Pw*1s=jAVedwtqs+tn+Vm6Y;|P|SP0dMZ0iYi z>B)gi$KkC1NX|ycc4S+(M()pa9Lo9!at=averI$QAy*@tS6w=i>FCS)pUu?}stehk zSlF`Y&wyNRLa2q=KxoVO_vbu>P={>wK-hG(BUexQ-jW#&gwR+rV;vzpfoz)>wk&n` zW#B9838e|yS_tKt4CrJ7p=>N2(@ZEWsI7akJ>!3R#otc0dMMK|ko6y4_IE9TrThDS z=YvrXnzz{5nf~1OC{*LUax_!zn|W@TaolCR8OA%e<7YkZ_RNpn+WG6M&l{dw`oYK& zrV&gqk$@NhB-UL; z>S0n1g^!GTBQb6&%ZvF2|a1XbnuA4Z%JO#lD@ literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/__pycache__/runner.cpython-312.pyc b/conversione/_pipeline/__pycache__/runner.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b91242f72a41a13f43348baa4efd8cc3855c7acb GIT binary patch literal 6970 zcmd5>T}&KTmagiq?ymlCe%pY53S;bUFkl4U!s|ZR1G~cQ^C?GGqaMPmPwE@a$fe_`sr@` z!^^&GfvV0u=R4<~d+s^+-m3n6I2^#>qQ-vWKd-{Dzo3S5kv8$uzXI_9V=)#_V-waB zpTME*O1m=d2{(?hZk9+B8P9~rq&;bG#y8Xd$_$T}(?MnwT!HJ+rlj%?5)%K&ULTb(x1Ol~x`fmvN11yiTcn)}Z>R7itm?p&yXA1?D1k8K*umu$Qb5uNwVlX;&Ut`EtCYrGlvZd~RrKd``kQf*-C@Af8Rr|=yx)&0Fbmd|cPrr%?RTs_L^BW9e87+8 z12Z*NtMbm)KB{xj^C3SfB=~dmV20uj`d>mb4Gvv?I{r8|*mw@bCY%q=G?h^f#d;mw ze57LZd-7q(s)l?7?u}o09L$dVXsY1ubL{0q4%eKK=zJ_6lJ>EC*?ncd^HB#=!7J&Z z9L>RUlY?0naCZNERfS&%@-db|gWfr-6^Ld)9NZPr0*FJqB3c1)cvr*`KqPiWBmvR3 
zE213`M|VYZ0HSkOL>C~A?TR=Kh!eXax&d(#AtV?321NYSYh!yFkY{$abrujkyCTj3 z;{2|N3xIfYS41x$F7Arx14RF>h=F_+JNN~`4!vd_y5!_2jzOgoGY6auFF#>33OyPw zS7zQb(_EH1R6b-@v|aZ&2CL<(P|=+zrM0AAuFz^&!>^$Fc$Ed^h@&x+aH#S#U(H6@ zw;o-6Xs^^S^j6*xMgV!uiX3}gX>%+%Tsx*jIXV@!cLPwPw(c0vZ`<@Z&^K-R9iVU7 zR>)Uc`4u4EwGpr0nQjAO;$;MI-$UF%?1LDrcr_KCy`Qhynbn_G6PVIb8Xe4VKAMl^ ztMgS4y%u-JTK9puXZ_P_a)mCBV=mN!DG|r+Axx^@VW+x`y#3P34to>$ZC#*A%?B$+ zP?Hbt90Agw5B_uT#rrDe__F8853d_OSybUjB__<#GKb5WF94^C3l5C6IHKWm>+lyw|6$JC`#AnI}kL{0IJSE#JS$(*2Y z)DH`vQ}SItn>$FES3;E;s-wd^G*Vekq9h)WQz|cU$(y-GihA#O*U1m4TTGf~nH-84 zH9B~yv$Io0Sq=_-`GC4qqCck8^CtVf6G&^oJS3Z5AuZ~MfBz5a8YA6h#aV$$h*?fx z8HGuU49iI!=wOZPqMEYoUgI*N#4{8pGpfWZ48^K&z=lDjnaTmH=u~29RCYMYq|>0( zL*+cGGTCtk&fz2{q(qh%rqpCfWk`}liK83|C2(nKl7|C1oT^z7&fYx~kHL{!MW-gz z%g5y*Oi5fW2Y{T1y4rt~%6TDrQ`1~-KZLCt1z=Q7^KiVUWJOXHIFG|YABOmh1yki? zyo$~~mgR9ib4N_`)aW~XFhU8=@Cx5UQAWK$%a(}FstoOja(gLsUCCsaDPG{A_lzjZ z9B81+s1|767SXoI*0?T06nCOcjfg-CAbJ7amr6nI@Rc4p_8Cj`qDOpQP|96mpcCEiEl}>?~gyTS&vJ7}!u8U;8q5crzjy z2429xUdkMwQ76!hEuzaJ+tr)Ag8H%CqEvx5$440ubHx+Xn0+fc~CWZyZ~PBj3JVX6bm({ zfDClvr5Ktb+7VR*Z)Bunpl@{C4qn+&BUaLdDoin1xvVtKp=IRdfSdU$TSD5NHO_W% z)Ojr~#|_2As1oSX8}p9vU4qG0(@e)SD^(1MG8zRMy`O`~&JVdXEW4|>c~-U7Ir9bF z8B@;Pd2I4I-ilD~okX?W2e0=$1iMt*P}xbB;RA;tH)L3B?nCft@veH*uI~rbC?{n& z8Nw`ZQw+S_S1f393YIvuM8I-Vk8Gx(!O^e^q1C8!MivDr1csoC5GTk;#>gq2H@v*S zLWe3kxY(^G0N05cMrA# zGulQpZE_)3Ev^w@?m-GFoRH&G^h#{@eZ{DLM}UQgl|!LZ1!{Pv$PmOGaQ8a)h(=qbQen8EvknvV^`Y7zD}_!^5hX ztZanokt?@`Mn-PXgG1v3k{rccF-sX~QR~3XQt)L8xeGz$Y&h z9Fug3cMW$&o-*7@CTsXIEUd#22AnfUgf@rdlldG+-@Rk_1RAE0lZ}97z?9-(PvTg} zYZM8eo!ACGE2Fa|g#V@M`LV8PF~fDS%srmvy5P%LSLMvoSw735e!3)85ICtbJ7>6o zlqSF)8k>9-CT#)Ri0xgT(_@JOTe!7)vT#91QZZWL|nu^5UO|z*iPA;|=iF(wUSR5%5jeqvVp7`RrFJAOD{8#HH z8P;NlSCo}W?dXUWxmF~vo2(N>^5jOeX4{K-0$Sj3kvOs$i7%7uk>+0yFZ3_E{}c=Z zZpphOFL8@Pk=(ZttwnuYTIybEUF<9p4Yum#B5`HYU-iU)NcSIF_aENE!k&gg%a+UM zZ`iDFUP(L}T&;O{Wxf8?;_z04@HcG4_dkgz^mth9MBRkv0vffYV+!~Kb+|Qy6#(Ir+Ks2K0Vp5B?h#>V38Oqn;+DZ zLt3ITET%KPdizDKtuiboyH9WL*V@R*|@Bvl~{s 
z3(?h&w7}US(PK|*qDUmSTwYJz1{wV<{7HD3E!X?En_7U*gvz6r__Uq(zVDo+D3)Zk_1fBtxGCJ_#(HF4V1l zR3y)SGrZ--JSVp?m&dzJqT&d(u1sm2efr^kt##m=H#WnK%cr%LQ+m^BJ$yzBo!xTd z!E@Uf?m3SN^zzcLmT&!r)@oZ<-YJ}3J@s|3)^={~jc?-L*8lc^c5!SA!{5f^Pyc>Id?yO%wJfO8!&Zy}_uYUNV)@p}*BV4=mz8*gKBdE44;;2}|%gpk) z7HR$?c@V66T2Sz+mTH$~R|18;!t9zK>cs5iNP$?nR~XX*T}9%U8TJd1#2dlrKemTm zSgh$s%mu|8s?w_ailP3Enms=Up+L>b-VR~0hLXItYZWiZT6K3Zbn<(ERXMO+!+r(R$bKPM;l%V&8>pM{7LaH6!VF?IfbR zBcl+NYwRcBV$yB6Jtxm+L{?36Z%Q0A(PuAt9Bz;RIQ~5r z{2q%w#rFISqn=_1pJIup*ny|mp)D^~*StcnHGR8ZkG;DP-ty8o9$QRpVd%M1Yds6B O^;}i9O}>O)6aN?HrdxCX literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/__pycache__/structure.cpython-312.pyc b/conversione/_pipeline/__pycache__/structure.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..569c0d051d229debc9d32f17555f9146acfdb202 GIT binary patch literal 8601 zcmc&(eQ*>obm*i?0v`iAm1MaVa~8@6L{M_eWN&Y_I3IDk@1OM^cpt>ne4S zfAZ_uPl3Sa_~J}yUw6NL-TnIY>-Szi{mkR3V(_)^npK`?VwiuTjpAqZpfSs^40Do^ z7>SKAy|m;a>=4(>u?)w^`~V~I-(k;zhJL+50Vh=8Yz3UH0%tGa>=igi0q3Z|IVGp; zl3cQ@Uy!O^5PGYuwp(@s=kB*l)xcF3XxyOjpzW!k@yf0i@C8aQdVEsdoW2^F&dgDY z0+sItTW_sYBXgL)4)fJwRQ(7WsJ}+H*Ark(;R#U>2ktdFC1P?B(c~n>WMz{Ng#py4 zYVt9em|Pr{xQcE~K_AeZB$~V=M@$<4#-XlY5_AkvDjAbS@xvqo5p;+VQxGW~Cn{D= zHU%xJf+Q@Owy>z8%F6_G09se+)Btp96;oSv*|Z;t%bFf7lw#b5ZG@uQkIx5|nY<33 zxoE%1i>hRDdX&_o7fflUpoc{rHQnTgMa|^J~069!0&MM29dAV?~VZi(@)9O6fpP<)u*uhEW-b#!cIBSP6wq8@N)4DfG!Q#)Pq! 
zZ8#c_NLB|{7ly;=!1|F$poKtf8IzC58U&2Fq&O8U8nRB@R##lb^bmo`LlUMCi^egx zB&yhyMyud6MEh_|F_BP!&BY>d%pt~2E+msiODJGa7!t8vTPQj-B%3xdu4zi3g|3HT zl7eLkm9rN=&MafF==QDG{S5i#Cx3bH~O6C)TQ$wTA`x`?1z0#E^Ua2AI& z?87pVs37`qrh+O0B}D0p!mzAbVjvPdBLYr1r~n^{V#bK5ksga}S+(Rqs`D+4?4uPv z7A3@s8mXa80Xr8>0UL{=E)%Ge1_AI;IP$DT9~MV&vQY{BkR@mV!M-MwxB!BoAD@Q2 zMu6AlW4dW4vK}XDs9+U9z%Wkyc>HF+Vc8SC)Hle{?zx}D8q6S7)my%Eh4#6^m;`IE z)FLfp5}fWR5=;CGjKOs?CF;Uj6%@kfQrJG1!v47wj`=Ah=P6r)*Ih+^4c_1fi}tpN zM0ws-+NS^-f!Zjk>ToxXjWdM3jO5Q;fQAT5rS4+&d*pn>F9Iq9a5%$mcVfHL{jmnVP zKUeUPeZd2d@9ExqmNU7I)h0Wv(bMqzZ+5N|u3&zEL1@D;Gxz5PqGx#BBebNk*e&E|b` z*bLZ62Sp?ROs=0;=9)Z*7L<%XERPU+sZ2*)J*-BD)qvgP;QJi?iYg&2Fl~`290N7e zL+~u(F+>frjyYeVgWxZTq?vY29Eu^Juxp_xflq`-)pU(oRX_bKoi%H2y>sNxLC}Vu z_9{SPJsXnSbDp}1wG-WAhmt$~*;|XNmwIx#wkzxH zy7IqlU3c5P_T-LVd6{b8*wIwi8yjBTFzsHJ?9SDw7rs-kcFOleaK-j_ILNpv%tg0XWH&C3Pnc zUEFhm)l0rw;^6bx+kG$O!S~#>A5y7@nzJkwuH~AJqf&BBoARz{uykDLEPO?`4p-Gs zaY*wWB3QZG48;u)_V{{-~?Vt0D$nVqM1e)o~L`@SFk zi}1x`AJ+au)89A!tmUJwy~mGCdH3DW;J2CQ8@CE zDz6?rNlStL7L~hQy?GUiuTZ#o&a(2tNy6)V`N|~_K1nu-S;%xVCxj3;z=XJ`5ws@+ zLl`UtgSGy0mvW5PumZ0+jWNh#W|!J zPmwo9ULBcSkiBPhX7K|T>M{!+ydY&g8)1u^zLwgZ+A}fovAbiUwWvNXZoTa8$Os)b zP-t$<{BGmcg~EIF0^l5rNkwEGn~KYJ`>`PspXU-ve9_V+kON2vl8~^Wv>SXk^Yk+i zzQb?~(phi$%Ew3^GHk!U8b?&J9c5C(Ch_wfonbGoI#`OH%cF`1+h4NHEMqwGd>wz( zuqD~!J%)YW^&G`{5%qpkV7?O0pl|`(=-xf>%8DLWw2F4{o~K3_D=cy$sIXgz2&-=W_|ZSv6n4R?#eB? 
zC%H4X=yy=7Y5a_F*p`e5Ik)e`mhmmAu4(s@oNqzKwA>W!?2@&gEZI1Hf?8iOgLZ>@Rn5PQC)sqmqt+k!VOUvy1~SSs;C311PPfZkTZ3uRwyl=5;3 zn;}T{!-rt3o|7DhFk`}=a3q}N7xA1qT-2aa>Is(IGl!PHD``0%u#jMIjU}(quOvwZtG3_Q-jv4X|#73Zi)Oz@|1UK zR3%*M4~qLD^_6{h*r7{3ZEyxp<3>ZEP~t5KZj`*`d38^IcMLN~^C`D{k`HH4aFK9M zo(kvWF+78{w~(V*v|%*3F+6$alyD_H>Qk_Z)|rt1pY<0>HRW9+thx*Dgg4j$R%PK2 zTvR7Y`(6ya`wjM8p);Q1c|#I@fk}8V-U|y@UzA)~0}XGHwqkq5Q@u0)8pCHejasSkR8=W}@eCJ3nK`wm04Se`GPc`sw1C@^t&1iKVZQ%7VI?LYEgOm6|=SvX#`0mBf%I^hD4%7O&$33RSy!bRY<0mM_9qnt;x7Oc+=Ijd{WmoRo6Co zAXC?tty`PyzEbB;H>8hb>jHPHx23x&#iLgp?y)1!kL21nOgWd1IVPG@>#wvdpKO>s zl5Kfl%zM@Cn|R=*^;eqPCxuBd+q`=0kx%!0^Oe0YiIa<`o7ZI<*S*tu@wx2Ap399r znf`&X9iQ&nO}jfMTc(>kvyGi^hb{)br(SOC&Kx*6hB&-2CA^qOFPL0*q4Vs@%dK6x zRgX>8tekM9n$zoZtt(#k=3JhM+80}L?cMWmG;VbZ991H||<1{N!r_;QOd= z5Z*_9gYW^@xND2>L9+)iz*^27myKRV(?K6E5v7k%H( ze?mbT09j|9&&>t;JsoscElneLpW#f+b0x1{Q47!>KfRx!DEmbPZ9ICejq+D38v+4MGw+|SNb z@$9Np+YEyu9kPl^30Fm#rI_p0wQNHgLr`4ct>R+$rN&Fzhubb4d+(m9O}l4l*Ua)R S);_UshCz{%enkuGP5D2W=uO-J literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/__pycache__/transforms.cpython-312.pyc b/conversione/_pipeline/__pycache__/transforms.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..283915363b883b9bb5fb3f35c1f19cb50ac1d4f8 GIT binary patch literal 50874 zcmdSC34Bz?nJ?O_)Ka(9-BN2oY!YJ8!Y%<~w~z#gMJy8727%n@G|-?0pKh_VTZxRF zSi;O88OLg3XWRrQ7BLyiu@mFONj%QnnarCR+@h68?qTj@bTe zui2v!G=jE8vybk&7Hz9;pH540eT%-;u+N~?=rk_l5shH@w)PDze|BGzU=)&sWWgk) z2&sZuun1{Fx?p`Jbzibz6YN5UkSREXEFoLS5psnoLLO2~ZjCTin08SsOc!P#oGIiZ zED&ZPEEHxVoFmLdI8P`-SS*wvEEQ}B=L>d(3xo`W3x!OCiv$P4#X=UsB|HtVFm} z9uk@n3W5vaVc`hEqkQwKSB7R z@C$^0D*P9OepWPDqZ*$3x6$) zNN*ZtWqm9}koGt9t8k0O|CWXSm4*K&3vaXV?^rm-!vDs?->~rSS@?gk@E=(Ck1YIz z!tZE>NZfz_^osdA+I=amRF~OhKCGL9Y2>m9|Mwc)F}pO!@}xN#brU*-$g$~BKS)vG z|9Mv9N~zYoA;I#T^Z^#mP$>Vh&zi98e^&C?66X2uN*;T{JfEWOv!#ke-QLh~U|(j! 
zC&%$*QPd++RCzKh;gfgbKN-I!zGSqAT$}8KWqzi7Yfi#EcjLcTDLpsglb>Ct<1%sdE#i>XAAxVX6VCMF~@lNG(p7 znuOGngsI6$ElrqeLhAg4sVPWZkT5kBsS6XPnvuFFVX6hGixZ}%A$3W@)O4gSO_*v$ zYFWZm8&b;?rrMFZEMaN}QkN%8%|z;ogsBdsRwPW#Lh8zdso6+fg;Z$-AeQ6G$9Rq& zok{pL_gUk<)d@?Pf>PEbOwB{;+Jvc7k-9En>NKRTPnbF#sgEQ~oq^QKgsC%;x*=g| zK2oa^rWPQzI$`Q8q}C)%Ekx?ZgsHQUx+!7m9Heeem^v4!wFy(_A$3bkYHAerDMISj zGuomO5VM~U#;q&XMv|-AJKDT1k)kJ@qSx(gDYizE+--up*%dM3?m8VYHMb*Ao7dyB zd!2_`T>0&X^W~o+CQo~7t5bBhM>5=PN1ZKR&qjL^)*#H>Z*4{ zj14ur_w9%z?b=hhyMf(n>u@*L*VN;NYz+;RKWRrpL*?Eb5kuXMIzZJmRrITNNkqFe zqAiPP%Ol!l5$*Dbc11*65z(%U=$0(T+w?o?Y9jiEN9jGAch}TJ3>$as*^RJvFMh8_ zrSPJ?HFXhVP3@-54Jf{rJy28Ep~uI5+7-N~&6lDB_q0V)^y)oru@rsdp0-qqexKdP zx?hmsKfb43C`I4Ar(GmP2P4|WQuHn9{sZa$wse0-y8lqR|46#OE8VY4_xGgxkMC($ zNztF&)2^1H@88p|k)l7nr(G*We|As1jz^#FV)wD0uSxLJ9Ypcm=##*+9F)H1>1R0j zx|I4X2Va!HuW|4j68LovepdpY^qq68HiKFH7J>4qlPK7diNv1b%~q-;uzV82Hgy z3GC;oS0ymO!9S9~OB@`Kz;AN!B?)}_o_4KX@sB7xuM;QJEz#~l2r1is0^pG)8%13%(p_S2`|;;GV1c=`uC z^=T>d+Z=pG0^i}_vl9424$8y+M;w&f|6LCDNSUuQ@W#(1@I9W|ErCDgsnP^}8nf>y zDFw67FGVr?I;ALP-&c6_>|4L3+4iB-V`q7Tj&*%oO6F}g)-8?uv%HnYdZYn(mN(N_ zk2CS!E|q&lYTYYR{ja=TY?g9JEpSB|^;e`W zydt&G6{)UQq!zg%jo~ZO;JqS^+AF*tKE3j@h_$Ky@$DOSY-`%Nr?P2#<<4eF>V;m` zXbku(#NXYG2)1gpkf8ZB2f)yGYQ0LnV-m-SEXw6qq-QT_`?Xrl`KboY8O`_gr?gkK zb;Y`fPIN_*T%Km<2^T>N7e}e| zsOg0Jgsa7km{)YRc@DRWt)7J^&fH5`LsZ7;6XH6t0p+}eKTj@#f7JZ0OLNm+5nS@( ztp4J@;y`8p{2_bA7asbyvoi~{jF{N};d>wc;KR4ao*o=~`ke)1PrrAdw1XJ-gGF&t z8a<`{#{G@LQ%mNTbv7OVSnQi~u&5C5^3Gy_MU6sf(fU=53vn+lDaIYYEi5gb?W<`# zRCMCR`o@L+lkF`C+bFH1QSiG2KrR8WzqWouoxgfVt=}m=<~~{6c*u{3maOzIU0jTJ zF70eQAIU!}i#cjd+gKWLJTwbeCs4Yifq{JU%GYj*FgY}n!7xqHXny|wi_ zcGvFk*H>=exvj=uzmrIi8Y(}(Zbw6Yb!~ma?%EA|8fyF?Q53LtcMS^N`8Wl&mH4ER z*&Tgoj{R#($GW@6y1T}@pTVTYJRa-*qp|M3vF508A(S6w6j41lUXQ$3yiq2=9 z%{q44!PrjUzp(g#nmVIseQWW-MnCm61+)DC7A#oM=wHv;whL{0)ayOrS-)!0BLAsV zr!Wj2*1s5Q?yd7fO zeF%j})Vqxc2$^_veoZ~@#}sse1nH86ew|uB#3&?2JVwt>xa3^OqZ2fF8WIZDYp&|+ zM8YSZP`N-c#+LR|E-_;4IB^1ek51f!pT&I$B&;LWBOXa_@}NDzdbpaJogSA+wfr;6 
zv;;wy=3}#E)N1QCjat$!Z0gzcd~H|FsM*$Q@2R-3u4i4?JT>47Rt7f&+(YJhA;Y}8 zct{g55VDGC1S5WCjR1to8sR}7U#+>IYt|jnH0usz0r%?sy2%>Cqdo8N>-u!(a}1hJ zeW%V#?_-x=FUz+C?QwkDX|1?QtMNi=$S*1 z;40W$w}w+)tvPC6tn)aTyP^?0>Vq?Tec*dQLHdYM?}6D~n?5t!V9$%>VaEmrxs_%O9itf|T$QOHvCnGwH^Cu!_cVt*e`F&dM; z_)M>{m3v4V_l>N|)rL0=2qGrYd5X1XOc$GwiA{pX&}RLb$;k-;a~n898OV56y5!ma z29<@24G?8O^WP{`hv~6+smkD->>Zt2!5D7}c$xZ<-f~PXg8&GAC#{2T5^J%{~0}y?oXb)2OiLN)(N9;n_x@uHA=hSefFpxk3U8q_S+|$ z>(cy@Mv^V+oeJB5G+n)>IHS&&a&Z5Gl2RgM#I5LKu>!$8?RtiueddEzl{;%2c5K^` zU*t1^QqLzkU3?sQKB2kivlJHQS8)ksdwvI1Du1l&#e84t!K%%bJ0S(cb7_7ekQx`R zXAk?#^vF@?2_Wu#-&AlAwR<1iR#m~ zcQTaa(c9j1)a4XhqWCrB8pWUIzo1vTG`GxY7nb)d@0}mi1vkH5ceyS+yF6@O)>S=f zwq97>v%1#MMF>&B)aGLsRF6r!EX< zEb7{N%anS-)MM)18Z-ww2R4W7%Lct+)0(czkCRQe?9&6zSC3pe5_GD1nd{{M(neO?DQ7WUUXG>TjpnUYrp$gza-ewz(ljwu%SPo#Iyz#4OlE`>_REhp-=3G*}Kn!Nn)t z)#bKRe}pYX)tX-Ift^^EjDF)})N5ywKS{Lvlj1GJg0lRG8Gb{&1xj6JAahvhRp`}Y z$`XyQkS@XK*8*iFVR=aM(qL!TF^e2i?+S5|>`(SQ%$ra8laE=|=jE7?EH8qcrUW?@ zWEyvYSCD6CN`mK8)#;Ffz8M8n2AzQ@kH;bzTi{C^Gtv`i@=E=Vus5zkiY>l3wTQu67-Ew{hm$&*NS&+WrPtyXb0l`Wue2 zh<}?4$}$o-oM`t*xg~&QuWA?DTUtn)1|bBGp7HPSfd4${WQ@CiH%b$a`|H}hP6}N9 z8jq{>1oGFMcD9~qaWPi+o-Q?2Jc1hgbO(!k)0p&uq)<*LQi)EY_RD^SF6s9iYYAxfGpRZ76u zth^>HeWJ73+k^%4XcG!^i|uVBHt~r4c;*iNJZIS2HEz(QtQ^;CjH%p#$UAh;x)HD@HNbSSGdoVB2*`gZJtp{ydxI;Fe%wkiEWT2Iu_ao)I*L%yUA9Ic&X&S!#$wV@nN*mibZ> zMXgFa5%MS(afb@k4M9n#9?O%#Z-^3o=;R519M6kmex2XIxFlMR;&6lts)^KwLh6cxt&Q6gfAt;s>L^_ zvY6rgUnVGD;Scc~wGcfwSr2N@k41fzP5x^2&J1-eaZ$O#8kZT~OKNNb z4(x=Smy3Dx84e{$E#nOCX;VJ|C8OcU)^ZT3`yt0K29LpzBtxvMkh1xm;&aGZoXW9d z4?T5^f}<2rb1@t$o}(DUlpPdf=+eZ{rT7Q*!yi&`0YOCT7JCV3B|zVH+8c*XcQm!S z#3QaIjwv~s^oXzGiD^J|o~aDmji#k{)!a(6UU;nMv0iu35l)*c;kZ##N>}A*s`-3# z@7(8)pIbDVIVDigw=xFhNjNQ(RXk8JY$+eL*?TMdjIZbe8(%ixv=!dY$PL&7yDnw* zEf~p|Gn6qWoH4Jv>hElsqmJC5zHi4ZM@~S0adS}jLfvTY)K}9lr3XF3xuv&q^IkPw zG6iQ277phwzirFzPwPty9DTL@QhRvDqQSzU8OyHghi0rEwyhbpWd-bgsl7&e^{zhH zGL71DM>!_ybq$-Q;VZg#^(1v`$BQ*)3;&kq7kyr$0oo-Fr8uxRxcBw_m-i25hv!s; z%`2}j3z;7Y86LSGA0?sX7(RLlk>_-Tk^f}d)*Z%dc-9i5 
zIHPhWTtt-mHBlwUnXlK9-jGV}gyg)3lOxG|?6Nge+>26UFe;ToM9tkt(D&calH@&} z7*$aCXUy67ITSoPDS;U!bUqgEEA@VzjB%K3YfXN@dKYe=Mf zf;}7pcG>R3XjH_!FQa zS&F|hQkCG(d0-)^uodN<>mdOkmEglZ*9Z*^o-Lo>S?o(8TC7s^F3cBcl-$!V1l30} z4Tz8#y~QvKi4z!lDK_#E@dEvDk%AW~Af)19(hQJGU7S)fHi@niExvgQ!9jb$R11Ps0CBFXG?d@?F>$v z=z=R=Uwe7&pgmkz7MfK)STmZL*T1%JZO|UhEF8%!9?C2ZXO@OC=8x;O_J!kmeOme* zlg46G3L4Ft5%548vuwaIkQB;X6tXOmND&edtOX%c0h{j7-!4Fppk1Y& zL4%j>C-v5X16+a2=w@gVjE7sCM?6=JTw-I~&ZIDUNNYsG+>w;+d$u*yZi78mv7Y1a zD@uD43dzrZgtvBXqGlGqhiogUaknCZ#)Y=2t7hEbFwS~0qd&JVcU*($E6u;9+voEo zbT(>Qj1L?&S#G73BchV1+C8etKJ*Zhb)rzgC2M0-yE zB{7>`!EDM*k0&K3yr#@~sIYoKqY(|nR$k+z*o*4Puor1_2z&KFA9z}Knr$r*bc8pW z_02j`7O9vM_=~=6!lmL5dUw4>d<-|}1GnME<_|PPT@+_Cgh)+kdWh7IzJexi(_#1N zCeNAHL+ve1hhZW)VOl&vJy(dl)N`h^Zs+r+-pZb|=S{b)GXhV(dg{`t;FB-=hO8wa zQwi%4%uqI4U#GVYAb=J!$|PfV4I5aP-6WA=wP1t*uOPJyX0SA-GVZUP_$G23pgL6} zLUl?@@2;Zpq%)Sv;|Wn3Pq=+<(xgutNt-#8HZy1pdc$e+K@*tmz00uHR)ow`Lx!pM z4le+Py(9_#>kd57W;DI{R&c!34^a1I5kEc5J?6vFF-HH+8e`wwfD0~iB=UxY-nRJ z_plvka<@83x@oojmeVD}>JJxih>-B4^H3!-_=^StP%LF?~;lyH7R!jeY2%I^4KRnP zY1*)(`tc@AIYyZip-%Qudv@0BuCLl%yR(6k7=g~{bw;HpvvC6Z0w|0<>da*7Tu!7X zV-0n-xdaa}dzh?r@iib*9ZQGX+r4d&?0Ccmhob<|~&$7{Y1OA(3!o2ejQa zn3T4x&on0E%x=SIW={XAzEv03Kn`Kf9L_Bs$z3#*yJ*lfoV)V<<`0T~()!Eho9lL| zOCux?83mjw4;~tr4Iz7X@~AnZS2(v`%_I@hM}B3RF>c3u#xt;xTv*w&vUhLTJT2e~ znF~XP!uv-qeIpyWG%LOwF7PUheLRv;p7U}a$6yK~Zzo(FQ%qjwHE5{$V^NsISm!gW zIV7(0B~wV4g66gO8Umnd8WS8}{{XQ_YVD@F9lL9)D(j_z@iu;q!51kKPCg`xKSf$6 z_3S!Cq7X%3UacdRGKvM9Sy5KLESmwibmT}&8Dkk7!~Ujn>MLU4V4?}Yj)C%p_by)=DyZRlUfIM z`yST`@dM<1hFWMbBD{rsBi4c;Ye8^!*g7w4E~2){B5Rw#tbhY8Ww^fc0=zgspkv{xE)1I95dXcL4(qjOGxWBfDTH3))c)wMJ;CWp0ToLY_ZP} zofMbzQR?DL#W+f&cu{gOzG~V7YG4uU(e;INO3x%OC|weBXVh^U+yQ>;=C>ffb@AJK z{C0`ouJPM>e)|Exy#UFv))mp#M6}L`wlbpah-mkS-$MI|q;QF7+auZ?_q0#=tWOl} zchxv6JNCG1+jkr&j$vb|5g8#k9}!LnCTJ{$k>zCGRSeI^@yu0iG?53DXJu6t>?Xqn55}fON`O*k=Hp8IK>rtsaIY%$X)?Ek625~iLzFztBP48{`pznWDf2>E(+JD1vq86o4-qOM9uytkFyh>S*LK!6k)eoj# 
z6&ir3q>1AD2OuO+u%xGhPF7(Sm4!(c^=v%efD4ZBEa=Qw9)XL$PH7ad&=;T~#pTH1OM$?JbY%ra9rQ7cOY%SbVdGNPuo7DjV~hIQ zrag7F4fV9dJ&Rl(T51&9<IryIE`({~k#}YUNr)K-6Uxb!}4^g}~!Mf3R(^GL#2p zP1w3NY+k1{dGF%^f1qukGL*4!F!g$M*jzd3iua#6IwxVpEMy)JwIc-wd>LrT=pPiC zD*iytMaxMX7FqYh7Q3m%dC1k`5&r>=_d2y=8EZwg0T&H`(OMifFHzbplre80a{v@V z*`)2I4AmqQ`{jlzlhdk{Qk(%P&)m2JI1b557R-?SPFm(Qy-Ko1O@}z8fy#+34OGL# zE4nmg#RmaA`yd$UwgV+lAG3W7`NdB`mEmvXGDw~_o10U;hBAF~nSFbao9&43yHEO0 zx}RDM`*^?mwEr|kmv;J37tbzEnm|8`SMf9_4c|vho?B^6P#|xssjRNq9e;8;)FC3Y zLmNz%NFMQ@P{j9Xcr+t|N#wgd`uPlOl7_t8$FE7D35G0MjM{vl zj;1mLSTmV|e#@9>n}ups1=WTnO11IH8hjVuMM1cOTZuF=!b95lfz2OKc(|b4a6#dL zJxr$$L*a243J=;3a@dLA$`3^E8hN0Gr1%qJQa>Rf5a+v}L##MCO0{teKScDc?lvxg zPx82&A{jz5V|vJSx+p#AsMB-Q<%t*$cfioz;5qI-!2~`LJxp*SMy41QX_9g=Y6@al zx!J`zfQW(QyuMk|D$Z3PP1uMAvPtCuD9=v`TY{8mf1uWGmG?rJplt-5aG$7UPUtAS;&uD-PT<+VeOqV6h!9f88$EhCQE zLyp;D$J}nnka7c=0Y~(2)wiq}z002W1$LeL>T6{|@0FEQB3~$NMfSs=tt z^7WYZ7VjbRkExlfc{58I5vwupMP0wC&or(<^p*PG((UtEl14-#R$=_o@?OiZc`K&_ z5~)HIi7S~j5h38konLX4yE;{N3Xd@QXAKY#poY>Dafks@YsONgf>D-IcHy!6b+%L` zhKfHxb`^MTbi}8IlO~6hl8v3^>=B>K5et z(o<8(P+GRF_UTsTb6W{Xj|p5SnMs^P4CAJYet3a`vj}3SZLXZ5$toLrVG|FtYL1$# zq}>eDD5}TfAP>~(*{#22pBdD?o_slZKzk*1B!BTx{^D@{(vkd?L-{MO&kpCWAIYy7 z%C8CMZwlKt)6_0t)>DTDvIk6qhLCkx$h3@2fQ~CDyz!9yS)}C!5lp$*tq#U@h<+t~iYVuscstYdus z>(r0Tna2*QiUY1P5y*q#6eHEgk*I;4J=(!x>Hf6UNFJh#T3QO}w>8ll1KAcqeW$V0fK7ral785Ohq%cqA(I$68V!;%{M{u%#y7Oeu0b>iYd|e& z$9nYC4A>!Q@$00$g7(nXGLiyHqe*a`@E(;n;c)_i?-^WzDbWTd z74B?&t|l0%9&(8;fh}L|wjesMHck_f{t7V++ni*H)J+zW{YD?! 
z18a?m?VU^U>p8(&oEo+2F#44Zo=<3fW*Cr}@Nq(u7JT$Qv!Nm7zj%iNqBbR>Qv95L zU<=Ui5Q`YusDLcgh3$>5lP+xNJcLcdJkF)4dg^SIxa{hZ#txxY--3r7JIvhX4i$9) z&k>n9ea5SgUV1dR>*aj|_BV5`<_y~3$Q#`C?xSx#dVSa1`+E2E>aj*Sa{6ogY6Hy| z>w>$6K=>KS=q@$r8gLKIT``nfalIhySVO{2hh(;AE{vr}Rg0O7tUyZNq7Zh|6a=ea zRHGYMHZU(_T^ceiWvgDXg|<1vINR1#w`co?n%yZw0ou&08KZXpCp>-+!qJTE;hcFRIg5vK7GwWS&hl`^imt66o3pXs zws%+esqXaKn|A-&R{udkZ$baOzIlOyizOo&vxYKg7ZLX@zC613-Z4p61^11rIWcx3px1} z&kiJ72o*&`OUr`%B;84IhaKusMn1o1dGPA{Wl96tU}7|TN*#6%7D6Vryc0B4v?Odg z@jvuC+dLup2;mzk;Q!;khGg_TSv3*MkpKfL7@7p*3Gyf+5#T-8En-u>*o{&dwax4V zIr1ZAQ88apJQ&xSd}fCrMiBMQkC_Y-1IF-L^fD%?#dPQ$X64kS>8%dfU(LCc6STja z7qA33V59k|%clmOyy6SCgfhyyWE~jVtDLo$h1jFf@Es&yfJLt+IIs@)Fli%)!U}8%!T!lMIW2W?z5Hq9JX?d zMum1M=pJ;uzx(Hn?==paw@fNQrm4WDLoOoG(*cwXH8Cz&nNb&`;lGEQm`xR>B;NXw zXtG)|x%7xrkRg>CnW#MaqQW4A|HUCg@gyoKvSo{|!vBgHi7{_rV;*8*Hn?RcH;pBY zu!$^%Eh{243w%cCNuVWpjXsiQub738xbJgG4<(8Br9lQuIzc3(^mO{*66u3k7mk{W zMol(!)r%YYxAkrNJlVh)25=Jz83qMEooB7_{}(48B_A;1MH8-TMpaCY7EEF-2o|^t z`6l7?X=wcZl-#7EMMlGc!+^b{;!p(M1t#O`nL`o13_QAyDNGiti9OHIDWjl2yi4NRe8K%M~S|MLuRwbk9YmP4?Mh z0kXphw#KuTeeAf-2$`Z}WsN9FFL3)D0t#hia1AuPpYii)??G9#anhB6IDfV>RO1B? 
zp4FXN>|4XG66{^mVdoPWa{e<0u~C)k#UpmenrZSKY_b-THde<~rzrDHNScBw$CFsd za)Eeu-Eta98C3JnlQ8QPk~Gpo>JL^ z9oFtoeu;7!g2h&~9*Q%;Afv|^2HJ<;B56$xJF1$t);#_w97)uxMzVbS+9_m#!wI6> z#OsuubH-#B8%b`06vf-#97*O&zrfuc84kgtS|s%_I680BsdgL%L2IeZsBt4%G7*&M z1r#8bV8#~GjI|;H^kj4Nr}m`=%EPvNn8(^OVCiGa>e?g+pK0u5Bzr3>r+;tX-au_o z94LQt)zwvRtO;jTz~(1wO8>sTeL?-jCNRla^G3233}r0{XDu4ZS~irmES$BXJ9X5Y z^~&@i^W4$wyzWgrusDF)$b#r}&uMIVojq)wcWvpL6;~@lR zjBpF_9pkfQBizTs=9!m|1nnVndB{*cxP;&DCk+)GL4uX-*`F!lGoxJwSJLD&0YQH7^^J>k2 zFB%PSxfbs&a6_O-c;#=RJ+|_jy0zZf-?-CvB4e6@`JIVTu-R9M6cVC-57T ztZd@5>E2G6&Qb6+3XW00=64F8-%LREhxj#;O5!iRhd*j#Y6?8rFrQzZND2rPI4|_N zNS@jjB@M$g=QEr*0-2gp9Lspb#pw98G}B4YKr`J=6P!=?8voN?}ICI{$szKemrngLQr@pb{y7T7z^`nmIT&X%d zXT^0LoU{x(){Hu)1a|dp>8rUdCkA)_%270KMp4*`m6pRR5fEN=Uvdwmgl8=Mm8EQa zI%+>YL*5$i3!7&JYeFio7fFLM8;Tc~*zESeXDW4|d@0P%eEj-c63y^)NndjRRubg@ zv@j9M5UPz#G6#*Po)E7&j0nNex4djgKsfR1IdDfoSSC^MkRuqtE3 z6uhhs89`Zd#F;8TscHgX21JxsB3*zY^MVX#yP%T5S-2{Fc0ZiZ&wk~3V+Z5*hk?)zXHf^Zg08{kc zTe07W4BtM(0}zdM(Ze&*5!_gXR^xnm({2hWotT0E8+W0Lk~L5sS2LOS#;~PQ#=WSP zhsq>PLPln+pgNo}o9zY~F;5+WKhvsVbN*}f136b32RGcDvpiNmaLb%GVlEgm7X;@G zn@eKTRNa9+tJm3or0+<;dGT0pBV3N03c5qq#UayTMy~j5%yX`^Bbt2aG67Nyi%)AX z8;t=(J<2_<&^G7ThFAQO1*Z+{8C>%2 zinmr=U-I^I5SqfSjY5@f+_w?>YD^D zsr1KWEeRtUDvw?9XrhOh$qG(^h7cCY(o9a&ztLlWxdUS}89mAR56#M%qI*eXtO6fM z)T6SJ@F=Cy<EX#TAH2Mk!LirZ+iH9Bmif9%+Iy3Hu}yG$uF6ZasuMs86$VN^#9@}e(fD^NG`Wkq?&3p5U74V)Tq zbFmBW=V@%?JB(DA?XKL*Pc30~TQNONt1+8}zkbLy;rI3-G5J_fdO9IiS9%v?AxY8} zC#k$gylsZS1hJke+CZI?ENa3f^FQDeAd(aJGvm)mb3VZ26Ys^F;!ym`n;v8|YK)d_ z$lL*Z1X2CWe4T=nh5WKYegd2Y?-$Ciu+w5AKSp0p!{2xC%h-DYOTgesbrSwyb5DjI zKXKqqA~}@cx~k^@LXx1OOm&tqvBX`ZlV31_tLKyX3duC{Y?)+y%t(c0@=Torn{ED`Q z*>`T__MN?Z`uF$kzjz?%yqPs`)G;-%Bj9@V_@(17w}u?^M;uFs981HFaXxztcu{az?HZ*HTbP z!GwN$gs+xlB6=U0BW}P0_0$>v%sWF-f-iY-cmHF3kBvk7z2vpR-_nhg;F36ZPVmg& z>?jk_af9BtS^LU{S2thUJPyOP&Dv|3zh!sDf-RJ>D72*VgYv%;Kl1;Df7smERsDD7 z+;Lq}%F0&`y?XT0(Qyr;*UElNx6f@FTL$ej8?}^X1R@hW=OPlV4~(ah>_?vabaYm>|D{kHtHLs{`Yr?ms`jFReqVOSFUG?Wi?T;ZuJ^Fy2n2(+) 
zNzl$}PfSUWLv086wu|Zs?fE~$Dr==LjlEQBNl5UbYME%oD9U7Zyh-R)ye`f5!!N0NZWnGs&*5 znHmw>j`^u0?4S`@T|mdnd0bvD>tk{~M#r{LiEMGDK2-(!+#;Au{UHCZ$7bo8x~dwu z@T_J|=^{zY)tta>t!O`YM0<#P@BAYG@x*;(@4)@j;RF-s5fNmsN8F0J^$kKW=Me#d zfvH*=b}Mvkm;GNv`QT)p>tdbMJ+C;a(@K2k#g*Nr+m^ImZBNaMRe_|7TW?xsehksN zW7>#g-jHM7fPUC9e*$F74P^9Lg4>5|OAX7{h_TNf-G&YT+^zm=KQU(r_q z4TEPmvs8w*4rC5zF1T&W8qJv*EbRNrfPN^aw7XIw)-Rsv-b|GGlRew8WTUbasT`l9 zNlX6^)%&Jx3FL?=R`SN#>l`-0AD-+3Z;9dq@BE_irLwG%+;A#pF=X~a{y@kVH_1;E zZIvLZoZ)v6mJg-Zgu9lwxE^)`&~nO{PsXma&spFOTnt>_bsrZ}h8N!@Rkxz@EPX=QCGTm)++1K~w55Z2MrRmFOrK0m*dO|Cj@NMI9(%^AsT zeiw>jimhl3G`AfnA*w|&6h3tn7LlG4&SqB}JAt3jG+2tFn1L`(=EzREZLz^G?c9>r z_g&uiTH|@$g(NsOJlX3BR1cd9$Q}CXJ9=c34z(&#NoMX!8M!X${{%PcLj2y8gvgTK4 zgK`uaO&cUORg!x|9TTtMSrsF~Q#%&>)`1v9AvP08Y|Pi~B+eOsPCpQ5OlSMzC|>y9CYMy4!J=k?Do&9oC&P@o6`rVk zMVLE0+C+zs#q$HnlmCL|(M1$POz;EEg-9=crsqs(O7XyoVQX2e*qC_Vq`st3&b)z) zfk(r(Ww2V8^Vj}z*|2q2Y<@+|9OuDNtE1b(gZ2F5jrzNY^z-9FD%xXzM+O@j@8n(^n;{scxu z@L)38YD=65!#8UXK9}}ob@(`RLBQJhKrKGp!P#L)q|6q)Rk~DNI}TlltEpC9<3kr6 zb?Aa>4HQqVrCQA>D}Pdy1WTdd1q1PU5B2U-WCw5|S+Z=LhVM@HD)c!x1;Tn0E_LUM zTnpGa&cGfk6Q0eIGe^}+(fC74l&X5W#(Ptw_hPRUj%P6WQ;-7%FfI@f-RTLGpMmmI zktbE0rBQ54snp{gf_qk^VopfQn}a6W_Z73-oK5I@L>(}WVLC*xje9cz_p~x zWt>aH&MlG=uc1$p_7)z=9OAO@?Mky2$ta zcqsNb1)oyTMZsPKF$AZwTz(TNv|OqRSuBh+H78uEm_zP9B2D4$R7abXe89I*Y!=~F zI>FF}t#{0oSG;nRDG+a3$_uPtM}v$nFYJ4s02uY5RBLY7rH|;{uoGoc)sy%YdW%mgeS01-s?IMyUi_&>cdP z8}TYE61OsMSoE^@bJCHe) zIX`5X&+vM&J<4Q-aj@s;gHmXvOKaK-Eou$zMcb>gWMB};y7TN>Q;BKADV#}0jCX2`a*+koKK;$`nN zU!VQ9`-6hFTEmO0e{HK7Skmk0b;5Abu*MDqr+_vJlqn^`{of`Bh3tpS`deP+~h-y<7 zT=PIuo3a3c+N@b4jyXe)Iia}~*NeiADz?cjV7h6>X&m+!>jOC#8;2|fV7VnPm+oJi z^2b-8*~V8zbH?%k6<>TdM`JJhB4VpyJEJ~y6l`=P{lb>DaZ2pYD8dwMss25f8{c&9 zN@v1pu*Hq_Wbi5NG-7CMCq}}3{0=!=lZ+)hM0iX(j75@X<#x;}-L2|uVWamwX)`Y5y77f0F=kqsvdRBKCn+mWWD zf*8kUnB!6Aq#5TVdC$l$?8W04%yfPk)7fD@z*+jyo};~I;9fL+9t;Z8ZC#sgIi?2= zhaGc1)0mAjdQ9Eg?qyMD{m>Mu=&8VQbh$8k8Sq}O7+O>{TCwta<$K0I)qk+@7v`H4 zTLUL?uuj2c*QL&pnM;ReE*+j(9@}Z{V*aFg4g3ww2$s?R-D;`hoDut-m}p 
zv~njTJZaPKBxxK4#LP(9hHd3TYq#SV)=`u7LV8bnZ^y4p)5mjAGlidsI>{m@Jck_i zU5E5yZOE|pe(43xXf~@KMm0nZi0{KN*}yRIP8VrK%Q~oKX|NKenF+Uw3C{N+mXp5WLY%IHEE4-8 zl-Y|KN|$<$j!e$M<+XBT*!hb1uAx-!20YOg(5=R}S_j334VnVNxa>Nncw3Nv0m*2B z);?BKz?~Xmo-&G**WSW6IZYV^Szjota5V-$+U}D6+H_|KBCmcnK)xwVH@RV)YIogtLi)> zs!k?mtQh<48S2w=jJolmzg-#fag|4@la~}_N=5b+6D}FuQyZT#!|#_*zi{EghK@Ys zn<#6#(YN5E9y%7T(>JBDh>on6avWf%$^mXHUN8PLp7c%KUofZeK;tQY<0%%y;q{HD zN^!PEN#m&)wu@?j?Z=@%Ch+>G0r=%!`6+`5^UeARB59lPVo?Y z_sh#d6yJh%MM_kERCe&qBK$^}le_Mix-M9NlQxUrD*j>VE0sf2*Kx-kXR~H%1IaVB zVmYu|$1G$&C^Z*r$0lmDG7|gkU%O`ASkG%?J>NOtTXe8!qx(pQ==8Iji*Ahz{SD;! zvIQpzIuEtDC~RCvCN_=2#$t0Msf8_y`~)es@RH;&l0rVu7+1mbI`Q6%&%@*)-yJRmU!}cQ8;?Uxluyx6dGML2* zJ3oN7E-5-kM!S&0WpW{CDk@J1A~9G}9>!oPYp&W$@^^#5 zO6g3ADoHurR!F6?z)-<~fa+T6m?9um%knGFbf!Kary{uR#Obyk9nP(#lS@oY{HQ-a z`VcMuPcdwkA~;oRBON(Ny!M1kHCmF_UFnPxY{MWnizAdzDF&lI&5>_jqU$f#V@D_1 z9s82mtx@38%B{%q$vw)f@LxDwm0=cQy||pqLP+BdflT5>C)DuufT0}f#7I)pMkX9n z6h?fH7x%rx&m3e7*&k45qZg;ownUPeJG||O4^Lpyn4x7P6-&k8)+RcSHIl;P#Glc2 zU+9;VOWpM)ttE$XBC8iG7I_KkLOoX!!lKnP0f#n9hqK;Fx6|>2JW>FuPR1+s!K?wt zf#Cv~d&rI*W#_rSKVi z`s(THsUK_}UbKBe&e4U71`DpbAn%&{_VJqwD|`0_DudcfHGKz0vWkYXiiWdFJ`id1 zDfVz)d_0)h*K{*$&9!Vg=>tmp;x*l=x3Z?PqgE@zS@XM7J~n4vni7jhqnJKy6PuwJ z2F79fuxG3?WT?EK=`*1H8TO>bBoQZ|O07wNLz|e%UmEb(LXczzl_`rSi&ljZY)ticcnJNNakuvY}=-?VBM< zFI!ucW%VjQzJP5=o^U#ka!`S^ibgq=K1EUsI&#?~K8s#(eVc znmb~iF=U<*@D7`2OP$$Mb6YwRc=oR>Gsjbq69*eS)PXvRP%(n{f9cL6pN0?6{fX05 z6dae0g1--Wv5PtEX%V_tSu&-*#Vw|LD_QrN)l@llU-!ae(KnIEx|b=z5AJXDjJ*_G zFxLO`u|V&FvA_#sUEdyi>3d@@y)oAR^|Aiv#`-Ue^*@gv50rAc3wx^=`a4SHsC1Sx z7Q|=JmlJ!PtkKBhESb4V$GUmMuOnk4b^37>^$?v71@47aJ*&>I;g-XpIeS8n?jN4k z`1^N1anEe-u40rAI6hQu;uSr2Y0fGxEg@thmlCSs6S_>{Db-Ts6V6V+vHC}sd|dL&s3>6RVm1(K%7A+e>b%l zIov!-4mUx~F@KRdW~9otPjG*cFX+x&ne$CbNn%b!lSxzxY?(~zVumeC(c&rol*zu- zp9JR^pwcAT!*qTs=tG5uRC`6GZHVe)6DbZgrEUoKGv8#A>Qg7FKFNV5;^aV+O|K_V zJ{R2{p;#3KjO;*nOL{^RpGC*$2R4euhSQPBPpg`_LmaHOa#trlxePOG7wg zR#(mK>9hVtZ_G>@&6$F={k*4_Sz%mU_54?d&4s~?U%|s67p-1Qg|9uh*9$fe+e+g7 
zHlD0WF<&tCnEoJjoJzz=G#cFCBg~!~Fo$ik!LSs}c|GrP-j!)1wu(!}fG$uuXdiUG zo&7>a$X3x+J7lZ)*z8ce^u?as#+C{RYe37ka_Gx16<>T-fC_zaH(g_2=^;korOH*+ z8HNuRr&Z@9eYnnmFh<})sG4Dt?uW!A(NqA;NMH={@N@vp9S?C74_f~h!005-BBC0d zbS6ddVya5Tz>FUr<_!nc=e7m-2|U-(2oAU=V>rzj-2J=+7b zhb=R~&_wyyBKnk_KM>GIDfXp&w=%hV(?vr$iv|thoMqw6<<|>_GFNw-ZsklJp0;>o z+KQoRE1)=@wlgh|T2edCg z6=)qO8{nLk_Kl|oT0?njeq~v!5Xn?;zK{8waeN*=bG+!`sAWPo49V?77_GHdNuXl} z{UNx_*U)$ag2cxxd-qMl|KgAFReVX)Dg9+MvBSDU3LtLTNn zCRL=t;1m@>FgR7U-eR!Xn-^2QCCYk>>X+u75tE+o&5wbt-dQoQ%{w~=wtDBrz;#y3!O%bJKx!oza9Lv(0-~d|A^@F`PvtDuR`2%J)f9Ps~K%pce`4{pNXQ!M`%1Nk0TtFsM>?W^)#ZOwSB?MVLE z*I!UyoX?Rf`?0y5>O(biMP;Du?6oY9h!?k^I---m%vaKQsHnz6o_Bn1e~s*d$G=l- zhd!p&Vn_0F{e}3i1D_b?X0k7l@elZtMq50Q$NJVa4*dR+W-WdjrF z{b&<$Bn!p09&*DQrkmDQj1CMA_KDI?v%~JD4)m`(;%H;$jPwa7;Gn_lZlmhtC@&*} z_jYdcO$q1@j8dv4dpw4&CZ0AEf9%|W=p`qdHrOL1euR9kq82|ft+n~W?K>2?kBcMh?!n8 zLw{yW$1I3MGIn9AHAby5nKcs>ik&D*+Sd-Bhnx_^DYI&Suyd-so`{JsThnpZnTYOi z8$UjhXcvhQmi)p@>`d{;cyECE?{7dacWJDrKK-MAjm?Ud=mrK>}xHRDD=<4Ky-+-oawdd+w;fhJAr zv}?$_ay*5=R84B$aOt{`Y5lmFV2dVo#x;Pe#?uH)*QCxJE`20qsvNfxY@?^wgiICV zb^dfKN^&!(E<2eN9vf`>krke371n05mS^89h zr)g5trShf|JVTROGF)04GHn^3NpL<(uL+qpju#L-ig>hv2ykhRd7H zEl-nmu(6?dT_Y3DP($Ld;$HwP?K744Ygc7zKFoZ z>?_$@mk_*^y=6nlR5e~kaJdREBX~KM(cw~-zJlNiO=_V8uOxUC^7A%dP2d{!@wEi6 zV_j4kGHn=NPw*pJWEd(HLnhC7rIvnyYlPG}!=IK$D~aIvT-x!cg~jF>Y=XB0B; zcBa9C@Qm`oCx>QKbl3D2o!c>ZEAA1VQ~dshkvZ5Ci0-%dcxh}V$7cS$Mnp*6#R&Ss}!VCiSJWvImOOW&`Uud z1s5oIo>E_+*v~2U6vYlw&`2Qho?<)2zJ>s*w2l*aAJfbA)340Z?t2t_lY)0BV5sh= z6#IaJ+Z3os?;i;KCkpOS@UIlmaTO0hbr=hpxLA&MGI3N#Cc_i494X~Ef@8>*p;+Qm z8A1{@6gxmQs7HXavq8auo>(V-hy?tHy*tF4<9e-DdrOn@x0;!MtC@bsq_O9Hrb(I# z?;7VHA<-fj!p`W0z^53I{>kbX^uQAh7m&g7t%i*FB@=^gw3N zGg$S3;f_(yl5MB7g}PBgN_W{fw(1t@ZnLP7qCj%SRQ#UI022pN7?4W9vT-v5EIcKR z0qGpDGQh?Gy9C&?dp+YaFdc_`C_wST0X1dWs_Sw}1}`9!6_BA}1;_v^Kn7R=GC&2~ z;edk|ki`mc@B(Cj6(9qw02yF4;6OGNuxvbs6_86n`S=tDc9UKJl2C!^=7YBFq)bV-_HZZV!d=CfrO5md$ zd`to#=O7RWtHyo~VrvNl4{#9Zhk;ETd_n@996Tg}%^VaY(8WO@0ru<>4#IeVfo={$ 
z2gShS9Bh%mRt~mFU^@p-aPYGyIVei09u9&VWkq#x@T3Hu;vlS-Sn3%L`Xulv4*Dgq zlY?K8z^`)f5gmWFQpfvO0;@O(pX{vU8V+uhz)c+7%)qkoS`KcJz^xqICV|^ISSNuy zIJlF6<>R|JxSN5?KC9EwU=bGoiUfX@gOBJb^V0E3 zhVI8VNMIEQt0l08gBvAq69+d-U@Zr?NZ?iuZj-?69ITVT9UR;#fx9@kTLSAj*dT#> zIJlRCcOK>7V|wfX9)(0S@&C_SS=KHq_DnSg%6gCX&A6*MrPb=jLHJ}$>&@uT>&qK8 zBzMocFt2CckiBBSI&7~9r&bIZDz29e8P?s^>oRm*=^*7Zr}xh4FX<}@O~>g|}*`KV$3s3GaOZBK8zYt%cCXp1UiT&FcoxsBxQ zPjA1QY{=AgS-~CHr}pZ|{W_(tl2Y*soYXR8$VWE24hy?QdBZq*@Npep%}YrdcI68Jr^%`pmpK=Ic;tgwe_s42#o_F2 zAx$PeB0aZz_l12u`$D-Z!)dF!Qb$uVyE882_2l8eNn;^V0LiTCDaKS?7cnw&)kh6G zP$5>)bc432w|QKHTL1??;5MFa(k2DU;5xAE+3ZBa0DT!Y&{=3TnoSz4r> z8-!6MZg8zxc3nSMakc4l0`KhA=4mUn0nfOG?!k@$&@TgzUHIQ&gay7ibJN50nLu zUbad}>DsA*%yA8FaNg<}s0yCGJo9q`@64k2^QQ)k1I>eF14pmQPZ{to54B}xpd(m6 zP#Apd(pNtx@J>2C#Va5E&G_|>B^BB66mHQ?1@KN@mNtiyaEnd?cxQ1Mp28>N7X8Tp z@8nO>mJei(Yj7LP8uVPR8a(|LDi82ZMGCUlk85y??lgdRa<$qLDiF8m0s-DJCE+Q2 zGH%hI4De0{6^I&AfgOYO*9!+9yNXX{1?Fg%QS;&!jA~whcgi#I5+vakT_C`_^HCt-(&}T z0KbXNR6pPTJDW@<1G`NYNvm4rwyQ#Qt|oWN2Ff3=)YD7QnNI&lu@Mi^U*}In@dKid zb%EF}3T`+@+AtX;QK#pln^+ZJjQu zs$ElMS$O(`Dj{|sfznay__j>zWb1?584t1Jf%4VG3R>HtO#qb5ni4GQ0?BQ9ioHkR Kgg-ccFTDUPqdiFg literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/__pycache__/validator.cpython-312.pyc b/conversione/_pipeline/__pycache__/validator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e38ffd4eda2a123bcf95b931a4b5a9e5db62f539 GIT binary patch literal 7800 zcma($TWl0rcJ=D_yNzwYJPNRln}^#EY`|kP7@Nlo1L84EX1%aZyKmWU(+^KowSi7q zvUX9Xn;^NouyI8UN^dkPIbtF?%1YrQKdZ^dO8IGGPtsFMBt+&T{%Zy{+2Jea-tOvd zH}-0;)c0}foO2)NanHH^Z!VXeg7W3%*O3Qx6!qVDU=&-a^ZGV)7AZs_I!cWcTY7|s zF%xBC>(ABX(pFEb!en&LJy|9ms`j z1}3r#EOH1=WJ2_~6*=#kMqGF3fO`!u<6?l;xl18=GGGcYYa;;B=|Sk7C%se9>nFV- z=(UmFaDWjR(mSKmqHY!iIVr}&G=PkPx6Z=v^=|?3bIa$}&uy?x`krlqx@S?SscPI3 z4PqkH6U-LMEWHU1TZO}Hg2PpC?pYOj!?{ezT%LkDM8=-3eI{P1`b5!V%Q|1h|$R(T0eRoygQ-$M2zHxinlD4L8<1F&u zb*FamydRuFt`%xRr7N|fKWLVX`J{p-3qETSaw}m80h|5BN=U@Ki&-s_>pC9R{LF(k&rXwMk0dNeY&Ys#i 
zL_vz6WQ1=+!h{$?p?HFiM0qe=F(S-_q5)&-u~7J?90}jVTP&B-)Lp@F2=RPdEOZ;M zq^l!PS_(?Yhhf7A_F5?=qMr3con@kHS3U~f3dw>P3yJ0FHX-UUW{SjOp@~R5Qke#N zn-G;}>K_wN@R4{l5*I3T8}F&JJd+fOjU}QHUWkViC=#C_lqI6h&Gs~&5M#-xU`+Po zi|rR~7|W|6I%aHXJdu#&30VMP6h(w*+IXTeO*kZi#04?JU%EDE>}lg2VJX$<8wfuJ zswY%rE(-i4k)LERGgi=Dv(?i1tRb)x-G((c44NzGR)eW-DeXh|mDG5&tl#9)5b@QJ z1RqP4;e1KUi~7BWJ?xqXV>PV9B5EGONS!@8dsDchTfw|WBeD?F*>GrDH$_8ZLR54E z)x~hx$HK@0Ke&Ra*Y}|M1-nRrci-R!Qs9O%EPj?#II&$W-vJ7va0%iTKdOt+>J$zqlPLosR4Dv~EU%9EwCmyh+_8g%cnI>>i8aoSU0_+Z~)t z#DpLU%|wt86hjlia3Ve5!O`i+w15pH2+kmaLg263r|;<8D9oJNGD;Y>_eT@q zP*gh6j-iKuyL1_~iXWjhy%z2H~F0VLur|EAU&gb4bbx8ARpZ?8`6>n>L@P*T- zUe0`&KAm^^=PzU~XxFr}+3xIT+3?bqrO?vV-1g3IoLvPQ<#fFUrj%|qJkc5s3D&k? zm<=3F(a)^m(_X`xl(JX6kIp8=sBSf!gl-!H?I3oaa#T41-CHv5QNy@X2Bd86y1Qh{ zB?fC&JeL718My8>jCT|}F*wXr8Df~T?wC%7%AdEbx7_hWT;{Ld!JbC<6ekRgmMn1% zD@AH&2^th6K@3OW0^!3T$CA0~&asdv7!*ltGZ;`d=C2pU9RLfM#huWIyP?uqFgu-% z#AR_88G<(t*hLrq?!%QILoFrS6tZ>#by}l z=Ay!g12DknCjBj_{o6UJaES74nLC~L`sdE%y}m`$f=RuZ?E(dv`@k4a702g{ailmt z`@-EgAIrpYJA%u;6?gaC*}TQA?)j;bx4Kk+#*sGVt?qeO#+7Rbeq-&(+q~-Fy&dm< zY+tno3!K^1{cvA_g67LQ(tOoTnqNz#c`E-O{`SgCnQZCp+T|>h{cx2Fe#=?YO=R@Z zD%Xie^<;EzV6X`_i5j*P*#H-&$PHogS4@&uEtyJ|qT+_wCEr%ZH+Q zmdx4POY-vIa(Axb=&JSDcMf;@GqvTf2DH|kgU@li*b0IoB3&29V~>Ot@%0g?K8I9A zF8j?iHBKY?u2n9@>=ZRjJpn9S3_k}k6LWDLL(HrjeuUvK=<^f}Ocm+|=iOdujxgartbSVaG3wV-l7Ba#lHv7sWa^zx=o%0M>m;>BAuQn>LYQX zJU>S?@vW_YLWm2ur^OSgZKF8w5h`A+{OJQ&j)d*x5C4O`*}r>+7out;>szjW8d&KZ zdPqO>4F4cuMgEyx8<;1FIc*QZC>PWeOitO+i)Ho??PEnoDcE1Zy-a<|$ZPZFy1xh%GX9O{7fW zi@l}*HTGIH1)0CH80$9L_*THh&aI|QVY$>4WJR_w7&xyh95rj1HOu(iNkR6zlwx^T z(UBts-)AAWMlQt)7ry&3`-npx&Tyk zwd+s9>Z!uQ>+){6`raxmY>Rd9q=GH1PTpgvLq!k3g8YW=(3HYRj7s1CrQ##)uN2=f zbprx(@Dycj0eg3&qgB4(SnX58{p}ps;QfE)s0x*0ubRano|E^L*HV5$s{4zi&r#l^u?nCVcwF|Sbk$1H2?K|Qn>ozT7b3mkg$R$c zh7lSYFNG$1AT*&#Xu^mWfbm6G1sfIOBq>jlz;q zK#+K-icyKTZ<&5rr(RyXws1}B&GxR;ANWt-!DrsSA0!;}{Iq31Ew(|2Ea5HPuTwfR z*_C2W9D%kQ+MX1{^CV}7SC@%MD8=@79@9A-+3P0A7-a#FdUbO&8K0QM{i8ZNCg0SV zF~}F-gvGF9K$&37(3or5R7e4T%OS 
zsYWqnb)pEQC6TeD4CyGuR-=<$r0&-J057h0G(4G%-vp7qrQa8i;+zjNA{rv ztS2t?@%V@bifJ7`lT66)d_~TCKoU~@ojo1lNqF-kr;Z^f075`;_ya~3%=2U70M22J zcPtRdz`Ji$SU3Zvl<{zBMjp02P}(*ho(zeSJy55cB`k3e;zTqtrkkck$Os`fyd&wH za62OFW~e0@M%Y;4WhD_pl5P{>m1tDPk*|3gHU%VHJQNeeK77Cs&QZlg6DNKo+a%F1 zuuaCq;{XfPi8vG%al}o6TiuEcdlW@Pot9t~@Gpc!(oCHb6SpLI)`85}T-*q-5#Xhx z__#srNoOQkXF_qEg)Bhluny@gyiMpPvRw&hsUXYRQjE{7hUtkv1GJCeC;bbA*6^C) zY0_fPJO}5_y|PkHUxB4Mdj7-Ov$Q9@RUORi&|E9FmKAHycdk11+~S3W3)$exmaY|7 zcfmrL?5~_a^4#jmx%*eG1JA9Fd27b1`tCalcFN!QzZTAEE|@8oyI`VvkN)$g|M=;z zNA7d;_KaQaTzqfgy(NqK-b!7^s`i&V+M7P8PH1E5^=#WxS9WLK;Z^^tX%o^!VJsq{>GQXSLkG*iyLYn9vm z!sUUi-{pHv?J{+y>Ab@^-;?Q4x34;OJjahi>deBAw9m42SyT3sHj&%Xnrj`*`A@An zhw{!XFFfA)WG1QpID2Kq)0SrPrSVA3_(#PtYkdDnam+pUHEQ&OZ5eZ#PIs4H3e?c5 zb?bLl7vQQ5nLF@U)vdK?)0!pc=JOuk{7hy>9bf8P@f=8*j(=EMOJh0|CD|NrN?{_uc&vKTp78;F$-GFT&Z@M>n5& z18EajNZ!5ei+$Soql4Pc0!8m{$Tu`;Lm*(&m*ZbuU9NvJveJ4i*K!=b_4O23#?||V z#>tiRyk0`!@HTTB2{ zj6$Vbu!D(4#zdS$i)Szhyhd2oO>%NNDi9x~b5VR39m5#JkKqI!uE|6qN<4*l5`Q7D zCC$)#YweFE5Ip*y5Hm2u4*-%3Re_~x`gfG$ca;6VsI4#UC+92$^8mdkjS3VrYErwJ zZO%%+3}`=j)_CYOh80dSG<}*@;bjW9+P>GMeP#WKrp;i%bqgO9DBNHAxY#~DQfz-a K{D#2Es{S9BrjdF8 literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/runner.py b/conversione/_pipeline/runner.py index 7eb02dc..125aeb9 100644 --- a/conversione/_pipeline/runner.py +++ b/conversione/_pipeline/runner.py @@ -71,6 +71,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool: print(f" Ambienti matematici: {t['n_ambienti_matematici']}") print(f" Titoli header uniti: {t['n_titoli_uniti']}") print(f" TOC rimosso: {'sì' if t['toc_rimosso'] else 'no'}") + print(f" TOC orfani rimossi: {t['n_toc_orfani_rimossi']}") print(f" Versi poesia riprist.: {t['n_versi_ripristinati']}") print(f" Header verso demotati: {t['n_header_verso_demotati']}") print(f" ALL-CAPS → ##: {t['n_header_allcaps']}") diff --git a/conversione/_pipeline/transforms.py b/conversione/_pipeline/transforms.py deleted file mode 100644 index 1c6a7cd..0000000 --- 
a/conversione/_pipeline/transforms.py +++ /dev/null @@ -1,974 +0,0 @@ -import re -from collections import Counter -from functools import partial - -# ─── Costanti ──────────────────────────────────────────────────────────────── - -_TOC_KEYWORDS = frozenset([ - "indice", "index", "contents", "table of contents", - "sommario", "inhaltsverzeichnis", "inhalt", - "indice generale", "indice analitico", "indice dei contenuti", - "elenco dei capitoli", "argomenti", "table des matières", - "tabla de contenidos", "содержание", -]) - -_ORDINALS_IT = { - "PRIMO": "I", "SECONDO": "II", "TERZO": "III", "QUARTO": "IV", - "QUINTO": "V", "SESTO": "VI", "SETTIMO": "VII", "OTTAVO": "VIII", - "NONO": "IX", "DECIMO": "X", -} -_ORDINALS_EN = { - "ONE": "1", "TWO": "2", "THREE": "3", "FOUR": "4", "FIVE": "5", - "SIX": "6", "SEVEN": "7", "EIGHT": "8", "NINE": "9", "TEN": "10", -} - -# Mapping PUA Unicode (U+F020-U+F0FF) → simboli Unicode standard. -# Font Symbol di Windows codifica lettere greche e operatori matematici -# nel range Private Use Area invece dei codepoint Unicode standard. 
-_SYMBOL_PUA_MAP: dict[str, str] = { - "": " ", - "": "(", - "": ")", - "": "+", - "": "−", # minus - "": ".", - "": "/", - "": "0", "": "1", "": "2", "": "3", "": "4", - "": "5", "": "6", "": "7", "": "8", "": "9", - "": ":", "": ";", "": "<", "": "=", "": ">", - "": "≅", # congruent - "": "Α", # Alpha - "": "Β", # Beta - "": "Χ", # Chi - "": "Δ", # Delta - "": "Ε", # Epsilon - "": "Φ", # Phi - "": "Γ", # Gamma - "": "Η", # Eta - "": "Ι", # Iota - "": "ϑ", # theta variant - "": "Κ", # Kappa - "": "Λ", # Lambda - "": "Μ", # Mu - "": "Ν", # Nu - "": "Ο", # Omicron - "": "Π", # Pi - "": "Θ", # Theta - "": "Ρ", # Rho - "": "Σ", # Sigma - "": "Τ", # Tau - "": "Υ", # Upsilon - "": "ς", # sigma final - "": "Ω", # Omega - "": "Ξ", # Xi - "": "Ψ", # Psi - "": "Ζ", # Zeta - "": "[", - "": "∴", # therefore - "": "]", - "": "⊥", # perpendicular - "": "α", # alpha - "": "β", # beta - "": "χ", # chi - "": "δ", # delta - "": "ε", # epsilon - "": "φ", # phi - "": "γ", # gamma - "": "η", # eta - "": "ι", # iota - "": "ϕ", # phi variant - "": "κ", # kappa - "": "λ", # lambda - "": "μ", # mu - "": "ν", # nu - "": "ο", # omicron - "": "π", # pi - "": "θ", # theta - "": "ρ", # rho - "": "σ", # sigma - "": "τ", # tau - "": "υ", # upsilon - "": "ϖ", # pi symbol - "": "ω", # omega - "": "ξ", # xi - "": "ψ", # psi - "": "ζ", # zeta - "": "{", - "": "|", - "": "}", - "": "~", - "": "±", # plus-minus - "": "•", # bullet - "": "√", # square root - "": "≤", # less or equal - "": "≥", # greater or equal - "": "∝", # proportional - "": "×", # multiplication - "": "÷", # division - "": "×", # alternate multiply - "": "≠", # not equal - "": "≠", # not equal alternate - "": "≥", # greater or equal alternate - "": "′", # prime - "": "*", - "": ",", - "": "≤", # less or equal (Symbol 0xA3) - "": "•", # bullet (Wingdings 0xA7) - "": "•", # bullet variant - "": "→", # right arrow (Symbol 0xAE) - "": 
"÷", # division / range separator - "": "", # Wingdings decorative icon (rimosso) - "": "→", # right arrow variant - "": "", # bracket extension piece (non ricostruibile) - "": "", - "": "", - "": "", - "": "", - "": "", # TeX large paren left U+F8EB - "": "", # TeX large paren extension U+F8EC - "": "", # TeX large paren right U+F8ED - "": "", # TeX large paren right ext U+F8EE - "": "", # TeX large bracket left U+F8EF - "": "", # TeX large bracket ext U+F8F0 - "": "", # TeX brace top-left U+F8F1 - "": "", # TeX brace mid U+F8F2 - "": "", # TeX brace mid-right U+F8F3 - "": "", # TeX brace extension U+F8F4 - "": "", # TeX brace right U+F8F5 - "": "", # TeX bracket right large U+F8F6 - "": "", # TeX bracket right ext U+F8F7 - "": "", # TeX bracket right close U+F8F8 - "": "", # TeX integral large U+F8F9 - "": "", # TeX integral extension U+F8FA - "": "", # TeX integral top U+F8FB - "": "", # TeX radical top U+F8FC - "": "", # TeX radical extension U+F8FD - "": "", # TeX arrowhead U+F8FE -} - -_SYMBOL_PUA_RE = re.compile( - "[" + "".join(re.escape(k) for k in _SYMBOL_PUA_MAP) + "]" -) - -_SUPERSCRIPT_RE = re.compile(r'[¹²³⁰⁴-⁹]+') -_FOOTNOTE_BODY_RE = re.compile( - r'^([¹²³⁰⁴-⁹]+\s+|\[\d{1,3}\]\s+)' -) -_NUMBERED_HDR_RE = re.compile( - r"^(#{1,6})\s+(\d+(?:\.\d+)*)\.\s+(.+)$", - re.MULTILINE, -) -_BIB_MARKERS_RE = re.compile( - r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b' - r'|\b(19|20)\d{2}\b', - re.IGNORECASE, -) -_WATERMARK_RE = re.compile( - r"^(BOZZA|DRAFT|CONFIDENTIAL|RISERVATO|PROVVISORIO|SAMPLE|SPECIMEN" - r"|DO NOT DISTRIBUTE|NON DISTRIBUIRE|COPY|COPIA)\s*$", - re.IGNORECASE | re.MULTILINE, -) - -_MATH_SYMBOLS_RE = re.compile( - r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]" -) -_EXERCISE_TRIGGER_RE = re.compile( - r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that" - r"|Compute|Calculate|Dimostrare|Verificare)\b", - re.IGNORECASE, -) -_MATH_HDR_RE = 
re.compile(r"^(#{2,3})\s+(.+)$") -_NUMBERED_PREFIX_RE = re.compile(r"^(\d+(?:\.\d+)*[.)])\s+(.+)$", re.DOTALL) - -# Erano compilati dentro le funzioni a ogni chiamata — ora costanti di modulo -_TABSEP_RE = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$") -_FM_RE = re.compile( - r"https?://|www\.|@[A-Za-z]|\bUniversit[àa]\b|\bDipartimento\b|" - r"\bCopyright\b|\bLicenza\b|\bEdizione\b|" - r"protetto da|tutti i diritti", - re.IGNORECASE, -) -_VERSE_NUM_RE = re.compile( - r'([.!?\xbb\'\"’]\s+)(\d+)(\s+)(?=[A-Z\xc0-\xd9a-z\xe0-\xf9\xab"“‟])' -) - - -# ─── Helper puri ───────────────────────────────────────────────────────────── - -def _sentence_case(s: str) -> str: - if not s: - return s - lower = s.lower() - return lower[0].upper() + lower[1:] - - -def _is_allcaps_line(line: str) -> bool: - stripped = line.strip() - letters = [c for c in stripped if c.isalpha()] - return ( - len(letters) >= 3 - and all(c.isupper() for c in letters) - and not stripped.startswith("#") - and not stripped.startswith("|") - ) - - -def _allcaps_to_header(raw_line: str) -> str: - text = re.sub(r"^[-*+]\s+", "", raw_line.strip()) - text = text.rstrip(".").rstrip("?").strip() - - _ORD_IT_PAT = "|".join(_ORDINALS_IT.keys()) - m = re.match(rf"^CAPITOLO ({_ORD_IT_PAT})\. (.+)", text) - if m: - roman = _ORDINALS_IT[m.group(1)] - titolo = m.group(2).rstrip(".").rstrip("?").strip() - return f"## Capitolo {roman} — {_sentence_case(titolo)}" - - _ORD_EN_PAT = "|".join(_ORDINALS_EN.keys()) - m = re.match(rf"^CHAPTER ({_ORD_EN_PAT}|\d+)\.? (.+)", text) - if m: - n = _ORDINALS_EN.get(m.group(1), m.group(1)) - titolo = m.group(2).rstrip(".").rstrip("?").strip() - return f"## Chapter {n} — {_sentence_case(titolo)}" - - m = re.match(r"^([IVXLCDM]+|[0-9]+)\. (.+)", text) - if m: - return f"## {m.group(1)}. 
{_sentence_case(m.group(2).rstrip('.').strip())}" - - return f"## {_sentence_case(text)}" - - -def _extract_math_environments(text: str) -> tuple[str, int]: - _ENVS = ( - r"Definizione|Definition|Teorema|Theorem|Lemma|" - r"Proposizione|Proposition|Corollario|Corollary|" - r"Osservazione|Remark|Nota|Note|Esempio|Example" - ) - count = 0 - blocks = text.split("\n\n") - result = [] - - for block in blocks: - stripped = block.strip() - if not stripped or stripped.startswith("#"): - result.append(block) - continue - - m = re.match( - rf"^({_ENVS})\s+((?:\d+\.?){{1,4}})\s*(.*)", - stripped, - re.DOTALL, - ) - if not m: - result.append(block) - continue - - env = m.group(1) - num = m.group(2).rstrip(".") - rest = m.group(3).strip() - - title_m = re.match(r"^(\([^)]{2,60}\))\s+(.*)", rest, re.DOTALL) - if title_m: - header = f"### {env} {num} {title_m.group(1)}" - body = title_m.group(2).strip() - else: - header = f"### {env} {num}." - body = rest - - result.append(f"{header}\n\n{body}" if body else header) - count += 1 - - return "\n\n".join(result), count - - -def _merge_title_headers(text: str) -> tuple[str, int]: - count = 0 - blocks = re.split(r"\n{2,}", text) - result = [] - i = 0 - while i < len(blocks): - block = blocks[i] - stripped = block.strip() - if ( - re.match(r"^#{2,3} \d+\.\s*$", stripped) - and i + 1 < len(blocks) - ): - nxt = blocks[i + 1].strip() - if ( - nxt - and "\n" not in nxt - and len(nxt) <= 80 - and not nxt.startswith("#") - and not re.match(r"^\d+[\.\)]\s", nxt) - ): - result.append(stripped.rstrip() + " " + nxt) - count += 1 - i += 2 - continue - result.append(block) - i += 1 - return re.sub(r"\n{3,}", "\n\n", "\n\n".join(result)), count - - -def _extract_article_headers(text: str) -> tuple[str, int]: - count = 0 - - def _repl(m: re.Match) -> str: - nonlocal count - num = m.group(1) - rest = m.group(2).strip() - - title_m = re.match( - r"^([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda].{1,74}?)\.\s+" - 
r"([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\(\d].{4,})", - rest, - ) - if title_m: - count += 1 - return ( - f"### Art. {num}. {title_m.group(1)}.\n\n" - f"{title_m.group(2).strip()}" - ) - if rest: - count += 1 - return f"### Art. {num}.\n\n{rest}" - count += 1 - return f"### Art. {num}." - - text = re.sub( - r"^-\s+Art\.\s+([\d]+[a-z\-]*)\.\s*(.*)", - _repl, - text, - flags=re.MULTILINE, - ) - return text, count - - -# ─── Trasformazioni atomiche ────────────────────────────────────────────────── - -def _t_fix_symbol_font(text: str) -> tuple[str, int]: - count = [0] - - def _repl(m: re.Match) -> str: - count[0] += 1 - return _SYMBOL_PUA_MAP[m.group(0)] - - result = _SYMBOL_PUA_RE.sub(_repl, text) - return result, count[0] - - -def _t_remove_images(text: str) -> tuple[str, int]: - n = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text)) - text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text) - return text, n - - -def _t_remove_footnotes(text: str) -> tuple[str, int]: - lines = text.split("\n") - result, count = [], 0 - for line in lines: - stripped = line.strip() - if stripped and _FOOTNOTE_BODY_RE.match(stripped) and len(stripped) < 300: - count += 1 - continue - cleaned = _SUPERSCRIPT_RE.sub("", line) - if cleaned != line: - count += 1 - result.append(cleaned) - return "\n".join(result), count - - -def _t_fix_br(text: str) -> tuple[str, int]: - n = len(re.findall(r"
", text, re.IGNORECASE)) - text = re.sub(r"
\s*", " ", text, flags=re.IGNORECASE) - return text, n - - -def _t_fix_tabsep(text: str) -> tuple[str, int]: - n = len(_TABSEP_RE.findall(text)) - text = _TABSEP_RE.sub("", text) - return text, n - - -def _t_fix_accents(text: str) -> tuple[str, int]: - _ACCENT_MAP = { - "e": "\xe8", "E": "\xc8", "a": "\xe0", "A": "\xc0", - "u": "\xf9", "U": "\xd9", "i": "\xec", "I": "\xcc", - "o": "\xf2", "O": "\xd2", - } - n_bt_before = text.count("`") - text = re.sub(r"`([eEaAuUiIoO])", lambda m: _ACCENT_MAP[m.group(1)], text) - text = re.sub(r"([eEaAuUiIoO])`", lambda m: _ACCENT_MAP[m.group(1)], text) - n_accenti = n_bt_before - text.count("`") - n_bt_orfani = text.count("`") - if n_bt_orfani: - text = re.sub(r"`", "", text) - n_accenti += n_bt_orfani - return text, n_accenti - - -def _t_fix_multiplication(text: str) -> tuple[str, int]: - n = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text)) - text = re.sub(r'(?<=[0-9])"(?=[0-9(])', '×', text) - return text, n - - -def _t_fix_micro(text: str) -> tuple[str, int]: - _SI_UNITS_RE = r'[mAsgVWFHTKNJClΩ]' - n = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text)) - text = re.sub(rf'(\d)\s*!({_SI_UNITS_RE})', r'\1 µ\2', text) - return text, n - - -def _t_remove_formula_labels(text: str) -> tuple[str, int]: - n = len(re.findall(r"\[\d+\.\d+\]", text)) - text = re.sub(r"\s*\[\d+\.\d+\]\s*", " ", text) - return text, n - - -def _t_remove_dotleaders(text: str) -> tuple[str, int]: - _DOTLEADER_RE = r"^[^\n]*(?:(?:\. 
){3,}|\.{4,})[^\n]*$" - n = len(re.findall(_DOTLEADER_RE, text, re.MULTILINE)) - text = re.sub(_DOTLEADER_RE, "", text, flags=re.MULTILINE) - text = re.sub( - r"(?m)^(i{1,3}|iv|vi{0,3}|ix|xi{0,2}|x)$", - "", - text, - flags=re.IGNORECASE, - ) - return text, n - - -def _t_fix_header_concat(text: str) -> tuple[str, int]: - count = 0 - - def _fix(m: re.Match) -> str: - nonlocal count - hashes = m.group(1) - full = m.group(2).strip() - if len(full) < 60: - return m.group(0) - skip = min(10, len(full) // 3) - split = re.search(r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", full[skip:]) - if split: - pos = skip + split.start() - title = full[:pos].strip() - body = full[pos:].strip() - if len(title) >= 5 and len(body) >= 15: - count += 1 - return f"{hashes} {title}\n\n{body}" - return m.group(0) - - text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix, text, flags=re.MULTILINE) - return text, count - - -def _t_extract_capitolo(text: str) -> tuple[str, int]: - def _repl(m: re.Match) -> str: - num = m.group(1) - titolo = _sentence_case(m.group(2).strip().rstrip("- ").strip()) - return f"\n\n## Capitolo {num}: {titolo}\n\n" - - text = re.sub( - r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]" - r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)" - r"(?=\s*[-–]\s*\d|\s*\n|\s*$)", - _repl, - text, - ) - return text, 0 - - -def _t_normalize_numbered_headings(text: str) -> tuple[str, int]: - all_matches = list(_NUMBERED_HDR_RE.finditer(text)) - if not all_matches: - return text, 0 - - pairs = [(m.group(2).count(".") + 1, len(m.group(1))) for m in all_matches] - depths = [d for d, _ in pairs] - min_depth = min(depths) - max_depth = max(depths) - if max_depth == min_depth: - return text, 0 - - base_level = min(lv for d, lv in pairs if d == min_depth) - count = 0 - - def _repl(m: re.Match) -> str: - nonlocal count - hashes, num, title = m.group(1), m.group(2), m.group(3) - depth = num.count(".") + 
1 - new_level = min(base_level + (depth - min_depth), 6) - if new_level == len(hashes): - return m.group(0) - count += 1 - return f"{'#' * new_level} {num}. {title}" - - return _NUMBERED_HDR_RE.sub(_repl, text), count - - -def _t_normalize_header_levels(text: str) -> tuple[str, int]: - text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE) - text = re.sub( - r"^(#{3,6})\s+(\d{1,3})\s+(.+)$", - lambda m: f"### {m.group(2)}. {m.group(3)}", - text, - flags=re.MULTILINE, - ) - text = re.sub(r"^#{4,6}\s+(.+)$", r"### \1", text, flags=re.MULTILINE) - return text, 0 - - -def _t_extract_articles(text: str) -> tuple[str, int]: - return _extract_article_headers(text) - - -def _t_remove_header_bold(text: str) -> tuple[str, int]: - text = re.sub( - r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$", - r"\1 \2", - text, flags=re.MULTILINE, - ) - return text, 0 - - -def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]: - def _norm(m: re.Match) -> str: - hashes, content = m.group(1), m.group(2).strip() - letters = [c for c in content if c.isalpha()] - if letters and all(c.isupper() for c in letters): - return f"{hashes} {_sentence_case(content)}" - return m.group(0) - - text = re.sub(r"^(#{1,6}) (.+)$", _norm, text, flags=re.MULTILINE) - return text, 0 - - -def _t_remove_toc(text: str) -> tuple[str, int]: - lines = text.split("\n") - new_lines = [] - _in_toc = False - removed = False - for line in lines: - bare = re.sub(r"^#+\s*", "", line.strip()) - first_word = bare.split(".")[0].strip().lower() - if first_word in _TOC_KEYWORDS: - removed = True - _in_toc = True - continue - if _in_toc: - if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line): - continue - if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line): - continue - if len(line.strip()) > 200: - _in_toc = False - new_lines.append(line) - continue - _in_toc = False - new_lines.append(line) - return "\n".join(new_lines), 1 if removed else 0 - - -def _t_allcaps_to_headers(text: str) -> tuple[str, int]: - count = 0 
- blocks = text.split("\n\n") - new_blocks = [] - for block in blocks: - stripped = block.strip() - if "\n" not in stripped and _is_allcaps_line(stripped): - new_blocks.append(_allcaps_to_header(stripped)) - count += 1 - else: - sub_lines = block.split("\n") - converted = [] - for ln in sub_lines: - if _is_allcaps_line(ln) and len(ln.strip()) > 3: - converted.append(_allcaps_to_header(ln)) - count += 1 - else: - converted.append(ln) - new_blocks.append("\n".join(converted)) - return "\n\n".join(new_blocks), count - - -def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]: - count = 0 - - def _num_repl(m: re.Match) -> str: - nonlocal count - content = m.group(2).strip() - if content.endswith(".") and len(content) > 40: - return m.group(0) - if _BIB_MARKERS_RE.search(content): - return m.group(0) - count += 1 - return f"### {m.group(1)}.\n\n{content}" - - text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE) - - def _num_letter_repl(m: re.Match) -> str: - nonlocal count - count += 1 - return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}" - - text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE) - - if not has_exercises: - def _aphorism_repl(m: re.Match) -> str: - nonlocal count - content = m.group(2).strip() - if _BIB_MARKERS_RE.search(content): - return m.group(0) - count += 1 - return f"\n\n### {m.group(1)}.\n\n{content}" - - text = re.sub( - r"^-\s+(\d{1,3})\.\s+(.{10,})$", - _aphorism_repl, - text, - flags=re.MULTILINE, - ) - - def _list_section_repl(m: re.Match) -> str: - nonlocal count - num = m.group(1) - content = m.group(2).strip() - if _BIB_MARKERS_RE.search(content): - return m.group(0) - count += 1 - split = re.search(r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", content) - if split and split.start() >= 3: - title = content[: split.start()].strip() - body = content[split.end():].strip() - if len(body) >= 20: - 
return f"\n\n### {num}. {title}\n\n{body}" - return f"\n\n### {num}. {content}" - - text = re.sub( - r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$", - _list_section_repl, - text, - flags=re.MULTILINE, - ) - return text, count - - -def _t_extract_math(text: str) -> tuple[str, int]: - return _extract_math_environments(text) - - -def _t_merge_paragraphs(text: str) -> tuple[str, int]: - _SENTENCE_END = set(".?!\xbb)\"'") - blocks = text.split("\n\n") - merged = [] - count = 0 - i = 0 - while i < len(blocks): - b = blocks[i] - stripped = b.strip() - while ( - i + 1 < len(blocks) - and stripped - and not stripped.startswith("#") - and not stripped.startswith("|") - and stripped[-1] not in _SENTENCE_END - ): - nxt = blocks[i + 1].strip() - if ( - not nxt - or nxt.startswith("#") - or nxt.startswith("|") - or re.match(r"^\d+\.", nxt) - or re.match(r"^[-*+]\s", nxt) - ): - break - b = stripped + " " + nxt - stripped = b.strip() - count += 1 - i += 1 - merged.append(b) - i += 1 - text = "\n\n".join(merged) - text = re.sub(r"(?m)^\|---\|\s*", "", text) - return text, count - - -def _t_normalize_whitespace(text: str) -> tuple[str, int]: - lines = text.split("\n") - text = "\n".join( - re.sub(r" +", " ", line) if line.strip() else line - for line in lines - ) - return text, 0 - - -def _t_collapse_blank_lines(text: str) -> tuple[str, int]: - return re.sub(r"\n{3,}", "\n\n", text), 0 - - -def _t_demote_verse_headers(text: str) -> tuple[str, int]: - count = 0 - - def _demote(m: re.Match) -> str: - nonlocal count - hashes, content = m.group(1), m.group(2).strip() - if not re.search(r"\s\d{1,4}\s*$", content): - return m.group(0) - inner = re.sub(r"\s\d{1,4}\s*$", "", content) - if not re.search(r'[,;:.!?\xbb"\'][\ ]+[A-Za-z\xc0-\xff\xab"“]', inner): - return m.group(0) - count += 1 - clean = re.sub(r"\s\d{1,4}\s*$", "", content) - return clean - - text = re.sub(r"^(#{1,6})\s+(.{20,})$", _demote, text, flags=re.MULTILINE) - return text, count - - -def 
_t_restore_poetry_lines(text: str) -> tuple[str, int]: - count = 0 - blocks = text.split("\n\n") - result = [] - - for block in blocks: - stripped = block.strip() - if not stripped or stripped.startswith("#"): - result.append(block) - continue - - matches = list(_VERSE_NUM_RE.finditer(stripped)) - if len(matches) < 2: - result.append(block) - continue - - nums = [int(m.group(2)) for m in matches] - diffs = [nums[i + 1] - nums[i] for i in range(len(nums) - 1)] - if not diffs or len(set(diffs)) > 2 or not (1 <= diffs[0] <= 5): - result.append(block) - continue - - step = diffs[0] - - def _replace_verse_num(m: re.Match) -> str: - n = int(m.group(2)) - sep = "\n\n" if n % (step * 3) == 0 else "\n" - return m.group(1).rstrip() + sep - - new_block = _VERSE_NUM_RE.sub(_replace_verse_num, stripped) - if new_block != stripped: - count += len(matches) - result.append(new_block) - - return "\n\n".join(result), count - - -def _t_remove_urls(text: str) -> tuple[str, int]: - return re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text), 0 - - -def _t_remove_empty_headers(text: str) -> tuple[str, int]: - blocks = re.split(r"\n{2,}", text) - cleaned = [] - for i, block in enumerate(blocks): - stripped = block.strip() - if re.match(r"^#{1,6} ", stripped) and "\n" not in stripped: - next_stripped = blocks[i + 1].strip() if i + 1 < len(blocks) else "" - next_is_long_hdr = ( - re.match(r"^#{1,6} ", next_stripped) and len(next_stripped) > 80 - ) - if not next_stripped or ( - re.match(r"^#{1,6} ", next_stripped) and not next_is_long_hdr - ): - continue - cleaned.append(block) - return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), 0 - - -def _t_merge_title_headers(text: str) -> tuple[str, int]: - return _merge_title_headers(text) - - -def _t_remove_garbage_headers(text: str) -> tuple[str, int]: - def _is_garbage(content: str) -> bool: - if content.lstrip().startswith("..."): - return True - if not re.search(r"[A-Za-z\xc0-\xffΑ-ω]{2,}", content): - return True - if 
re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()): - return True - if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content): - return True - first_alpha = next((c for c in content if c.isalpha()), None) - if first_alpha and first_alpha.islower() and len(content) > 40: - return True - if re.match(r"^[A-Za-zΑ-ω_]{1,3}\s*[=<>≤≥]", content.strip()): - return True - if re.match(r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d", content.strip(), re.IGNORECASE): - return True - return False - - count = 0 - lines = text.split("\n") - new_lines = [] - for line in lines: - m = re.match(r"^#{1,6} (.+)$", line) - if m and _is_garbage(m.group(1)): - count += 1 - continue - new_lines.append(line) - text = "\n".join(new_lines) - text = re.sub(r"\n{3,}", "\n\n", text) - return text, count - - -def _t_remove_frontmatter(text: str) -> tuple[str, int]: - blocks = re.split(r"\n{2,}", text) - cleaned = [] - count = 0 - total = len(blocks) - cutoff = max(5, min(15, int(total * 0.20))) - for i, block in enumerate(blocks): - stripped = block.strip() - if i >= cutoff: - cleaned.append(block) - continue - if not re.match(r"^### ", stripped) or re.match(r"^### \d", stripped): - cleaned.append(block) - continue - body = blocks[i + 1].strip() if i + 1 < len(blocks) else "" - is_fm_body = len(body) < 250 and _FM_RE.search(body) - is_fm_hdr = _FM_RE.search(stripped) - if is_fm_body or is_fm_hdr: - count += 1 - continue - cleaned.append(block) - return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), count - - -def _t_remove_watermarks(text: str) -> tuple[str, int]: - lines = text.split("\n") - result, count = [], 0 - for line in lines: - if _WATERMARK_RE.match(line): - count += 1 - else: - result.append(line) - return "\n".join(result), count - - -def _t_fix_math_symbols(text: str) -> tuple[str, int]: - lines = text.split("\n") - result, count = [], 0 - for line in lines: - if line.strip() and re.match(r"^[\s■-◿☐-☒•▪▫◆◇●○•]+$", line): - count += 1 - else: - 
result.append(line) - return "\n".join(result), count - - -def _t_remove_recurring_lines(text: str) -> tuple[str, int]: - lines = text.split("\n") - short_lines = [ - ln.strip() for ln in lines - if 3 < len(ln.strip()) < 80 - and not ln.strip().startswith("#") - and not ln.strip().startswith("|") - ] - freq = Counter(short_lines) - recurring = {ln for ln, c in freq.items() if c >= 5} - if not recurring: - return text, 0 - result, count = [], 0 - for line in lines: - if line.strip() in recurring: - count += 1 - else: - result.append(line) - return "\n".join(result), count - - -def _t_math_header_demotion(text: str) -> tuple[str, int]: - lines = text.split("\n") - result, count = [], 0 - for line in lines: - m = _MATH_HDR_RE.match(line) - if not m: - result.append(line) - continue - body = m.group(2) - if len(body) <= 100: - result.append(line) - continue - has_math = len(_MATH_SYMBOLS_RE.findall(body)) >= 3 - has_exercise = bool(_EXERCISE_TRIGGER_RE.search(body)) - if not (has_math or has_exercise): - result.append(line) - continue - nm = _NUMBERED_PREFIX_RE.match(body) - if nm: - result.append(f"**{nm.group(1)}** {nm.group(2)}") - else: - result.append(body) - count += 1 - return "\n".join(result), count - - -# ─── Orchestratore ─────────────────────────────────────────────────────────── - -def apply_transforms(text: str) -> tuple[str, dict]: - """ - Applica le trasformazioni strutturali al Markdown grezzo. - Restituisce (testo_modificato, statistiche). - L'ordine è semantico: encoding → struttura header → costruzione struttura → testo → rifinitura. 
- """ - _has_ex = bool(re.search(r"\b(Esercizi|Exercises|Problems|Homework)\b", text, re.IGNORECASE)) - - _transforms: list[tuple[str | None, object]] = [ - ("n_simboli_pua_corretti", _t_fix_symbol_font), - ("n_immagini_rimosse", _t_remove_images), - ("n_br_rimossi", _t_fix_br), - ("n_tabsep_rimossi", _t_fix_tabsep), - ("n_note_rimosse", _t_remove_footnotes), - ("n_accenti_corretti", _t_fix_accents), - ("n_moltiplicazioni_corrette", _t_fix_multiplication), - ("n_micro_corretti", _t_fix_micro), - ("n_simboli_math_rimossi", _t_fix_math_symbols), - ("n_formule_rimossi", _t_remove_formula_labels), - ("n_dotleader_rimossi", _t_remove_dotleaders), - ("n_righe_ricorrenti_rimosse", _t_remove_recurring_lines), - ("n_header_concat_fixati", _t_fix_header_concat), - (None, _t_extract_capitolo), - ("n_header_numerati_normalizzati", _t_normalize_numbered_headings), - (None, _t_normalize_header_levels), - ("n_articoli_estratti", _t_extract_articles), - (None, _t_remove_header_bold), - (None, _t_normalize_allcaps_headers), - ("toc_rimosso", _t_remove_toc), - ("n_header_allcaps", _t_allcaps_to_headers), - ("n_sezioni_numerate", partial(_t_numbered_sections, has_exercises=_has_ex)), - ("n_ambienti_matematici", _t_extract_math), - ("n_paragrafi_uniti", _t_merge_paragraphs), - (None, _t_normalize_whitespace), - (None, _t_collapse_blank_lines), - ("n_versi_ripristinati", _t_restore_poetry_lines), - ("n_header_verso_demotati", _t_demote_verse_headers), - (None, _t_remove_urls), - (None, _t_remove_empty_headers), - ("n_titoli_uniti", _t_merge_title_headers), - (None, lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0)), - ("n_garbage_headers_rimossi", _t_remove_garbage_headers), - ("n_formula_headers_demotati", _t_math_header_demotion), - ("n_frontmatter_rimossi", _t_remove_frontmatter), - ("n_watermark_rimossi", _t_remove_watermarks), - ] - - stats: dict = {} - for stat_key, fn in _transforms: - text, n = fn(text) - if stat_key: - stats[stat_key] = 
stats.get(stat_key, 0) + n - - stats["toc_rimosso"] = bool(stats.get("toc_rimosso", 0)) - return text, stats diff --git a/conversione/_pipeline/transforms/__init__.py b/conversione/_pipeline/transforms/__init__.py new file mode 100644 index 0000000..9b02e60 --- /dev/null +++ b/conversione/_pipeline/transforms/__init__.py @@ -0,0 +1,4 @@ +"""Package transforms: pipeline di pulizia strutturale per Markdown RAG.""" +from ._apply import apply_transforms + +__all__ = ["apply_transforms"] diff --git a/conversione/_pipeline/transforms/__pycache__/__init__.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd0f2d74f561dee43310cd2579374cebea6803a1 GIT binary patch literal 323 zcmYLFJxc>Y5Z%3tCPV^)ouF)ClP24X3Kn7^ida}&Gc333xnys5nY)WzI)8$d_!Ims zHntuJf|Z?cT`Kp2A+MNu^TE8wyp2Xf67X`{>Zg#uyJCgtpR~JyJQGP0BB_iF&7&lu zM8+9S;<`89n4evXbSg5%N))!ZbC?&0%xkZVwu(v3d}Z`p3s#g^m1Tv(DCQNiOMz4A z9xS^#J)iv0pfje;w(#E6u)c+H7+j5eO&>tI6rkCJ<<4@@b%ikiucYl9FOK=W%N3Vm zrlsO2GM+j+Q&?zc6^E5Cz82sB+UgQu;_JoWD0i|l>IA#jbe@Wx5PoAy=?A&^BnMwx Nd+)>j$36~1{}0d{U#$QD literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/transforms/__pycache__/_apply.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_apply.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c98ec7f60648f34ef2c5d1933512af30950d75da GIT binary patch literal 4848 zcmaJ^OKjZM73KUSYV`kqEXkDQk49rl@i+d+@>h2JiJi)60uX8#4*5o+%|}ii-`JKq z0<;JUC& zbKiaU-E-$J?d@3!o=@I-Y5l%ilKzDtdYZzIm;Zr}$C4?jl9@1*c0x_SJ7p(nN=+ps zGi_$m_S=y?$(l)h?wyW*5L+zlQYA5YdyJ)xCO?%WH z+N<`8{>^qD?N|HhfI2`2)j>L>4$)zCn2x9;LND99=%_kM$J8-8u8z~)>Tas23Y}0V zguca|q*Lk?-J|ZId)2*kpSq9kSN98D)}E$0HAiRE8Jbt~^niMR9#jv4u9eK1ZDfwj zl2?k!Vc2UiZMMIkRu8Qua~w)I2(SmDhMc3heM<~+` zgE-vt`$J62Hn+s892WB1BC%yKH<$Ih05d|}dqlISzCt{|CG1gPetY=I^@2xAe(!eQ zqU&Zad^Ox# zY?k{vC*Ux4*Au%9%`P+B%QcDBaPX>FM>AZ<(7BkBJOE!CTr>2N#a-L=$H4>#U2j|K 
zM03itKo~K_NHE6pM`HSLLYv&fO`%)Eg+m3Lfa#CMhUm5pi+Q0fXdQYo9L8Ov(KRkB zt?G`3Qk**)%H3$R=v+6_BL<$M=XXNqXlLlK*j6!I44Zhlj1W`8R)_{cq^~f&wCeeT zF>~*)T5ysj-5^3?xV8<}6RlwD&fV~Aa0G)$hA`GjF5zsgL197waiZOW^Cgkluyk#V zh%G`(d@WM>129?5CT>As)a4Dav7)mAgkxQWbBVYJy)Kr-fNnUDB7=(noghm5GzaJS zO^om|a}2yU(bEhcDfqp-1MOpp*L57F;<=NKzP8_Hq9XBRgzrT8EfT={vvB-p;fx!J z70FC;IqFhOehit9i=2rvfz!M#rqA$>7;fTSG2G01Vp!&VG2FrjVmQl(Vz`x$#BduQ zjp24a9>X16iQ!H@8N*$CPYieReLL{<4m=aX-TXid_wd;m?&YtRbM*b% z`H%5yG0ix?5yQLr%@|hr8;y7)X-ONYO7fEQqr4<7gAX>+8=0h3$yB5$4uw?OXsV>W z#7BLV)ScVW4BPKY+MI|ySZP`ZMI}{9qf<_8+Pfxbl^^`&!1BN*(%0GYdQw1_6DMIS71ge{% z)Y1(K#oC68Dx_>6>d00xL>MBz7?|RKflRM0zdo~Em|pY7*zF@0Uq;Q&!WW{>(eSSONkO>X(!4iq6s{!v%U z?{%~~>(}Q4imQ)*Piv{@*#$E=8}Gr5kKXX3*eXp_mI_;Z%@yyLbT4A`aR1eb5Kys!D)$I!j+iQ zN>+(rF*gS#mm}(1Z%)IicWt(`_N_c`gFj7uVSQ4_wC!HA^R{c~wzmKuwLVL*o`>YL zDgCbShvILG)iXww+^O31`EjROdartnZAzF@B!Ae^cGL@DJXk(NTrD#7sFc`tI2><7 z$xt5xkz21cBVo-YGk&_@x;De)?l*Zvhw9$XUcGYj*3#m|^S2in9uvdTfMJ!vkos8* zHc0xc6&s{`hMdmYu|c+Ho!H>j}6i|8^i_)oZ(*!hSbe=VS{JO&^c@j8{}-Z z8yf|i32Y{@nZgDu28J}v_F{ts&Guu1)Xdm!-RB+p=9M6Q;Aqo7S|+ygXR7i-kO34lN!jtI3sw14(2P))vir93 z=c@AgpapSO%8ovrugWKaR)ihUdn16um^j?+$V!xgnd$W zWGfGopAGsE4@lX`5Dy|AlCoo4`HNNgQZS5oM9S{o%Ac*u=Ym~`M};w$s`6qmhFB5f zg?<9@q?GLl=bJ*jN6OA@<*!!dYr$T``-Fb6DqjxvBc2vmY%Yg*M&K(|`L!UAcvi{| z*36$n_zI9}XwD(ThhcZ2Z;l{7imrQg>&TVr+wW}UwW|DXa17C0u<~^9D#8gB$shi?I5oR z!L^2It_#TxkidDIugVvK*M;OJ_IZ|PRoM$}3CSBE*}Ih&KL3G`EP(_lj#qdAEqDb zerxZG@u^>0zp$RjpHDm=Kl=H5&&E$YTzoYB;jN$<#(b5PMwL%9zi#Dw!LW(l*kA*0TD*9*%nUGif5@HT zCwNxbhpL5!7Mw_CEmg|8edtn0%2tikNUhW=PnG%tT_SI$t`yNHzpX;lDt_v@Gy4bL zV!Kbhf-`sSx%b?==kGgr{%W^d31~kb+7{bv1o2P&l556nXSXx{uIde~Ny zV*!r6B*zMzvXUGYIF6DW8*rQ@IdP`NF1LKfO27k zgpdrAX8p2&f_`+XlsPj6MeQ5N=wv``+#z@(f(*9(c<);8CrKPd98Mt(H2oIF|-stZi>gyl!_%8JK{tyq*vM?vBMkL77D8ylzxrQCV z0RP)FKvoGk&y(?})+Xs(ukhqag?K>e?+>zTB4p8;oP!mX5In^jU{zLbgir{D9HR!| z=k?83)_QN;xal1kY4VPE8*kinL&rUTpwy^WYEp?cS~UcOkV+vzHI0cO-X93425EY9 zjY1aKQzb(ZK3+}Dldjf@a8PLF{WBsjv?Bj_D;EyU2uQ+KYxPZuQ$j$5o{anTyit0 
z4E8||Oq@Ve>!mmQ`UgBc-Gd%fjz?WUvNl7Nc!mY^J|@ojMiHupcf3mK1X85%*s=JT zk6oV}T04|-oK3Oqd9ANB#W-N0PaG*sz>`l#17RnFrcX(LFf5Z53Zbb{M_@I z0ZP5Ts`a16@VfhMJ20i(2eiwaETaNti;1VLJXv(n%n?YUUq0YBwCU@%ohG zSc+}LO4+yEx!4&y8sk%J-Sa6!2L^?h%x46UpKUx@_{rlC*AYX6QpjFnBt#GqTA?TN zxg)oNARc795k_H^LE(Nu+%BDxp|XV zo4Xp%D0D7gMU0A3p>Lf92~0tPLQP`l+|?9g?j$6!G(`hQPGV=>)f6f>OEG8}&Di&Z zv?3grVl(Iu;b_r~Y*86$Dj>@6kdZira0IGM&@Xcny7_S;&@8VS{Zmsyh*#;!uo!Zi z^t{I4TqWUDE=>nO3Wsq*rSYh0)YS?f!#77hLDOc-) zDb2c8_@8zaCYj5Y?Thx6@h{AWGv%;P#zoki%jXu)t$aVlR>y>Owr-uSdpdsqPaw?Z zz2W7Lds{FWnRh_~EFOSYR0Y~a;I1WdT~)DJP;ffn_XYf;LO?<-F#kCGOKm`kQ(tQW zpyp(XJ(W*->#k#oiUfps+nynwhZ-ZeCe(f;k zqd?X9ESVo03UITxG^kg1O4BOfoj;;j8k8WE7O((b^*5ma35_&0c6Gp?x248?yRA8* zcw26tX^yycUcDB?2p7EbuQYZA-8UP>+ozh_B8oVp%!s#7YTY?y4tGySlsR|3+qfI~ zELW~y9=de-QePJMUIuAygC5>?UvK}=WlwjnX8`IpQ~@jrCsPPvJ$@?-j>3m!h;kk( z8FU(^oWgTCAP~Tn--%wyS0%9%iAq31Wlc#r&ZhtjbEV13iMT61^r`Fj)xWLYWILn1 z*^(t~K9x3G(`MVFGUsw|G5C#zu^FSi85_i-UcWpYgxItih;~5MTZP*0S-64mVwvKo zae|{prXhz@e37d_HqIjiNY1t#hUBs>N6k?h5-?drxE7*FBGk9{>PZ{`yBbgDWJ<07 z!|FUsBw%Etq{BkJ2@b=DPoS8G^B=<60s~H=MY}XB$`c5$RT)thf)c97(;9%dty+YkS1^+b zgj80V2qW32<0(Ccgm+YuQ!Pl~rV$cD&sT?z{>#8r7}Ez{O0%{lY31~4`jWj`>6DLubQ=vmwrHI-5$3WlV&HT{bV8e`d{KiP5(*1U$a65#?1eHs!1b zD5!1t^}+iGe^vd^*?y0SQL*l1dD8#8{U5m2o$b*}8_xDG*~+w|V&&Sx>_XX2xTl#K zfP2=l+_Bj4{`uIs4YocD*o&U8Y_8=ii&tXxU)YXhEHE=;Ed|)$eq9R(`gXgFC_g9R zBL8l8N3Vu&tS*6N?OVk;-MVjpZpf4Oz{ zH(PffY_0xyYxO5vtIJ!f@5A88@p?TBjCvU02+*%qb-0)J`8B5C^Sj*{2bAW{g}M-5 zwxAN7@Ja6mN5fEi^uU}!_?K=0*}E<*c9f;_zqs(x#d{am8?LXvePgrQ`=T!MG`!Go z036oap8+gmvW5V*8;l8TTvUw{tj<rGGjdrWBTpl6;WvRL!oP&8EL=WkKgyg=UigfLV(7DX*4fwB>DRSi z<=(HFd_F$R`Fz?YD$9if0fCd@Q<0>@iq!{Sj{s{zvV=6NY1eh^BKlR%0CzBzk*B8u zf@%;$vi7Y=`+lL_)U~UYR?}-Wn}!lvm}y4TPP;~EXVJp)iTQjm%ufe|PSg)u;NK~^ zfo2RONq$KT{hhGhSK%_iC93XsZR6LB4aiE5#)^63 zb#W=)c(1>}iW&4(Oq@%!#V77NvMd`}ALB9vJmNy4Cs~oWct5<2Uo(gCCi-59SCbV< zE?Kpf+sg{akTL{3;?s%2WL@G1_ut#bubG1`kOaqoN5Ye=OG?ScH8=*mW}3|UUhpX1 o3tlt(F$)A@R*^uBMUvfQtRh3;XMA5=O7z6%?jQL^d)HL?ADP0lng9R* literal 0 HcmV?d00001 
diff --git a/conversione/_pipeline/transforms/__pycache__/_constants.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_constants.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68f5d79d8919bf27671a21666db512999bf725b5 GIT binary patch literal 7640 zcmai%dt6gzn#a!x;g)g-@0UijBM}7^v}&ywAZQc_O#&(e)4+SwSV(9R6l3fpU^O_M z5vmnzwbG`xSbOnawAFTWcV~BJ_kiQ2$A&hpnj=Y5}(x}T5i+VXp! z?|b>Z&pGFPPu}zM^mGje{~k3=I5$n_xL?Vj{7b`N;^icc8{q0W!ZmV)C#p8Skymko zYBxs`-sdm!tY2dSQIkZHM3PAgNhKQc3`ryDB;$;xQB5*Q7Re?#B$wooX=FN?L1vO! z(30rn$ZRs_1W)FYd7$&jv!GhC05qR01YJa)1N{M63|c^Rpi4+5Xd%f0T}rY+mysOM zB9aSQO!7dNlWCwU$aK(^WX1^|<|!eiK&!}V(C5h-(6wY8=nG^$=!;|nXc^fEswd^3 z6{Hf>K&n7DkxeZfF37(p#9`c&=ce& z=v(9z=l~hSnNE|pf!`r#K+lqQLC=vP(D%supb_!`=y~!X=tpE2^aA-9^dk8L^i%Q~ z=;!1T=w)&R^eVXqdY#+={ep~u-XynhrZ34?z_-aA(67lipg$#dLGO{@1O0vS2cSQr z@gI^ug7}ZA{+#yw6Y{4}Ba%_jF>)W2`TRn5Cl9FplKdIY#Jv8Td<$*kaxM7_8cS6F zlImYk{cEa!L-lW|en<81s79&&it67}{RgW5DC>EieEXFD{^^Y7Jl~ikBnv4*$}ZI` z*hE4q`Sp4zVQ=w!^Kkb)Q4k;bcJGWQDljC!Im`)3mE1*SWk>pfNtdijO=H@W-v1o$ zlRjmie~I_Wn6l5mLhkc1ktd3aDh7>NQ;z&M7@7aX5DeoZbEX{m?@u53^i2H8a6go6 za;Nls7{AuMDf`5pK5yK8+LR+7#oeb*+2?V*&x|SiO#D9?%$#!Mf5hEqP1)zS;9mNF zJdrt5j{MKK``jt}{8zlsyaOuT@BI5U6>hK3;qp1P#Hnp}lg>7`w$<$-&V5d=p!Ini zTCAoWfW5oio_2@d>2?W48f&M|CurLQtw-1`biu&(4riMKs%T}qqoYI6g1yh#x<}CN zbMAAy+}c+=okg18$X8ga%aoFwFjuP}B|<55NhvV5PjLCX{%oJ41&rOhvyex0K^_?QyjEy!!-?UjTEL(59@Y@S ziSC^~r$1L{6I`usEv~^<9M$XP$NSP%?1h2LouIk>fNAUV{Y`scTKOK$V6Hxt3oc@hc_1MB+;&ewDMB#c8PwPS^%h79CTcL$N{I&Jrp;EcH_`!y+Rg;H*oVQPNnVTL@FZV=;e|*JF5ll7A7yJCpnd7KVbfjP|~c9O{(cW@0~LH$%LdP!Ge8u=Oy* zlZdY~d<*di!w(UUGCYA8W;lS@%kVVfF^Z!~8b4yKZ)58l4BtWgF~c*6KVf(l@i@bC zhh=UB@MLf-L81Zd}rx4$n zFIqBzRd z?8DGy)`~kJbcMBsu=7=hJ&4yB9!9**P+9gj7%KPw3x>Ved4!^P2k|Cr4I|!St++u$ zu=@^T3+%oChOqk%VFl8X`x>>shkLAhgdsk*ZcMnFJ-28tZi#MuQr-B7x>+{SATFftk0pKy z=KskgzZ64!Cv;yz{*0EV?E9o!Mhm2JR`VTBiD2q?1`&8f>$2FI}yPZ6u}i7!8;JaI}pLu8Np>4xg%xT zEjwz;P1W|gdcD0yU)PG?t?)ECP7Qwx;cub}WGlzR_ilh|hC}%w-WTt;7f+4QCH^lE 
z|NcG25AZzq#vCtauOnoEHWT_=i~I&#OF>6RnXM?W&)o*vqUu z)Wan$S>&HzP}Z*7X$#2t%CaZ`ux!~fTcAv)&LG^bJw9KDw`^T;abW-c{ctH8oAk>X z9n1X9a0&G;Cw-p2rh7w0=^46eL|oDhIh)}2Z2gvDvxLoJkDLtRtRAU`m^Qr zy%N?tUEg#q&)xaI*LhQst~uV?Sq%4Xr@Rkv7ubqmnH1uBxEEY@@0M+`t)RJSr_I$2 z#?LQ5vFuJ=HLal)B5~_6shr z;FHqrR#Sz2t6|4Bley9&rP-k{*6OP*cBA!4GSqsu$V1iek>isR;6B1T(e7}$-11iu zyjSdYDVcpX36grR+v$=ro_uGT4e~p*zRqB_RG5u*Ry$N=+pA0_E9@$Rz1&o}L#|4# zsjs#gtK|cEmc6#VrrcmQRN6OJn&n1~aZ|0yY^cy%WV3Q(IlPI@TftFoOt){-!)JwD zL;KpT`f`h*PL7l8RW+U{}EWBMQ92ZOAdH%5~fmctYaJ=SG3YR_mF_$z)HNhoiCj`@DIb25epmo?V zVjYK|a=3nE*LYgiV8x?kb(Sia7E9yOGW)zkmP-pqyyMSg4CW4bFI9Y_ew3I%n=?%O zLe;oBHCz%)ki+k2m?(#_BrZ_{^~n@c7^G6r$gnE*423k-l1?FmK_-PP2H6PN)BC(J z1?J@smBc|;H2i(bs&ykuOAa&0r3N`1HBf*WC_oJqkPRL&$YTc6s6iexP=FdJKn)b2 z88DbG8?1`WpawH#D2>fR$eA&yj>EHyhOBW|y5iEJIIP_;;zO9t4CYXS*&H=cfEp-3 z4HTdT4CYdU$MdMcTxOsEHBf*WC_oJu%x4)qOAY2T0|lso0@OeOYQR9t3>HuWEi+Jn z8Yn;w6rctS@|nRxYOqL#lGt+;ejvlD$BQWxuofMKCA1}0NMWfArLkoQ^d>4mGf;qL zpa9K)K@l@3rUpgq5)_~Y3Qz+Dr~!lJ%wPpISk4R-pau$10|jJ*#|&07gA!`6k{KvK z4HTdT3eXG~lrn==)S#3ZC_oJqpau$10|u*^!SmE$jSMBRwG`Gdc!9!t1}{?Bz@Uu6 zMqZ}Un4YItPI1-a3Wk*oV+Mv*h?^L0Ml>?qg1D7oHO10c4Z~VQ6T>>hml&FHtc9T! 
zv7TWA;x>lc5qB_bq*xMbVrWBbX1EhO+Zn!$=wR4_*vgPFd@L~BMX@xto8cacD`QTE zdvWY54BN1^ouLcS&9DRURfZl!FT;I^`x$m29$@H4JjgJBc!=R^h(BbwQ6)dnl`*}F zJy667hLwm0hE<507;a|x$jESuic8BFe@wYWAjQL3I1?@*4`$Dr9rhge_xi_X7d^-<4yKJy z*Y>px?Csw>wqWBo`9IVBRQF(d^%$4?UFIy-aa_F`-02Y@iJP@>P<2joS_5|nDrc$g LbJINXSK$8u3^)cA literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/transforms/__pycache__/_encoding.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_encoding.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8df4b661aa428c8a2385b0b23d0124049c838a1 GIT binary patch literal 2995 zcmb_eO>7%Q6yDigC$=|^8;3&wpxYLzIJ9wTp)Js+xJep98@Iv<&@|O{<5{O$+3PU7 z4yok|A1c*pD>dauaRrF2IN%5l5fV}n2M}D4kWeBY2ulx098hmYZY9(cZ+2}bX#?TF zNc-NKnKv_U-h1Dh{n6+1GSDUlDr(WiFu&7Itu<7IN*;u11~WqpMwm^YA%r>1C)gx6 z#3lJ59x>RZxNw8gptzLAIJ+M7<2-J>$Pc+!oN>d9XT=#0%y?Iv@xqL7o(&3_-T@-( z@f1nQ88xM;B34C3i>0utjlLkB7z~T?lxB$0^U0A^VuvWlVv1&{;xTzZd1HsDt8`cu zqkSEB==~yh5YQlN`K9Q|{=>(QNw9;|A3k9!16QF@beY4fQPW_~v30R7V9uZ`?H1?rTpVHkV$%g=G}cDXxogrQ&!Q~zS#`Gy zY?htmFtQ!aMwy^1Lg`4Scw36A5Alp2%`yh!VVW^Gd|GgV(bB!U9>yP zoTE2;pkee`_5So^IG1zIB~&-a5_&DmHA+(HG3a^hUM!W?44qmh=I(cf&Zd$|2+QLt zRzgG`4aHL0xI%OYs1lOK)G;NYf@Ba`vqPo@x3vORtzT%z=Px`L7Z~LDpy!r84H{LSe{2(*$Z=Z9w zS3nJ1Xt6}GT>Y{UJ8N;eK?vZ20VzEaPhNo*_ysQ59!)`(GAI1)x5y0vc_+(S+jf%P3M2NB?B>p zL|;Yg0+;2;rd2Fx!Yxbf&9YgJraj#~%f=~y0+D+(MZYgPEblPWVL69=<**wL``lqy z9rlI8-U)gwq*$oOLb8Rz7D`)a&_b$(`Ye>P(DD1|OvZnv?Ud3Zhtq>f#U>HVtr+wj+~%f)XzbejcrTNq!B|JQQ&=1ERQBtpO5Nzio|72 zCA(mOPRjydz6Sc{<=!LvVLCwdOd#N}-+Zlw?dJIGk%eW#pN)KLL+)tV%d|c=-G05j z5Pdh4J9=C2UE4dkcP{W`X?yPvkCcQ+u6w1ig}~z`|Ec$;x47pM|9l{n>stssy130R zje*<#mTTja<5ReB*yKz8r}F%QyJhOsyj#qB76kv)`uw3ee#67Xgcdrxn2duk%^3A^ zu?obNyiO|?liE)O>{@HIOf7>YlSMWdxkzSh+o1yoPwm?I{ORD9EmQZ7|0S^v21eR=rPk$Cjj=3!~ zUnSQvlbN{*(XcFzFar;UrT;pF7mMZXsNf|cNqj)iJ_wY zR~NEb^Z|ppkGRE$zV1qPiThV_C0|>MT@Sd_sc#LDG)>A=$*?~9+UrMN9yl6#r7Q8% z%;}707!T|1TfoWpQ843a8^%GqpQcwCen$N6%i-PRMUVkH^kl0^iuOr^k-mYb^lA_3 zgdv^Y7!ct&yQ$`EHAYgTi>{`p0>=&HSds{9eJ(f@` 
zPSp(iNw#5XCxC6?j8h+Kl`)yZ>4efrsLeEbdIm_DLkRuC^#96u{$Tc%8{BA9AzEgD znS;eWvs`iiO{qe=^}Obj^K$g$%<65~1>EUb|^2av1_{m#LF zax}wq@Y4V(c6tL#8RS!bY`6Wf^p_gaY3pBo z-yL@8kWUnO z;@U$ljqTBpJ9V<&dtuaj#VuRBqr+D(dM}P#!Im}O8h>~6+?#KVV^4$E{Ui64^Owfm z-m6zGz4n@0p{w_f{r>1V_Z9bu_pK}L3zvQbr0X;ghj=o>dquw((KA#`BkI&qiGEo4cZ8b!Q;PBc zT}&HeBqo4s97qof?U=4?MdJY?XmT1!r>;oCoL17^rb$48D;0%NkyZaFt$qvEC5;lT z(%?nJpV&;|!dUGX`qacBzy!>O2oo+K@QYv}T-gn#Z2%J>mCW?2Sc@ zH#*+?z1~wX$5Yx&kV>ZZ-hx6QIMEw>sxg}hO5bUcHH4RS!d%cV$|gQM4|mQdVvk7t zbFvPu`_!zg4f0`G@0*+B!u&}lQ?~@!Fcsu|VV;v|zpPXG zWTP_A9Q0L%;o^FWO{>!M3*KNPJma0^h=6y-QAke*`eq{`zK`?W_H%q6@y+y2MZ&jv zBH+vH^UnF__@EzZBJqWlH!k#f@o5XQuDK<-$t!vZyyM%vmk-T}OX}+(ov;ROS@>7T zmZ{xp>&k3mc6BLrBwhW(+);64Xw%PbVKJ?6stR&FvjWlOWFG7`)BT*`N?41 zxXV_poKBoxy_{;!u-$9o=IQOe^V!q=o5R1-{jK)Vg}-4RpT3;Do${@ zePwfaiyhoz2KV8R)H4`?cR|63W~h{71fW6T&5u*-^cXS0xRX}IE8u{G9D!VJ1jbfC zCE|p^R`>+G_`p}TEybkS&Ml^s9EF}|_J9gT*~1V_3<`!8p(CF-jaa}WMP+E;XJRzR z#558WD58@}3-?~9?m4Cf1Y;yE;MGNVw>q(iQ-MlIq%R>+Ob>6oRYL*F=mSLzDqF*# zuA!DhFKIYK6b>5G+&vALSm~`7cLs0$bq01C05y9sl@K9DmF?JikPc32%0H)rDp8^FDLtimHEYt#p$%_L};fjiY&`V_s zBB0s`WyApJMTH_fOI=9>4D6-?yVc#O5Us3RWN_({aykTP(o|Af75Ahs?Ma!bC767;Inq(-c?C!?~bXU|?&#-c(MMfu!kFJAjVrJ2m%J(I|& zs|Om?TB%g86LMyWby~)vT9?apb^Zk)xjuYs!gnm{S)4k4%(FN>{ZIFf{c~bc(JhBO zU1vQ)cY$IO^vuZ^v^~zVdjs%i)PB9avt!b;AbA#)n&i3ZS?D>2Bn!`6PrPs^;M+e3 z*=34WuIOYF+;0%UcU@J`u?-Ua$Mk&~Ue_UpTB80;yFimJn8r!V2EX77&dvIq8d>8y zF4I#2LW&~~U{6g0GdyUXIWiO-E-8tE@u#W@*8u1LsAL8eF>);!S>Or5RA$R~6m|Xoc#6yr{t>ceD$CZb3?+urtbK<)y3HO< z&1cwS+4`p4n%b46#M0KGQ=5iN&8yf08h9Wvu+@BWW9FBkpNGDPWY`Nze|3D}&Oq`9 zkJ*+_>OMX6@u97rOTT_O(>{)ajW2*eQVSXT&?6?@=#H~b*v90U9lLAW?#kGY|6+Wz z=?l-JbB|xjPo*gNt%m;1m;ZY3SA&0fHp5;nEDg#(YZ=N~>Q_b+qj%o?@XAM%4<^6X zX)WfQk!onp)-)(8Pcg3u0#39_wZ^(!J1c?p1QD%8L z=w?V{evaUaBrTZPA#)lOwM_7}$-1e?d{~rqXbcImDaGD00=(xteCXmq)T}&8jVaW{YY3 zrh$RBi(+8J;pKmu-Ye-gbbmMvI5--K9-&t0Nf1Je5sUgTP()Ffw~)$Iw@%Utbzt%r zO%9cYqDoW%RL3M;0C#pEaaz@|0M~_82c?oL!rTJ!ljm9CM(O{OehO4@E=sejYlA-Z 
zz^M&l(cuvBaw@5k?)c^mF(WuX4U+M>c=3gZ_>-T#5_gNIkC`MBIaQgvoL(|j`k z`v455Z=or2{R8Sx${e$BZRNE}=cOrdNAu>Nc9p){48|QDKp;JkN^Y8@RZKP0n1Tih zF6^O7+T$?8YZldXcPp?E)pmC~oK<-kK-I6nK>q!o#U?I9(E5@M;O%g}V2})97hXPc z3Ntje6s&NX^Dr?C9T+bV=7S=66-UowHiF#%I^2?Mob?G_Pz$rN1#A2wPp13=FKfaf z0pmmk^nfA{An08oc|l!zF=8TuUq;ZyL1nY;|0`)1TBn zX6*;)cAH+v*n4*DC${Y;Hb&C+{&dqTo4)O)v*55b*)uK2cUoTFZh3i=Nw*B9n}*_+ zC)VZ%ZycPY%CS>*bi3*(z*bdnT=RsjQYp?Q$w!M17T4$x-b;qoPOOp77uOdz>4)#F zg|=Q8_|h`?oAP}dt5$0h<{#^FKZIlEx>PK?mS$U${1)4>#dLfVwUE;aYW;(PT4iAe zyetROH^VoGU?pp~6hr|eD!zZCYJICEI8-Hjx9Agqv-P_| za7YiK5g zq(GxuE0t35pP2c3D2?juF;+4Q8i@sGRSTy9>BY2nv(It*|9B=ag~s54#Ic-(vjor( zIgm<-r+O9z&aR6w#~2r64$v+Ulju3bHP=T^Sb{zv13wLrf>8BW!BZV(Z0_A zVrui~7q=d}hF7m8hg0+e_r1v-Ti3R&D`V?^L;#p=V7vX~-BkTO?_=A*XN{k?uD5Q8 zY3D%Pylb=HpS(AjI-R!l#!XLH+k->T+J0qGtyVa~*N4A7s^#8dhqstv1v;H|vcc=+ zB2!*(-p4U}Ar1nLPgoT4GRKRisLH%@M7<~*L1hK!Ac{y3ZOI_^>BRZDAWz=Gj!w)l zmZez2iat~1wNB-2`Xey()Ho1|aPvWah}?uG2$2$g23bx^)ASR{_b-&`zZos9`7a7H zBSrV*^fmNQsv}21vDUev-5B36uJ`R@Gk1ihkEZH#6clT=HDTlY+TwcKJ~ng4cDgBb zK1V^Z=HBSo5H`Bj-`vM$?jqepAHf-+D4!9Uxsx{f9G%kUC@j}>Yf~F1)@IjhzE;}G GG5!zOB6Z#X literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/transforms/__pycache__/_headers.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_headers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55f3d9cf326455f7ae96943c56ea8fa6ef81fc53 GIT binary patch literal 7344 zcmcIpeQXnFnx9X9C$Sv|hZM@>0}?|B3HczQgwLc~S{fY$wq0u0=XfT;>BLEACL|d9 zYVB2OLM!1cbYlc3%yCj3!RlP~PNh<%?n?XL{$n?2Ryfg3TJn!@Tp5OcU_Z1Z;0)Zl>Um4Q>9Q-koX(jyB2MF{u=kIyR$Dx(OqimHhyMQkgH_ z)7{OF94111Uaox$bALoGLa%Xc75PEo#*=sRZ^N*L zvMs1Wi*clnr>?vC=P;$Pg{qL&gQwqy1xmIJMrH~l4-*Rd0DSRm*^zWX80l_a=V4kg z7TS+vzbQQB$IyqU5X}>H#U4zEk@yfDV#dL{p1ulS5rm2`8=`n3#MTkd{azi+z@izR z+e%;ZExN+j@`ktFpp~s9b$YGNi0i864Lliym0mW4$QuN23h6+aL;4%U->}0Q?PZBR zU6vsUZPzN&*y;fg*i=@LvQ|+=(9?40K>u6b-9h)P&52%zRs5)8jO1|@`(km%0ql9)?)t&ds=LK11MgbpG zMt&RmvGSDYS5$gxQdU{TuZZBsBa{3^mGwnD1vDK$-^~<)#Ip(3ca!&c6FDhL= 
zn5lLw?pxRw>$tiz{j+VJ&=-Cl#jJ~?!Xi7VpmK{6h2GP6fn4|RF zz{4tgwEKf#?BwNl|E@Ex-04eFe|sont%=zdOi?yd$Hi*nws?2EIpIj~iJzvNT`6l< zrpkI{W_~96&IiGiVe=Y%G7+r#c}wG^y{$cAms{S|(srqI?ORVIC zW-9x-SAjjI!2m!^kEq7i&zv}Y{KWC0K^H9{Y?K<2s0SeP| zr91!v6rZtFL${UifDa{NF`$ zz=hzQXED6%hOiGji|uGh7~6jXg&%_KN`7!+BjnH!dt5_t7&?Cy&~aSD=t2#|54wO{ z&+A!e6Q+3hSU@tsSdcn|WhjcFu>85xr*PiCum$|(wTwj{&yWh0T9a`ZA)7;=P_mgGY zE6C~VLDt=T z;@l5zE4!QBZLRJWx9i-c?$+M+@F9ffXxiQX6uFA#b@L%414uTxaJ&GOBfphekZ>Wf z6NHP(x$0gh%g6_~yqc@QNuSoQ)CmocSII{}M2IX!vpv`Bi?s{2SpteTeQW6U2w)6* zXO^OjuB?tQR%Q&AHM7p5iwtBd2|zvD+`h=rvkr>39vRTucfe%cH@-gGWG%b=BRy9g3P#+#)h%gN+pkYnS71l@NRe@K!VH3g(Fc$_|8KIR9Jt-Q~ z32EHpvROePIXAcVHmncn=gn`!E=$^%7kkf{fyc7%8PL21EGVq&vJrSOoE#KuVbpQt zs!v_KUP1nfFW|+-ITMUY!+FFKHk9norb3(IJ!F6#%ERou3GIlFBHYA$Tkcq>u?;I2r|XBE)XA)uMs4M&U2s+tUeMHOU~0s2{QSiB}Dy-TqH^2 zLmNX}Bz{fLkW1vy5@3)@wY94xuTtU#)d=Lp!wXZ&c^TOj$8iyiH&2WQ#)N<{J0%?q zc7RV5jxT}-5Nbq_1RgU9e_%{5!jIPFFbpNX0OGrULow|uNB{M~*pZl&@Fx522Ckh- zJNh5XDC+%nMJ>6YuO+`POw@vje?)(OpxA(>Yo@(Hy%qwHaf16$ISijxd<}6CuE~!LwBl;D@34r)C^gxR0ATm+@9I$eWEuz}`_SQx znc_mipWd-AUE6!Ve%Wzg&X_SffQ`O-r1aUVtq;jXbL zMB&BfD_wdWdLaOj{~g4W2=UOq>4CF(#o3;AwkI9S&YorazQ}7|n>S^us-q|8X67ot z`N^q2SciX8AFW?(UTBWhU){N4Z%Er4;+@O(=0ttkzU$s_()OA26X$)~r*(HdU)lOo z*5Sw?h|IR873=l|CHhXR{+G6tb$jIHv~@c=AiR9l967EPGJ0$D_*eR^+MNGlb1Xah z<7XZ?`j>6}kyrDy*uZuc0xuMA`uwH;MfQ%3F^=~-5r>IF3~w8 zK%`UJ1!9TBAW*VWwQl3}NoYBL0YBLd0-V)cfzE%?ws=FF|A&UpT0d!hU_KNX$iL4R zTD~@P0_%KOX}u!Oi;s*fBA<^e+PSeQ?oT?tIQ6CbbN2)DtGRRI2CP)`i8o>D|7)KB zt7EX$!M&RP}d#8IRsPT{s6_6bUS$7BL-iwV9t5iA#>ygj_r&NhGphQWSn7kkq zQXD9uD*Y6iA=Sxf12p}Co-zh2=;_+g#S;rB9_bm((Sg8k$k8=zd=W!5mi8gI;=rBeIna`&LMjfxW)gTJC)-^1u~suO+f1D{6PbvVVDll&pT2BM|Ff~ zp!-gs`wX;Zq=q6_g1iF}VxFbzk`*7nnPU31n5+e;pf^k{vIWAg5pWtvH)JRft;VRb z4$CyMjY6$PwH@$TaJnj&WiF^9Pr5f1NVl{P27tu9-Y-v2O$iblJ^SE|O8QCD)& z3m#mD1V9tuh%aloykfErvGFpr+Ye*Nh}7j+cU^CyBi3=N?|R>@m(q2;k1@mlMcoz> zv%%cyn9VU5_$~si&?`=z_o_6^jB^w&HfLIh7pi*58NweAiu4wK5(NbakSwEI2Bsi6 z`oZF(LU7lQ_>xf=LB0qA5*BqY+-knw9IwBzGw%I(?Dkl~duM!QM^Ac3&+?Ams2)h) 
zi`)NT?~d7k7H(eJ{OcEf*^#n$&*?#YZMI*N;X9N12X(yfF^FCYhLURFiL7R>82rOjz@_7U(qC5iJ9zm)gpXF-;E_?b5`6P? XU>T^_baME9WAe?VcORig{89e{Qr{m> literal 0 HcmV?d00001 diff --git a/conversione/_pipeline/transforms/__pycache__/_helpers.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_helpers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ac5beb788cfa6be654464f5f46a3a2ee60cddba GIT binary patch literal 7693 zcmcgxTWlLwdY&seyosdlw!AiFMHXpElq1=WWyu#sk|S+o+K#lgVM`(y&RC{Qk<6Kq zFU%0DE&`N{4TwszD;h}}8oLY2@D{704|Rb8+m}A=3%N8Zdg=!?4bb|H8>K*3PyNsE zqDv^*edz(_T>o?X&pH41|Ns2aWYQB5rpDHM?^p@qKX9QKG=H)Yjj8CojtQ4UM^^tstU*cS}=Y&QjeNI;P*ieI1*4O2qHqs)NU*k$p_UkwLm?nV~7YHp(H$?8f5xu z;edGZ7M$H(5}hOo3GYx1JoO17(*s2DFpAVbcZeipavHOGTi{D$8u56QJJK=5(Ts4g zUW$Y-Lfs)8tm_qK#ug6YiZ1xpdLeor*o3(>Q4M>COSBueenl>lB=Mk@A?^`>OWz?K zq{~4nTAx_BF$(28qLALgbwLIgeqGl)B?N@KA#^(22jLcS1Q zgvTGc=}~CE5Kw5CMbV3r2T9@`pLEl~pbF@ya4K;~;1TSWP_%wQlHmHPquL;-OI9q~ z*NF}QH6B3xIYe_r)?76oj)m{bb3@sNrhhee#XZT79^ZR(@3$X+MyAYNbMHa)9cPHX zci)&G)7+l#30miPX*9zYl2@La+cU=Y?`f!7wbZXzn$ni0WPiqDPZ{msvrwDU5Y|0g z>vKAm)%@jqny?%dv7G&7O+T*M3dQtzP^V36=@QG?7DXZ@1+ni&NcEz3!eZ~5>>oYcIAlSWwst1$hDlfRHQ z=D|;STiAhbCGNhiy>x2Y9Vq`lPM|{8?tTKV;hlNg?Hl1OJM5;!7nskw>BP(5(Kp@u zZg0KtehN3qvT06o(=DSwfv9wab(7cI54TQ?i>(T=PCD1gGpG&@JZzZk@4GlWc46d# z%^v0)<4&91+3HYLjaT^Dg^f*3wti2@Ck6dM+bmv>?c2F8Y+?Om|4`q>vB3|ZpGh9) zTcPh6wR6~jyWI4IKxW&*4U_ik!25 z7_dqQ+MN4h-QnylQ9(3iK10t5h7+Y}2<4a`;1f?M8M;=~4=r~Ye zu_FEpaDR??X)w(nj~$PJIt#}q|%Zk;RwmO%@ zjJ0dU+MTv`XRJL@O%{3!i7Uy$-#MSvW)7cCIr>uOcT?QEuUW}Y6CWpLmQJOt-Or9@ zxUvHB ze|V*{Ki%2?&;6Osp_NWoy3>{EyqIaalsfo9$~>ClMpsRiFZL&yjI}vsIh0cMQ)GhU zL>&-q0cNb^slZG@47Lynum`GIc{OzdRwMM33ONHroSe7_@WaaNZfa^oBWsXV)|3NW zz63L)PJjg;3G#ZZLXfubZd%K~{xNJex!^w$?>*p)((g2HAj4@RDk zi+lpZgyOYfxbqPzPCgiAfNHv=o@~r6m+qcS-M|o1;R;6wgd4tqPc@(Ad|m>JQXUh6 zNSN`+V>i`|JR;0M=^_e-f?|QXP*BPHgDB|tdyp?E7qj=|3nCbrxAUgC`~i%HZpp5o zp=O{(O*pdPUiFjqar^a2$Hd2-ZO4y9#vSSiKpke9Fe;HYhaRk!kl!b%kc9R@ 
zFGb@4qdUN(7CcUyS~GBAtZ!rlwc{EGjt)cQs8B_U!ny}tmq!)W8=MVDisnNwq;9G1 z1aa0cDXdyXKf%4Qajnoo;I={sW@i)z38JKE0j&Ijdq&aZ@ke1k4D$CxtS_rTRf_V6 zwbu>6hdf@%4Jdij4TJemFfaqwyoiRNPbYlhJr%if45jPN(FCi{>4^$6*zhLn{I%G% zgeGG;Fn2C%GS8h`wbrc>``9YLnX3B5o`s&zPer*G=DNg*l?G?J0TAYSLs!;VIe$KO zKGFQ#Xv-R_zcW_M55Y5<_e2L)xr%vH%=AP`)F!wjlN?K)OV_qOt9{O$&2lF2g6KbG zbKStSTrUBf!8*Y0jA4J0O&OX~OtXqMTYv`JVfgW{VT&&hG@EqNa#kRKP)WB2e@KJ| zaAIU;lZS&UKKEPDAAKuRGNal;7>vr;eMQ?Wve}X-K?JO=5I~@4iZi3nWQJ$3gKEcl z{|e8Wr%@eVyUh$xgM0u{+S??!+DcllT!Ii@b89c`@9^urv& z{dPM+^1-*8c7?^)w!i~G@W!%&ykyBfEkKM+ssi1VcPfT8;{lL>k+5~L3DT~JO_iV^ zIp0!^fOS&;3iKEvFKUgG(0YB`Iqm>U0(IkhELPP)y4oV>1QZ^~YB&wTq?& zQ{v_~!H2=j{*L9Q^!{Vd==A=c=a$~Ar6yrr(8pOk^`!+(oXlG4vw2Ar7oKzbU>DI# zF-?@rIS7L>znA+RKO81N>|*=Gk+__^{P_B#>&vy7<`Wr1_p@Uu!`T#b_Q&!Fdy`dp z^ef2xHH9V8rxZ^~P|An0o+(&Bk-MouNyuueEj%bz7xJK_@IvWOgKRY zd*^863>f=ZH+$25h%tjZeU3CDgK9bB!<>n2q4oyVshom#eLnz~X4 zk1h{ptM)FQS~!)oW~!Q2svPMmN2aPZRoRxKNvktQ)250)bA-`S8Yo+{KOw?Tmt#xS zOPW+wN6OeyP(`qVU~i%;(VX}wWo}Gyjp}m`H&3?1i2z>XS zn&FoLd&BRU5`nFl#R_{~b-mEwnL$``E4uSP8yOoO8Fmdi=)C;C1l4(dV-aGroy~K* zekV4g%Rq*KRD2&IU@rVrnj6e9)$GA9Di`Y)>T?8Szw-V!PJU=Bh_ftr1U8uEj4vxJ z^ZuCs2OVqF%njs>z?FDSv?s;XsvC8f6|LLN2fc20eg}Ft{0M{J4Okz>Qblzgp)s7p zE^Dk^2mR__1l7@_u+l7;8w%|UNUHvKkljzK87}6g1<_Jg<-Zh-$DmYS;=*kh0B#*A)Bv)HY+LKi|0+OYwHJs!+ z?c|~4XpVqn>1WGb&*&2_>g2dvMK9HNtP^2e%o>NvMt-O<5wQCY?k7Tq>=fM?~G(y zA+h=qY$?Zp$v6wqa0=5+fTh;~R*NhU?{hShWR)!CR(O>CN)OdfdFX!Hqv}_Y1iYu+YLBL03Vm0ql{f1R*v`}MQwMGXuC2Y1A!3;L7J&T~HW!0Ku zsIgvA*p1`;z*x6W5YhM~@AGmtToT1`$2c^DZ7vtFO!PTIb0ly(dpOPWGntQhnd)-q-MxReCzqbJFo<-|0h+*E2|xEV}irYd%3hJ|0wz#Bb<}vP`bjwT^5nkB|3ns5eMg;;E~Vk0bYf4dA?5K)Xn8O z3291%UTGELUx3_%bd9#!rM8H?_Q4vhb<0}c z6Ajr>2x5(O%gKdhF7yxtvS7*h6{96`@1-2*=h~9QwcovDyZ@p9C`QdoROFIhAye~H zNXhDl6sG_haB43)_KegWm{gswZW<68)5J5f1>rS2JxQO37d4Z`I6J$!vFY4^&@{kK zRuw+CytB1=$CiLp^G;iHJCqu0o_qoQ5#AzpU78lg+`O1pATF&G#s}e9MEqEqcKa@I z2;tqAra?DcBWb0Z^QKkKu`$lerm0aM@3m_X7ArIc{SwkzFLy~&H6g8b@Lq@LbEVaY 
z^Y|t>=pgAJHRyzabVEE6M8_o`Vg(Hlu;s32`&&nR9v4W#$ht8pt4?J$cH>as{><^3lx!S8M^}DAv&@n;?YqJ5HE&XFo3Y8h?LPXqYLXI z9Z92YsyAgcP4%UW)~SPX@-0z9Zwyik*0Py5!f!$;i# zaDP;J!x24^u-4C8Hz%x{lh)RG>$Ze-Thh8Cs7o={4=WQ)U8=MqcrcsjEnhLF1w+a8 z%hQ)5nrO|Oq5hMtKi_$GXS}2LfjHN6G-WK$RmaQs-e>0w-O?LVsAFb#c=wIHNrNrg z{?JgDs@ND(r;O|7jkbi*7PZV7>!4w(bUlouDVf)lCp6^|%OlN(Of}3SQ$y&D*LO|t z3ca0VHr^SImc*HkIMp%RmdzJoU5I7ig%w!>uiZ2TFEO{l0UmNnihIZxOZo{}c>|;f zJ+ha73EUK79->%NoDKh!i3YMCLzVh~$Dav_3DHh3X^Poeqbb$8Le&q$=D@3CI zgt8)6{@PFaX$jKuDlhkkQF6tAO3bVDsA;KQF8Nh5KR_LzR)V=a|9+*PW>u5WSmqLs zjivw!S6KzA4y`dB=~u7Cv&<1X>Q}SsNm!Ma;%eIqxT^m%az$PPzWnqg>>*ZjVF1=+ zM6~ADllH6SwE`0d02t8tsnIgIeku2Bl7KNF7!MMt|vb zv^%82g~F88 z`L*);26P3Z>f~8~LBlfXDyY-u>-5qd?RegL$n8ih4TY?%BVV~D$bJ~{Kv6dVyH#Y$ucZ1Vz8+7|zt`Ui3dy@_E!_XT{rNFtyMIOe()Fi9z z1Zuq7Hs%yX4tcS+F46LGylo8e9^STfTdj@d+`x(p+;y+$+hc1OwA-@&v8}qg+IFnP zcA6J`Zl4Xj>%L>1n4W8#hw;#uPonmD%r@?I+T6S#!kqZ*Y-R61%lxEjzze=e;0ZP| zPZ}T@&aQHNzvRw5*$a=7Px1ttAY_DOycCiFwrQYcKR7UFJ2&?`H&1>(wfXa3pL>EW z65kBVED>&R4m?xnboLTt!JV;(g|K3!l^*A1NO*5r$$Lfca5xcRsz`+s2<*Ul0bgPm zoF6C4B5beH>LK3CLgbLva`GGyhBCAs5_?hBfmFIcB(U_V}e8PHAYGzXKt-a=}n=w>Avg7r;jJ~l|k~U ziqMzOGvLisMx2irTgq4x>`a-=K({Vh6&<`&mn^kEG}&X$Uslf!ecJRh-`_XScb!gj zosOS58y`5AICFmf44XK^CeIAbbqz!J(21}rL}rwP+44h(Jn9;LY%l_k;}P=eEuDj1t~tRD{JEy7>OKxyr5cl{*uaJMWi1ARl!8 zmi|@c{H~*kT}P9cE+cUYuj>spieIoDBB-zHHshWOy>ZY0v%B$+E8Cp#_5} zwDY|{lzeEY`q&k#`G}vb{u^J)ST@6inMmgc#VNBTbTPawvhDqS-{VDIGvEgV7CbSn z3X+*U1fvfQU6sW4;ks{LCG_T$!5lgfyc8_{{VTQ6q2w!@=b4^xU8p*Aan^kQ;KR1= ztDSMCXR0T`^eiw%*SAk^57{GcMBn)7**j-vEy>#LNoL1y%=h2?+gIXDZ=CA=-$f;% z?iT$en}r6BDdAO*g_hLPZLV(UX*>w?j9@3Xbp zlvT>D11jKF?N<=f>=6X5id9?Z*U3*xp{nYb1m&n1$Rg~1OL&eNp=4KQ zL0Q6H9P%OT#vtrdOF*5)R&6evbos`;BH9L(X&sI!WvrF*v_^_h#~`NELcp2}IJMb8 zQG~Y&w^zV~^SGMDR4ZVrd>)iP020t)Xn7WH;lCk)5V6Drkzt8-rXpMsIgl)=o;r{! 
zF;5*%m6Ux;6w@6+I%O`O*%RJ#V_#5{Vho|n31;J5O~>q8b2YD~^riFq%7nghZo`(@ z^5llybNW3gJ;2`d;pyJT<(0RYn@qGPS#A&ZrOGQHDc$%jK^NIWiUo6Zv}?{>|4H|( z;xo-JG@t739l!4clu21C=Pfk}OU+#Ej{Aya?VdTy-jrosmdqR^*W>d5`&e-FW9o53r(?ugr z0ZzO)+OU{LTp-pPkRB!2)fihbA>*Ahka z611ijERBuORRybh&d7N)cZ*=ve)1Oa*A$!vYL1~M7#ZQ?3xs{aEaGD2d`4ks{h6`_ z{9sj@9!9?LG0E3WYv906xWtPiS*XRUlvW9x6S+q2D$#5BQ4R1Zg7=TQ;Yq;cY__Q^ zZfYIi1;e4*n0vAwAQT=3ImEFJh&4nwmJ?d`f+y^B3$M218{+LMOhNKLQ^aGFd8R&G zAE~<0IB(jNFl|bjYJ%z%W145yCz$n-?m5Qx@u^tZM+3864{NuBGc>a!yd%eg;5^KmgGaMd{IB$%G2BOyJr6?aaY>8{ONXRy6 zJl_COgngg|qKJl;CVP#YOw&Vd=diGBKVo-A+K&+^;QcsFvZN#>__*^NDdi8lZawd; zKW`*Kj-bh_Ivjafzi5ZU4xnKNfRV-GLC0|-?SFPB}yU6YX}}(Fl?AN)FuqI(cw8mORB7V z=2G}lFVxUnkd;skqBR)`dD58oV)I&Y0e24dFOr5GPK?~LnrX65|K`OR1d2UzJ_ zge>KE+O{+Y>@~|IjpZRJM+>Y<;*ZH1Ecm=f0qDVpJAf_~e9)527`#>FjF~P6WlbYcf!33ZWr9_z~!N>aJRz^?}nU!60n!{@52W}1D9RU-UaPlQ0Id7E@;m}dluTW z@Q#J{5=uU?;Nh^#KdET3Nx=$y$I1g|fs)~d_!!INoimgdD0?FA?6GSTT(JN;*v(UR`4JG|vj~SpUJceYsWeqCfTxJzB z`%?CE!DyP%hP4?5S+wu5-n^)$O2Ddqqaz@4IWu!MeD=np zXf>GUK>!AIOq}hR-5xL79XIYym8^?YN34;-XjxPnH#EjIjaj@numUeY-Y+Zdf%>E8 zT5`x@r=@ejH2qrkLnZ`}(urPL``VeKr~8if9qR?L;d z7g^^BAJl!%BSOQ_;Ue6^A0PoqrSwHpy%|bFw|#JO=B@Bs83KwQSAU1IrxrqAj1Oeu zrrPM0*_z)B{+;_*?m6bjR8NLd(uc?oyKW!4c_>5R@+ao+r2HueC)cSp-q!h`9WCd*87 zxcRA;HtK>@rnrP|lDk0>cZ2L{U2d?H=0r+Uk<#cFijCLZ({AXdR{@)P_|O=#2KU6N zQb}yffE+t3_+G?^t%V;I;qzhcu;CPOmpo=C5q7KZl9C(*TxH9yu>sb&@GlK?0qQ=5 zTRJ_RA{Ln1D4S%O;#AWQ93JsN4H#e`L!+2-!SLpS;d4KN2*?ACKEn+$hGG_~A+BEv zrtRbbdntSbcfhH-!;#&5ItTdJ04DHjiS$`L3pCg&WRIOS@Oz1S48CwroH6^{(#DgPM`G|UA+3kBiqABAJwA5a&Ap05XpiS7 z{4*qQR7#Rx5r6qRLi=bIL)JYOOvz`G4YJ=1?(8xz4sD`dp)74 zQtcKABhuom)k?HVb(KzH+CF63m#&+-Nz){VQRCd9iD{d(e41NQY30lIKi5x)X}TTB z=i&1|@ALote&_ztV9;U^&R$vfy|2fxzo9}QF!{{-PardovDg5{;w%xs2XK~TsQ@uX z4v;v;(j3Jy9K;+mOzZ^PVTx6~P7SCk)~KOIU9m<3HJXYwTBy-htkFS@?j~W^M;w=V zk1!nK$2<{VDCn~Z;{lt<31aA|ZH(hbI9u4mdq#NAu&>iLG3tY=u*b`F+QK1D@Oe6I zH#lDKbw5IDc0R%(w&SwF{j%ewOAfd5O27LhhfImwq$o2yCyw($FS02F8ZspO)(=8F 
zkBP-P7QdU%M{x=BBbzOOC5jfr2o@z-GD^XZmS_?6ssyQMbDhAj1kRt32#H2p9c3iO zugaU1C3o}rs7j()DgveLdgH&4RQ$grm0wfVJfCIh!rX+vY-7O`E&e2_SQVO+gx|pV z&(I8WBcuPmagJ(9%{wLaQ)X~t8;jBmIz?S!4Su-HTLTN@+=cx^LD=yWTTRK9$AfNI zn-=Y&q(G3XLDa9gCh z?fEhLHJ8-Y)#Z|0Lc5IFwK6GiqD%|nfKQak02hS3$os;wRuDbBC`|aoQJLU5nI7|q z-cgzHgu`5rl_`J77lg*+Lv{_1oDhKElEZn~;XLhdJDhzoGZYAUuM0AxAPXzg-q3hZ zlyRSo56Nm}j4;Q_WN=axkXzXbVdH4eXlRV-J)9=(4B{}@TkByK=;W1g=xTKFQ#jr-S3b;+9K#oM*B zT@Osn%ckbEsWo=`uNG^@ykq|4?8$|_E9RE+{HnQj{_yPK2{a^FGZ`_~V zjcKZ93~@ukv!bqpO^RKNGci27AJge`TQ%L2J%H&=Gsoh`5;xMi-BW$5I>XGt_`xMp z)51`yDXr_7>dWY=6Sc8pOVo~aI23FXNl@?yy36Y>i00wq`-?OIf8;|sVX$~*!gw7Z zwht>ne97@7j3pIUqMRGJcGK91%X{Is9)#!uhAZO_b^$Em5?*N@h2D9qSR^e!TEUjU zf+mpgf;$Hx{|-Kn%GD6yH9Xvv0r=8>xdhlixSs}6$GSR(HrqHby zkh|M#ybY2K&G8W~$W4a%laY3~_(8<%0bhi3m&m*OIJluFAe`(j+QN!3;Q&Pc#Qr^n zeTk7fm^HmIOJXhU9}e6Z_`o$y%rJ1UHxfd!Z$;g(D5llNz97NuYXPn9N4?MC3hLMi z9ziC*10u*PC_pQdET5%p-DYZhJGAS7pKuVODJ-Km&YX{*Pl###F7QDpor|CQLYbar z=VKbiU&lxTGu4+>V|rtvCU$7b$u~mNO{Z9t8P7sZxyS5ylmK zt+zAEBd{$jwR9Y~vQz=x%-lgJOZ0{U0Z&-q+(Q9R@VZ;s2fh<*&_sl1AOh&gc|*dU ze0kw=TGyUBwWRA>qPm`327*Mz5spLrh5@7{uBf9*?od=dQAHg|L?7s%K~`Z_s*-yY zE-vC*zDA<`MHL03<^x+~E8Ya$SV2EYRKcy_DjyMx+Ng}%Dku!8$kK(@QAR8>Z;5YN z3hB6SHTHzW^kFk9FRb264DNz;tHdJzf)YR{wsG_Ukf%`zO~e|h&)^glD5!mA%#8Y zgS%s7X-UiCs3$KmNnD4!#M3N;QlT~oK=SnS0acTWwzp8LCcW?L37$4=)}R&xK`(Ff+H zWph*7+>$acn>&~EoeEOgYjVV+&?M5&JaSn?kgz-oW!1yJAPWyzo(H$Y_+Att9F}Pi zjUyaih5kSXEYtmpt|g(RLbf?Jt5P7IuSaIc@6d}DUdynNimc_)^_NwOB<6(NW>T<2 z0+0Mgrdi+cu)rTdc}yP&*n^*LcG;^O=(v#%sC z&JDc1XUTXVMnSZ?@3~)k?=}6x_esOg$I|=yJ~uj2`xBOg2gEZsTb~pcPTihJ*R(7d z9WffBht-z%$oEb!(C?mqw>s5uzpi8P)Vc`&@0y9)fMl9xWHH3-pq%d5LQN`paglruS>v=nxQ>eQIeqV_|-4 ztUa@n*@PDeKoq)u2QqIEh_2rxSVDwSE>u(>q=+u75&&iifLRJ)7B77o@uPmX1Rw+W zOj6b}0=6yTW9DwYXY_cC6N=&mtWb~-x&AjG9#Xmp@t&r=x2Z3RkxJ-Rz^^C)t1OC+ zimN2ygHE1Igqm%}Dw6o6vQIbu3i|=y;%Mj(oe+d5`INC4AeR4O>^RNhh|bRxC9sb+WyXGqwqX$${iPUj72)Gb{;$0-SE6?a%ia0)jD|9WgG0c zda7%{(-m<|diQm?CWnVzlb)f*#z8x+Kwaq(Au|H!;k~1|#{`puD`wOqjBczw+Ycb1$BA 
zI^ea2E;8SbBE%O0I!>Fd4Av{{6pwgu5SjNugkW87m~vz(EnR!-0DA0aG01*Eeua|9 zb(p>?_l#~&y|8%scc$O(`pvEtowNKgonuD5ta7{Akk{?b(VNx{=L?^5f})@>qp}8> z7RSQ@E=P+ tuple[str, dict]: + """ + Applica le trasformazioni strutturali al Markdown grezzo. + Restituisce (testo_modificato, statistiche). + L'ordine è semantico: encoding → artefatti → struttura header → + costruzione struttura → testo → rifinitura. + """ + _has_ex = bool(re.search(r"\b(Esercizi|Exercises|Problems|Homework)\b", text, re.IGNORECASE)) + + _transforms: list[tuple[str | None, object]] = [ + # 1. Encoding + ("n_simboli_pua_corretti", _t_fix_symbol_font), + ("n_accenti_corretti", _t_fix_accents), + ("n_moltiplicazioni_corrette", _t_fix_multiplication), + ("n_micro_corretti", _t_fix_micro), + # 2. Pulizia artefatti + ("n_immagini_rimosse", _t_remove_images), + ("n_br_rimossi", _t_fix_br), + ("n_tabsep_rimossi", _t_fix_tabsep), + ("n_note_rimosse", _t_remove_footnotes), + ("n_simboli_math_rimossi", _t_fix_math_symbols), + ("n_formule_rimossi", _t_remove_formula_labels), + ("n_dotleader_rimossi", _t_remove_dotleaders), + ("n_righe_ricorrenti_rimosse", _t_remove_recurring_lines), + # 3. Struttura header + ("n_header_concat_fixati", _t_fix_header_concat), + (None, _t_extract_capitolo), + ("n_header_numerati_normalizzati", _t_normalize_numbered_headings), + (None, _t_normalize_header_levels), + (None, _t_remove_header_bold), + (None, _t_normalize_allcaps_headers), + # 4. Costruzione struttura + ("toc_rimosso", _t_remove_toc), + ("n_toc_orfani_rimossi", _t_remove_orphan_toc), + ("n_header_allcaps", _t_allcaps_to_headers), + ("n_sezioni_numerate", partial(_t_numbered_sections, has_exercises=_has_ex)), + ("n_ambienti_matematici", _t_extract_math), + ("n_articoli_estratti", _t_extract_articles), + # 5. 
Testo + ("n_paragrafi_uniti", _t_merge_paragraphs), + (None, _t_normalize_whitespace), + (None, _t_collapse_blank_lines), + ("n_versi_ripristinati", _t_restore_poetry_lines), + ("n_header_verso_demotati", _t_demote_verse_headers), + (None, _t_remove_urls), + # 6. Rifinitura + (None, _t_remove_empty_headers), + ("n_titoli_uniti", _t_merge_title_headers), + (None, lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0)), + ("n_garbage_headers_rimossi", _t_remove_garbage_headers), + ("n_formula_headers_demotati", _t_math_header_demotion), + ("n_frontmatter_rimossi", _t_remove_frontmatter), + ("n_watermark_rimossi", _t_remove_watermarks), + ] + + stats: dict = {} + for stat_key, fn in _transforms: + text, n = fn(text) + if stat_key: + stats[stat_key] = stats.get(stat_key, 0) + n + + stats["toc_rimosso"] = bool(stats.get("toc_rimosso", 0)) + return text, stats diff --git a/conversione/_pipeline/transforms/_artifacts.py b/conversione/_pipeline/transforms/_artifacts.py new file mode 100644 index 0000000..a3e2f67 --- /dev/null +++ b/conversione/_pipeline/transforms/_artifacts.py @@ -0,0 +1,106 @@ +"""Rimozione artefatti: immagini, BR, footnote, URL, righe ricorrenti, watermark.""" +import re +from collections import Counter + +from ._constants import ( + _WATERMARK_RE, _TABSEP_RE, _SUPERSCRIPT_RE, _FOOTNOTE_BODY_RE, +) + + +def _t_remove_images(text: str) -> tuple[str, int]: + n = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text)) + text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text) + return text, n + + +def _t_fix_br(text: str) -> tuple[str, int]: + n = len(re.findall(r"
", text, re.IGNORECASE)) + text = re.sub(r"
\s*", " ", text, flags=re.IGNORECASE) + return text, n + + +def _t_fix_tabsep(text: str) -> tuple[str, int]: + n = len(_TABSEP_RE.findall(text)) + text = _TABSEP_RE.sub("", text) + return text, n + + +def _t_remove_footnotes(text: str) -> tuple[str, int]: + lines = text.split("\n") + result, count = [], 0 + for line in lines: + stripped = line.strip() + if stripped and _FOOTNOTE_BODY_RE.match(stripped) and len(stripped) < 300: + count += 1 + continue + cleaned = _SUPERSCRIPT_RE.sub("", line) + if cleaned != line: + count += 1 + result.append(cleaned) + return "\n".join(result), count + + +def _t_remove_formula_labels(text: str) -> tuple[str, int]: + n = len(re.findall(r"\[\d+\.\d+\]", text)) + text = re.sub(r"\s*\[\d+\.\d+\]\s*", " ", text) + return text, n + + +def _t_remove_dotleaders(text: str) -> tuple[str, int]: + _DOTLEADER_RE = r"^[^\n]*(?:(?:\. ){3,}|\.{4,})[^\n]*$" + n = len(re.findall(_DOTLEADER_RE, text, re.MULTILINE)) + text = re.sub(_DOTLEADER_RE, "", text, flags=re.MULTILINE) + text = re.sub( + r"(?m)^(i{1,3}|iv|vi{0,3}|ix|xi{0,2}|x)$", + "", + text, + flags=re.IGNORECASE, + ) + return text, n + + +def _t_remove_recurring_lines(text: str) -> tuple[str, int]: + lines = text.split("\n") + short_lines = [ + ln.strip() for ln in lines + if 3 < len(ln.strip()) < 80 + and not ln.strip().startswith("#") + and not ln.strip().startswith("|") + ] + freq = Counter(short_lines) + recurring = {ln for ln, c in freq.items() if c >= 5} + if not recurring: + return text, 0 + result, count = [], 0 + for line in lines: + if line.strip() in recurring: + count += 1 + else: + result.append(line) + return "\n".join(result), count + + +def _t_fix_math_symbols(text: str) -> tuple[str, int]: + lines = text.split("\n") + result, count = [], 0 + for line in lines: + if line.strip() and re.match(r"^[\s■-◿☐-☒•▪▫◆◇●○•]+$", line): + count += 1 + else: + result.append(line) + return "\n".join(result), count + + +def _t_remove_watermarks(text: str) -> tuple[str, int]: + lines = 
text.split("\n") + result, count = [], 0 + for line in lines: + if _WATERMARK_RE.match(line): + count += 1 + else: + result.append(line) + return "\n".join(result), count + + +def _t_remove_urls(text: str) -> tuple[str, int]: + return re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text), 0 diff --git a/conversione/_pipeline/transforms/_constants.py b/conversione/_pipeline/transforms/_constants.py new file mode 100644 index 0000000..18760e0 --- /dev/null +++ b/conversione/_pipeline/transforms/_constants.py @@ -0,0 +1,161 @@ +""" +Costanti di modulo condivise tra i moduli di trasformazione. +Tutte le regex compilate e le mappe statiche vivono qui. +""" +import re + +# ─── Keyword sets ───────────────────────────────────────────────────────────── + +_TOC_KEYWORDS = frozenset([ + "indice", "index", "contents", "table of contents", + "sommario", "inhaltsverzeichnis", "inhalt", + "indice generale", "indice analitico", "indice dei contenuti", + "elenco dei capitoli", "argomenti", "table des matières", + "tabla de contenidos", "содержание", +]) + +_ORDINALS_IT = { + "PRIMO": "I", "SECONDO": "II", "TERZO": "III", "QUARTO": "IV", + "QUINTO": "V", "SESTO": "VI", "SETTIMO": "VII", "OTTAVO": "VIII", + "NONO": "IX", "DECIMO": "X", +} +_ORDINALS_EN = { + "ONE": "1", "TWO": "2", "THREE": "3", "FOUR": "4", "FIVE": "5", + "SIX": "6", "SEVEN": "7", "EIGHT": "8", "NINE": "9", "TEN": "10", +} + +# ─── PUA Symbol font map ────────────────────────────────────────────────────── + +_SYMBOL_PUA_MAP: dict[str, str] = { + "": " ", + "": "(", + "": ")", + "": "+", + "": "−", + "": ".", + "": "/", + "": "0", "": "1", "": "2", "": "3", "": "4", + "": "5", "": "6", "": "7", "": "8", "": "9", + "": ":", "": ";", "": "<", "": "=", "": ">", + "": "≅", + "": "Α", "": "Β", "": "Χ", "": "Δ", "": "Ε", + "": "Φ", "": "Γ", "": "Η", "": "Ι", "": "ϑ", + "": "Κ", "": "Λ", "": "Μ", "": "Ν", "": "Ο", + "": "Π", "": "Θ", "": "Ρ", "": "Σ", "": "Τ", + "": "Υ", "": "ς", 
"": "Ω", "": "Ξ", "": "Ψ", + "": "Ζ", + "": "[", + "": "∴", + "": "]", + "": "⊥", + "": "α", "": "β", "": "χ", "": "δ", "": "ε", + "": "φ", "": "γ", "": "η", "": "ι", "": "ϕ", + "": "κ", "": "λ", "": "μ", "": "ν", "": "ο", + "": "π", "": "θ", "": "ρ", "": "σ", "": "τ", + "": "υ", "": "ϖ", "": "ω", "": "ξ", "": "ψ", + "": "ζ", + "": "{", + "": "|", + "": "}", + "": "~", + "": "±", + "": "•", + "": "√", + "": "≤", + "": "≥", + "": "∝", + "": "×", + "": "÷", + "": "×", + "": "≠", + "": "≠", + "": "≥", + "": "′", + "": "*", + "": ",", + "": "≤", + "": "•", + "": "•", + "": "→", + "": "÷", + "": "", + "": "→", + "": "", + "": "", + "": "", + "": "", + # TeX Computer Modern bracket/delimiter pieces (U+F8EB–F8FE) → stringa vuota + "": "", # TeX large paren left + "": "", # TeX large paren extension + "": "", # TeX large paren right + "": "", # TeX large paren right ext + "": "", # TeX large bracket left + "": "", # TeX large bracket ext + "": "", # TeX brace top-left + "": "", # TeX brace mid + "": "", # TeX brace mid-right + "": "", # TeX brace extension + "": "", # TeX brace right + "": "", # TeX bracket right large + "": "", # TeX bracket right ext + "": "", # TeX bracket right close + "": "", # TeX integral large + "": "", # TeX integral extension + "": "", # TeX integral top + "": "", # TeX radical top + "": "", # TeX radical extension + "": "", # TeX arrowhead +} + +_SYMBOL_PUA_RE = re.compile( + "[" + "".join(re.escape(k) for k in _SYMBOL_PUA_MAP) + "]" +) + +# ─── Regex compilate condivise ──────────────────────────────────────────────── + +_SUPERSCRIPT_RE = re.compile(r'[¹²³⁰⁴-⁹]+') +_FOOTNOTE_BODY_RE = re.compile( + r'^([¹²³⁰⁴-⁹]+\s+|\[\d{1,3}\]\s+)' +) +_NUMBERED_HDR_RE = re.compile( + r"^(#{1,6})\s+(\d+(?:\.\d+)*)\.\s+(.+)$", + re.MULTILINE, +) +_BIB_MARKERS_RE = re.compile( + r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b' + r'|\b(19|20)\d{2}\b', + re.IGNORECASE, +) 
+_WATERMARK_RE = re.compile( + r"^(BOZZA|DRAFT|CONFIDENTIAL|RISERVATO|PROVVISORIO|SAMPLE|SPECIMEN" + r"|DO NOT DISTRIBUTE|NON DISTRIBUIRE|COPY|COPIA)\s*$", + re.IGNORECASE | re.MULTILINE, +) +_TABSEP_RE = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$") +_FM_RE = re.compile( + r"https?://|www\.|@[A-Za-z]|\bUniversit[àa]\b|\bDipartimento\b|" + r"\bCopyright\b|\bLicenza\b|\bEdizione\b|" + r"protetto da|tutti i diritti", + re.IGNORECASE, +) +_VERSE_NUM_RE = re.compile( + r"([.!?\xbb'\"" + "’" + r"]\s+)(\d+)(\s+)(?=[A-Z\xc0-\xd9a-z\xe0-\xf9\xab“”‟])" +) +# Math header demotion +_MATH_SYMBOLS_RE = re.compile( + r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]" +) +_EXERCISE_TRIGGER_RE = re.compile( + r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that" + r"|Compute|Calculate|Dimostrare|Verificare)\b", + re.IGNORECASE, +) +_MATH_HDR_RE = re.compile(r"^(#{2,3})\s+(.+)$") +_NUMBERED_PREFIX_RE = re.compile(r"^(\d+(?:\.\d+)*[.)])\s+(.+)$", re.DOTALL) +# Orphan TOC: voce di indice senza dot-leader (es. "3. 
Funzioni 174") +_TOC_ITEM_RE = re.compile( + r"^\d+(\.\d+)*\.?\s+[A-Za-zÀ-ú\'\(][^\n]{2,70}$" +) +_TOC_HDR_WITH_PAGE_RE = re.compile( + r"^#{1,3}\s+\d+\.?\s+.{3,60}\s+\d{1,4}$" +) diff --git a/conversione/_pipeline/transforms/_encoding.py b/conversione/_pipeline/transforms/_encoding.py new file mode 100644 index 0000000..2ce4ec3 --- /dev/null +++ b/conversione/_pipeline/transforms/_encoding.py @@ -0,0 +1,45 @@ +"""Trasformazioni di encoding: PUA font Symbol, accenti LaTeX, simboli SI.""" +import re + +from ._constants import _SYMBOL_PUA_MAP, _SYMBOL_PUA_RE + + +def _t_fix_symbol_font(text: str) -> tuple[str, int]: + count = [0] + + def _repl(m: re.Match) -> str: + count[0] += 1 + return _SYMBOL_PUA_MAP[m.group(0)] + + result = _SYMBOL_PUA_RE.sub(_repl, text) + return result, count[0] + + +def _t_fix_accents(text: str) -> tuple[str, int]: + _ACCENT_MAP = { + "e": "\xe8", "E": "\xc8", "a": "\xe0", "A": "\xc0", + "u": "\xf9", "U": "\xd9", "i": "\xec", "I": "\xcc", + "o": "\xf2", "O": "\xd2", + } + n_bt_before = text.count("`") + text = re.sub(r"`([eEaAuUiIoO])", lambda m: _ACCENT_MAP[m.group(1)], text) + text = re.sub(r"([eEaAuUiIoO])`", lambda m: _ACCENT_MAP[m.group(1)], text) + n_accenti = n_bt_before - text.count("`") + n_bt_orfani = text.count("`") + if n_bt_orfani: + text = re.sub(r"`", "", text) + n_accenti += n_bt_orfani + return text, n_accenti + + +def _t_fix_multiplication(text: str) -> tuple[str, int]: + n = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text)) + text = re.sub(r'(?<=[0-9])"(?=[0-9(])', '×', text) + return text, n + + +def _t_fix_micro(text: str) -> tuple[str, int]: + _SI_UNITS_RE = r'[mAsgVWFHTKNJClΩ]' + n = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text)) + text = re.sub(rf'(\d)\s*!({_SI_UNITS_RE})', r'\1 µ\2', text) + return text, n diff --git a/conversione/_pipeline/transforms/_finish.py b/conversione/_pipeline/transforms/_finish.py new file mode 100644 index 0000000..a5f8a8e --- /dev/null +++ b/conversione/_pipeline/transforms/_finish.py 
@@ -0,0 +1,116 @@ +"""Trasformazioni di rifinitura: header vuoti, garbage, demozione formula-header, frontmatter.""" +import re + +from ._constants import ( + _FM_RE, _MATH_HDR_RE, _MATH_SYMBOLS_RE, + _EXERCISE_TRIGGER_RE, _NUMBERED_PREFIX_RE, +) +from ._helpers import _merge_title_headers + + +def _t_remove_empty_headers(text: str) -> tuple[str, int]: + blocks = re.split(r"\n{2,}", text) + cleaned = [] + for i, block in enumerate(blocks): + stripped = block.strip() + if re.match(r"^#{1,6} ", stripped) and "\n" not in stripped: + next_stripped = blocks[i + 1].strip() if i + 1 < len(blocks) else "" + next_is_long_hdr = ( + re.match(r"^#{1,6} ", next_stripped) and len(next_stripped) > 80 + ) + if not next_stripped or ( + re.match(r"^#{1,6} ", next_stripped) and not next_is_long_hdr + ): + continue + cleaned.append(block) + return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), 0 + + +def _t_merge_title_headers(text: str) -> tuple[str, int]: + return _merge_title_headers(text) + + +def _t_remove_garbage_headers(text: str) -> tuple[str, int]: + def _is_garbage(content: str) -> bool: + if content.lstrip().startswith("..."): + return True + if not re.search(r"[A-Za-z\xc0-\xffΑ-ω]{2,}", content): + return True + if re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()): + return True + if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content): + return True + first_alpha = next((c for c in content if c.isalpha()), None) + if first_alpha and first_alpha.islower() and len(content) > 40: + return True + if re.match(r"^[A-Za-zΑ-ω_]{1,3}\s*[=<>≤≥]", content.strip()): + return True + if re.match( + r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d", + content.strip(), re.IGNORECASE, + ): + return True + return False + + count = 0 + lines = text.split("\n") + new_lines = [] + for line in lines: + m = re.match(r"^#{1,6} (.+)$", line) + if m and _is_garbage(m.group(1)): + count += 1 + continue + new_lines.append(line) + text = "\n".join(new_lines) + text = 
re.sub(r"\n{3,}", "\n\n", text) + return text, count + + +def _t_math_header_demotion(text: str) -> tuple[str, int]: + lines = text.split("\n") + result, count = [], 0 + for line in lines: + m = _MATH_HDR_RE.match(line) + if not m: + result.append(line) + continue + body = m.group(2) + if len(body) <= 100: + result.append(line) + continue + has_math = len(_MATH_SYMBOLS_RE.findall(body)) >= 3 + has_exercise = bool(_EXERCISE_TRIGGER_RE.search(body)) + if not (has_math or has_exercise): + result.append(line) + continue + nm = _NUMBERED_PREFIX_RE.match(body) + if nm: + result.append(f"**{nm.group(1)}** {nm.group(2)}") + else: + result.append(body) + count += 1 + return "\n".join(result), count + + +def _t_remove_frontmatter(text: str) -> tuple[str, int]: + blocks = re.split(r"\n{2,}", text) + cleaned = [] + count = 0 + total = len(blocks) + cutoff = max(5, min(15, int(total * 0.20))) + for i, block in enumerate(blocks): + stripped = block.strip() + if i >= cutoff: + cleaned.append(block) + continue + if not re.match(r"^### ", stripped) or re.match(r"^### \d", stripped): + cleaned.append(block) + continue + body = blocks[i + 1].strip() if i + 1 < len(blocks) else "" + is_fm_body = len(body) < 250 and _FM_RE.search(body) + is_fm_hdr = _FM_RE.search(stripped) + if is_fm_body or is_fm_hdr: + count += 1 + continue + cleaned.append(block) + return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), count diff --git a/conversione/_pipeline/transforms/_headers.py b/conversione/_pipeline/transforms/_headers.py new file mode 100644 index 0000000..5e34247 --- /dev/null +++ b/conversione/_pipeline/transforms/_headers.py @@ -0,0 +1,110 @@ +"""Trasformazioni sulla struttura degli header: normalizzazione livelli, concat, bold.""" +import re + +from ._constants import _NUMBERED_HDR_RE +from ._helpers import _sentence_case + + +def _t_fix_header_concat(text: str) -> tuple[str, int]: + count = 0 + + def _fix(m: re.Match) -> str: + nonlocal count + hashes = m.group(1) + full = 
m.group(2).strip() + if len(full) < 60: + return m.group(0) + skip = min(10, len(full) // 3) + split = re.search( + r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])" + r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", + full[skip:], + ) + if split: + pos = skip + split.start() + title = full[:pos].strip() + body = full[pos:].strip() + if len(title) >= 5 and len(body) >= 15: + count += 1 + return f"{hashes} {title}\n\n{body}" + return m.group(0) + + text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix, text, flags=re.MULTILINE) + return text, count + + +def _t_extract_capitolo(text: str) -> tuple[str, int]: + def _repl(m: re.Match) -> str: + num = m.group(1) + titolo = _sentence_case(m.group(2).strip().rstrip("- ").strip()) + return f"\n\n## Capitolo {num}: {titolo}\n\n" + + text = re.sub( + r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]" + r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)" + r"(?=\s*[-–]\s*\d|\s*\n|\s*$)", + _repl, + text, + ) + return text, 0 + + +def _t_normalize_numbered_headings(text: str) -> tuple[str, int]: + all_matches = list(_NUMBERED_HDR_RE.finditer(text)) + if not all_matches: + return text, 0 + + pairs = [(m.group(2).count(".") + 1, len(m.group(1))) for m in all_matches] + depths = [d for d, _ in pairs] + min_depth = min(depths) + max_depth = max(depths) + if max_depth == min_depth: + return text, 0 + + base_level = min(lv for d, lv in pairs if d == min_depth) + count = 0 + + def _repl(m: re.Match) -> str: + nonlocal count + hashes, num, title = m.group(1), m.group(2), m.group(3) + depth = num.count(".") + 1 + new_level = min(base_level + (depth - min_depth), 6) + if new_level == len(hashes): + return m.group(0) + count += 1 + return f"{'#' * new_level} {num}. 
{title}" + + return _NUMBERED_HDR_RE.sub(_repl, text), count + + +def _t_normalize_header_levels(text: str) -> tuple[str, int]: + text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE) + text = re.sub( + r"^(#{3,6})\s+(\d{1,3})\s+(.+)$", + lambda m: f"### {m.group(2)}. {m.group(3)}", + text, + flags=re.MULTILINE, + ) + text = re.sub(r"^#{4,6}\s+(.+)$", r"### \1", text, flags=re.MULTILINE) + return text, 0 + + +def _t_remove_header_bold(text: str) -> tuple[str, int]: + text = re.sub( + r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$", + r"\1 \2", + text, flags=re.MULTILINE, + ) + return text, 0 + + +def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]: + def _norm(m: re.Match) -> str: + hashes, content = m.group(1), m.group(2).strip() + letters = [c for c in content if c.isalpha()] + if letters and all(c.isupper() for c in letters): + return f"{hashes} {_sentence_case(content)}" + return m.group(0) + + text = re.sub(r"^(#{1,6}) (.+)$", _norm, text, flags=re.MULTILINE) + return text, 0 diff --git a/conversione/_pipeline/transforms/_helpers.py b/conversione/_pipeline/transforms/_helpers.py new file mode 100644 index 0000000..e91ad1b --- /dev/null +++ b/conversione/_pipeline/transforms/_helpers.py @@ -0,0 +1,153 @@ +"""Funzioni helper pure condivise tra i moduli di trasformazione.""" +import re + +from ._constants import _ORDINALS_IT, _ORDINALS_EN + + +def _sentence_case(s: str) -> str: + if not s: + return s + lower = s.lower() + return lower[0].upper() + lower[1:] + + +def _is_allcaps_line(line: str) -> bool: + stripped = line.strip() + letters = [c for c in stripped if c.isalpha()] + return ( + len(letters) >= 3 + and all(c.isupper() for c in letters) + and not stripped.startswith("#") + and not stripped.startswith("|") + ) + + +def _allcaps_to_header(raw_line: str) -> str: + text = re.sub(r"^[-*+]\s+", "", raw_line.strip()) + text = text.rstrip(".").rstrip("?").strip() + + _ORD_IT_PAT = "|".join(_ORDINALS_IT.keys()) + m = re.match(rf"^CAPITOLO ({_ORD_IT_PAT})\. 
(.+)", text) + if m: + roman = _ORDINALS_IT[m.group(1)] + titolo = m.group(2).rstrip(".").rstrip("?").strip() + return f"## Capitolo {roman} — {_sentence_case(titolo)}" + + _ORD_EN_PAT = "|".join(_ORDINALS_EN.keys()) + m = re.match(rf"^CHAPTER ({_ORD_EN_PAT}|\d+)\.? (.+)", text) + if m: + n = _ORDINALS_EN.get(m.group(1), m.group(1)) + titolo = m.group(2).rstrip(".").rstrip("?").strip() + return f"## Chapter {n} — {_sentence_case(titolo)}" + + m = re.match(r"^([IVXLCDM]+|[0-9]+)\. (.+)", text) + if m: + return f"## {m.group(1)}. {_sentence_case(m.group(2).rstrip('.').strip())}" + + return f"## {_sentence_case(text)}" + + +def _extract_math_environments(text: str) -> tuple[str, int]: + _ENVS = ( + r"Definizione|Definition|Teorema|Theorem|Lemma|" + r"Proposizione|Proposition|Corollario|Corollary|" + r"Osservazione|Remark|Nota|Note|Esempio|Example" + ) + count = 0 + blocks = text.split("\n\n") + result = [] + + for block in blocks: + stripped = block.strip() + if not stripped or stripped.startswith("#"): + result.append(block) + continue + + m = re.match( + rf"^({_ENVS})\s+((?:\d+\.?){{1,4}})\s*(.*)", + stripped, + re.DOTALL, + ) + if not m: + result.append(block) + continue + + env = m.group(1) + num = m.group(2).rstrip(".") + rest = m.group(3).strip() + + title_m = re.match(r"^(\([^)]{2,60}\))\s+(.*)", rest, re.DOTALL) + if title_m: + header = f"### {env} {num} {title_m.group(1)}" + body = title_m.group(2).strip() + else: + header = f"### {env} {num}." 
+ body = rest + + result.append(f"{header}\n\n{body}" if body else header) + count += 1 + + return "\n\n".join(result), count + + +def _merge_title_headers(text: str) -> tuple[str, int]: + count = 0 + blocks = re.split(r"\n{2,}", text) + result = [] + i = 0 + while i < len(blocks): + block = blocks[i] + stripped = block.strip() + if ( + re.match(r"^#{2,3} \d+\.\s*$", stripped) + and i + 1 < len(blocks) + ): + nxt = blocks[i + 1].strip() + if ( + nxt + and "\n" not in nxt + and len(nxt) <= 80 + and not nxt.startswith("#") + and not re.match(r"^\d+[\.\)]\s", nxt) + ): + result.append(stripped.rstrip() + " " + nxt) + count += 1 + i += 2 + continue + result.append(block) + i += 1 + return re.sub(r"\n{3,}", "\n\n", "\n\n".join(result)), count + + +def _extract_article_headers(text: str) -> tuple[str, int]: + count = 0 + + def _repl(m: re.Match) -> str: + nonlocal count + num = m.group(1) + rest = m.group(2).strip() + + title_m = re.match( + r"^([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda].{1,74}?)\.\s+" + r"([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\(\d].{4,})", + rest, + ) + if title_m: + count += 1 + return ( + f"### Art. {num}. {title_m.group(1)}.\n\n" + f"{title_m.group(2).strip()}" + ) + if rest: + count += 1 + return f"### Art. {num}.\n\n{rest}" + count += 1 + return f"### Art. {num}." 
+ + text = re.sub( + r"^-\s+Art\.\s+([\d]+[a-z\-]*)\.\s*(.*)", + _repl, + text, + flags=re.MULTILINE, + ) + return text, count diff --git a/conversione/_pipeline/transforms/_structure.py b/conversione/_pipeline/transforms/_structure.py new file mode 100644 index 0000000..853c8bb --- /dev/null +++ b/conversione/_pipeline/transforms/_structure.py @@ -0,0 +1,184 @@ +"""Costruzione struttura: TOC, ALLCAPS→##, sezioni numerate, ambienti matematici, articoli.""" +import re + +from ._constants import ( + _TOC_KEYWORDS, _BIB_MARKERS_RE, + _TOC_ITEM_RE, _TOC_HDR_WITH_PAGE_RE, +) +from ._helpers import ( + _is_allcaps_line, _allcaps_to_header, + _extract_math_environments, _extract_article_headers, +) + + +def _t_remove_toc(text: str) -> tuple[str, int]: + lines = text.split("\n") + new_lines = [] + _in_toc = False + removed = False + for line in lines: + bare = re.sub(r"^#+\s*", "", line.strip()) + first_word = bare.split(".")[0].strip().lower() + if first_word in _TOC_KEYWORDS: + removed = True + _in_toc = True + continue + if _in_toc: + if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line): + continue + if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line): + continue + if len(line.strip()) > 200: + _in_toc = False + new_lines.append(line) + continue + _in_toc = False + new_lines.append(line) + return "\n".join(new_lines), 1 if removed else 0 + + +def _t_remove_orphan_toc(text: str) -> tuple[str, int]: + """ + Rimuove voci di sommario senza dot-leader che sfuggono a _t_remove_toc. + Rileva: (a) blocchi di 3+ righe consecutive che matchano il pattern TOC + nei primi 25% del documento; (b) header ### N. Titolo PAGINA il cui corpo + è una lista di voci numerate. 
+ """ + blocks = re.split(r"\n{2,}", text) + total = len(blocks) + cutoff = max(10, min(40, int(total * 0.25))) + to_drop = set() + + i = 0 + while i < cutoff and i < total: + b = blocks[i].strip() + + # (a) Sequenza di 3+ blocchi TOC consecutivi + if _TOC_ITEM_RE.match(b): + j = i + while j < min(cutoff, i + 60) and j < len(blocks) and _TOC_ITEM_RE.match(blocks[j].strip()): + j += 1 + if j - i >= 3: + for k in range(i, j): + to_drop.add(k) + # Rimuovi anche l'header ### precedente se ha numero di pagina + if i > 0 and _TOC_HDR_WITH_PAGE_RE.match(blocks[i - 1].strip()): + to_drop.add(i - 1) + i = j + continue + + # (b) Header ### N. Titolo PAGINA con corpo che è lista di voci numerate + if _TOC_HDR_WITH_PAGE_RE.match(b): + body = blocks[i + 1].strip() if i + 1 < len(blocks) else "" + # Il corpo contiene 2+ occorrenze di "N. Titolo" + toc_hits = re.findall(r"\d+\.?\s+[A-Za-zÀ-ú]", body) + if len(toc_hits) >= 2 and len(body) < 300: + to_drop.add(i) + if i + 1 < total: + to_drop.add(i + 1) + i += 2 + continue + + i += 1 + + if not to_drop: + return text, 0 + + kept = [b for idx, b in enumerate(blocks) if idx not in to_drop] + return re.sub(r"\n{3,}", "\n\n", "\n\n".join(kept)), len(to_drop) + + +def _t_allcaps_to_headers(text: str) -> tuple[str, int]: + count = 0 + blocks = text.split("\n\n") + new_blocks = [] + for block in blocks: + stripped = block.strip() + if "\n" not in stripped and _is_allcaps_line(stripped): + new_blocks.append(_allcaps_to_header(stripped)) + count += 1 + else: + sub_lines = block.split("\n") + converted = [] + for ln in sub_lines: + if _is_allcaps_line(ln) and len(ln.strip()) > 3: + converted.append(_allcaps_to_header(ln)) + count += 1 + else: + converted.append(ln) + new_blocks.append("\n".join(converted)) + return "\n\n".join(new_blocks), count + + +def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]: + count = 0 + + def _num_repl(m: re.Match) -> str: + nonlocal count + content = m.group(2).strip() + if 
content.endswith(".") and len(content) > 40: + return m.group(0) + if _BIB_MARKERS_RE.search(content): + return m.group(0) + count += 1 + return f"### {m.group(1)}.\n\n{content}" + + text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE) + + def _num_letter_repl(m: re.Match) -> str: + nonlocal count + count += 1 + return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}" + + text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE) + + if not has_exercises: + def _aphorism_repl(m: re.Match) -> str: + nonlocal count + content = m.group(2).strip() + if _BIB_MARKERS_RE.search(content): + return m.group(0) + count += 1 + return f"\n\n### {m.group(1)}.\n\n{content}" + + text = re.sub( + r"^-\s+(\d{1,3})\.\s+(.{10,})$", + _aphorism_repl, + text, + flags=re.MULTILINE, + ) + + def _list_section_repl(m: re.Match) -> str: + nonlocal count + num = m.group(1) + content = m.group(2).strip() + if _BIB_MARKERS_RE.search(content): + return m.group(0) + count += 1 + split = re.search( + r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+" + r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", + content, + ) + if split and split.start() >= 3: + title = content[: split.start()].strip() + body = content[split.end():].strip() + if len(body) >= 20: + return f"\n\n### {num}. {title}\n\n{body}" + return f"\n\n### {num}. 
{content}" + + text = re.sub( + r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$", + _list_section_repl, + text, + flags=re.MULTILINE, + ) + return text, count + + +def _t_extract_math(text: str) -> tuple[str, int]: + return _extract_math_environments(text) + + +def _t_extract_articles(text: str) -> tuple[str, int]: + return _extract_article_headers(text) diff --git a/conversione/_pipeline/transforms/_text.py b/conversione/_pipeline/transforms/_text.py new file mode 100644 index 0000000..dfa6105 --- /dev/null +++ b/conversione/_pipeline/transforms/_text.py @@ -0,0 +1,109 @@ +"""Trasformazioni sul testo: merge paragrafi, whitespace, poesia, versi.""" +import re + +from ._constants import _VERSE_NUM_RE + + +def _t_merge_paragraphs(text: str) -> tuple[str, int]: + _SENTENCE_END = set(".?!\xbb)\"'") + blocks = text.split("\n\n") + merged = [] + count = 0 + i = 0 + while i < len(blocks): + b = blocks[i] + stripped = b.strip() + while ( + i + 1 < len(blocks) + and stripped + and not stripped.startswith("#") + and not stripped.startswith("|") + and stripped[-1] not in _SENTENCE_END + ): + nxt = blocks[i + 1].strip() + if ( + not nxt + or nxt.startswith("#") + or nxt.startswith("|") + or re.match(r"^\d+\.", nxt) + or re.match(r"^[-*+]\s", nxt) + ): + break + b = stripped + " " + nxt + stripped = b.strip() + count += 1 + i += 1 + merged.append(b) + i += 1 + text = "\n\n".join(merged) + text = re.sub(r"(?m)^\|---\|\s*", "", text) + return text, count + + +def _t_normalize_whitespace(text: str) -> tuple[str, int]: + lines = text.split("\n") + text = "\n".join( + re.sub(r" +", " ", line) if line.strip() else line + for line in lines + ) + return text, 0 + + +def _t_collapse_blank_lines(text: str) -> tuple[str, int]: + return re.sub(r"\n{3,}", "\n\n", text), 0 + + +def _t_restore_poetry_lines(text: str) -> tuple[str, int]: + count = 0 + blocks = text.split("\n\n") + result = [] + + for block in blocks: + stripped = block.strip() + if not stripped or 
stripped.startswith("#"): + result.append(block) + continue + + matches = list(_VERSE_NUM_RE.finditer(stripped)) + if len(matches) < 2: + result.append(block) + continue + + nums = [int(m.group(2)) for m in matches] + diffs = [nums[i + 1] - nums[i] for i in range(len(nums) - 1)] + if not diffs or len(set(diffs)) > 2 or not (1 <= diffs[0] <= 5): + result.append(block) + continue + + step = diffs[0] + + def _replace_verse_num(m: re.Match) -> str: + n = int(m.group(2)) + sep = "\n\n" if n % (step * 3) == 0 else "\n" + return m.group(1).rstrip() + sep + + new_block = _VERSE_NUM_RE.sub(_replace_verse_num, stripped) + if new_block != stripped: + count += len(matches) + result.append(new_block) + + return "\n\n".join(result), count + + +def _t_demote_verse_headers(text: str) -> tuple[str, int]: + count = 0 + + def _demote(m: re.Match) -> str: + nonlocal count + hashes, content = m.group(1), m.group(2).strip() + if not re.search(r"\s\d{1,4}\s*$", content): + return m.group(0) + inner = re.sub(r"\s\d{1,4}\s*$", "", content) + if not re.search(r'[,;:.!?\xbb"\'][\ ]+[A-Za-z\xc0-\xff\xab""]', inner): + return m.group(0) + count += 1 + clean = re.sub(r"\s\d{1,4}\s*$", "", content) + return clean + + text = re.sub(r"^(#{1,6})\s+(.{20,})$", _demote, text, flags=re.MULTILINE) + return text, count diff --git a/docs/superpowers/plans/2026-04-30-pipeline-ottimizzazione.md b/docs/superpowers/plans/2026-04-30-pipeline-ottimizzazione.md new file mode 100644 index 0000000..91694f9 --- /dev/null +++ b/docs/superpowers/plans/2026-04-30-pipeline-ottimizzazione.md @@ -0,0 +1,560 @@ +# Pipeline ottimizzazione PDF→Markdown — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Eliminare la necessità di revisione manuale del `clean.md` ottimizzando i parametri di opendataloader-pdf e aggiungendo trasformazioni mirate per tutti i tipi di PDF. + +**Architecture:** Quattro file modificati: `converter.py` (parametri adattivi + rilevamento PDF taggato), `transforms.py` (PUA bracket TeX + demozione header-formula), `report.py` (nuova metrica residua), `validator.py` (nuova penalità). Nessun cambio all'API pubblica di `_pipeline`. + +**Tech Stack:** Python 3.12, opendataloader-pdf (Java), PyMuPDF (fitz), regex + +--- + +## File modificati + +| File | Tipo | Responsabilità | +|------|------|----------------| +| `conversione/_pipeline/converter.py` | Modify | `_is_tagged_pdf()` + nuovi parametri convert | +| `conversione/_pipeline/transforms.py` | Modify | PUA bracket TeX + `_t_math_header_demotion` | +| `conversione/_pipeline/report.py` | Modify | `formula_headers_residui` nella sezione residui | +| `conversione/_pipeline/validator.py` | Modify | Penalità formula headers | + +--- + +## Task 1: Converter adattivo — `_is_tagged_pdf()` + nuovi parametri + +**Files:** +- Modify: `conversione/_pipeline/converter.py` + +- [ ] **Step 1: Leggi il file attuale** + +```bash +cat conversione/_pipeline/converter.py +``` + +- [ ] **Step 2: Sostituisci interamente il contenuto** + +Il nuovo `converter.py` aggiunge `_is_tagged_pdf()` (usa fitz per controllare `StructTreeRoot` nel catalog del PDF) e passa i nuovi parametri a `opendataloader_pdf.convert()`: +- `table_method="cluster"` — sempre attivo, migliora tabelle senza bordi +- `content_safety_off=["tiny", "hidden-ocg"]` — evita filtraggio di footnote e layer OCG +- `use_struct_tree=tagged` — attivo solo se PDF è taggato + +```python +from pathlib import Path + + +def _is_tagged_pdf(pdf_path: Path) -> bool: + try: + import fitz + doc = fitz.open(str(pdf_path)) + tagged = "StructTreeRoot" in doc.pdf_catalog() + doc.close() + return tagged + except Exception: + return False + + +def 
convert_pdf(pdf_path: Path, out_dir: Path) -> Path: + """ + Converte il PDF in Markdown tramite opendataloader-pdf. + Scrive il file nella out_dir e restituisce il percorso. + + Parametri scelti per output RAG-ottimale: + - keep_line_breaks=False → testo fluente, no hard-wrap PDF + - reading_order="xycut" → corregge ordine multi-colonna (XY-Cut++) + - sanitize=False → preserva il testo originale + - image_output="off" → nessuna immagine estratta né referenziata + - table_method="cluster" → rileva tabelle senza bordi visibili + - content_safety_off → evita filtraggio di footnote e layer OCG + - use_struct_tree → attivo se PDF è taggato (Word/InDesign) + """ + import opendataloader_pdf + + out_dir.mkdir(parents=True, exist_ok=True) + tagged = _is_tagged_pdf(pdf_path) + + opendataloader_pdf.convert( + input_path=str(pdf_path), + output_dir=str(out_dir), + format="markdown", + keep_line_breaks=False, + reading_order="xycut", + sanitize=False, + image_output="off", + table_method="cluster", + content_safety_off=["tiny", "hidden-ocg"], + use_struct_tree=tagged, + quiet=True, + ) + + md_file = out_dir / f"{pdf_path.stem}.md" + if not md_file.exists(): + candidates = list(out_dir.glob("*.md")) + if not candidates: + raise RuntimeError(f"Nessun file .md prodotto in {out_dir}") + md_file = candidates[0] + + content = md_file.read_text(encoding="utf-8", errors="replace").strip() + if len(content) < 100: + raise RuntimeError( + f"opendataloader ha prodotto un file .md quasi vuoto ({len(content)} char) " + f"— il PDF potrebbe essere corrotto o non supportato" + ) + + return md_file +``` + +- [ ] **Step 3: Verifica sintattica** + +```bash +.venv/bin/python -c "from conversione._pipeline.converter import convert_pdf, _is_tagged_pdf; print('OK')" +``` + +Atteso: `OK` + +- [ ] **Step 4: Commit** + +```bash +git add conversione/_pipeline/converter.py +git commit -m "feat(converter): parametri adattivi — use_struct_tree, cluster tables, content-safety" +``` + +--- + +## Task 2: 
Aggiunta PUA bracket TeX (U+F8EB–U+F8FE) + +**Files:** +- Modify: `conversione/_pipeline/transforms.py` (sezione `_SYMBOL_PUA_MAP`, righe ~28–127) + +Questi codepoint sono pezzi di parentesi/bracket grandi del font Computer Modern (TeX), non ricostruibili come singolo simbolo → mappati a `""`. + +- [ ] **Step 1: Aggiungi le entries mancanti alla fine di `_SYMBOL_PUA_MAP`** + +Individua la riga `"": "", # bracket extension piece (non ricostruibile)` (circa riga 122) e aggiungi **dopo** l'ultima entry esistente della mappa (prima della `}`): + +```python + "": "", # TeX large paren left + "": "", # TeX large paren extension + "": "", # TeX large paren right + "": "", # TeX large paren right extension + "": "", # TeX large bracket left + "": "", # TeX large bracket extension + "": "", # TeX brace top-left + "": "", # TeX brace mid + "": "", # TeX brace mid-right + "": "", # TeX brace extension + "": "", # TeX brace right + "": "", # TeX bracket right large + "": "", # TeX bracket right extension + "": "", # TeX bracket right close + "": "", # TeX integral large + "": "", # TeX integral extension + "": "", # TeX integral top + "": "", # TeX radical top + "": "", # TeX radical extension + "": "", # TeX arrowhead +``` + +- [ ] **Step 2: Verifica che _SYMBOL_PUA_RE si aggiorni automaticamente** + +```bash +.venv/bin/python -c " +from conversione._pipeline.transforms import _SYMBOL_PUA_MAP, _SYMBOL_PUA_RE +pua_chars = ['', '', '', ''] +for c in pua_chars: + assert c in _SYMBOL_PUA_MAP, f'Manca {repr(c)}' + assert _SYMBOL_PUA_RE.search(c), f'Regex non cattura {repr(c)}' +print(f'OK — {len(_SYMBOL_PUA_MAP)} PUA chars mappati') +" +``` + +Atteso: `OK — N PUA chars mappati` (N > 90) + +- [ ] **Step 3: Verifica sostituzione su testo di esempio** + +```bash +.venv/bin/python -c " +from conversione._pipeline.transforms import apply_transforms +testo = 'Sia x = f(n) e n la parentesi grande.' 
+pulito, stats = apply_transforms(testo) +assert '' not in pulito +assert '' not in pulito +print('Testo pulito:', repr(pulito)) +print('PUA corretti:', stats['n_simboli_pua_corretti']) +" +``` + +Atteso: nessun PUA nel testo pulito, `n_simboli_pua_corretti` > 0. + +- [ ] **Step 4: Commit** + +```bash +git add conversione/_pipeline/transforms.py +git commit -m "feat(transforms): aggiungi PUA bracket TeX U+F8EB-F8FE alla mappa simboli" +``` + +--- + +## Task 3: Nuova trasformazione `_t_math_header_demotion` + +**Files:** +- Modify: `conversione/_pipeline/transforms.py` + +Demota a testo semplice gli header `##`/`###` che sono enunciati di esercizi o formule lunghe (non titoli di sezione reali). + +**Criteri di demozione** (almeno uno tra math e exercise deve valere): +- Livello `##` o `###` +- Lunghezza testo (senza `#`) > 100 caratteri +- `math`: ≥ 3 simboli matematici nell'header (da set: `=`, `+`, `∈`, `∀`, `∃`, `≤`, `≥`, `∞`, `∑`, `∫`, `∂`, `→`, `↔`, `⊂`, `⊃`, `∩`, `∪`, lettere greche Unicode U+03B1–U+03C9 e U+0391–U+03A9) +- `exercise`: matcha pattern traccia (`\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that|Compute|Calculate|Dimostrare|Verificare)\b`) + +**Output**: rimuove `#+ `. Se la riga inizia con `N. ` (numero + punto), converte in `**N.** resto`. Altrimenti testo plain. 
+ +- [ ] **Step 1: Aggiungi costante regex a livello di modulo** (dopo le costanti esistenti, dopo `_SYMBOL_PUA_MAP`) + +Trova la riga `_VERSE_NUM_RE = re.compile(` (circa riga 160) e aggiungi **dopo**: + +```python +_MATH_SYMBOLS_RE = re.compile( + r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]" +) +_EXERCISE_TRIGGER_RE = re.compile( + r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that" + r"|Compute|Calculate|Dimostrare|Verificare)\b", + re.IGNORECASE, +) +_MATH_HDR_RE = re.compile(r"^(#{2,3})\s+(.+)$") +_NUMBERED_PREFIX_RE = re.compile(r"^(\d+(?:\.\d+)*[.)])\s+(.+)$", re.DOTALL) +``` + +- [ ] **Step 2: Aggiungi la funzione `_t_math_header_demotion`** (prima dell'orchestratore `apply_transforms`) + +Trova la riga `# ─── Orchestratore` e aggiungi **prima**: + +```python +def _t_math_header_demotion(text: str) -> tuple[str, int]: + lines = text.split("\n") + result, count = [], 0 + for line in lines: + m = _MATH_HDR_RE.match(line) + if not m: + result.append(line) + continue + body = m.group(2) + if len(body) <= 100: + result.append(line) + continue + has_math = len(_MATH_SYMBOLS_RE.findall(body)) >= 3 + has_exercise = bool(_EXERCISE_TRIGGER_RE.search(body)) + if not (has_math or has_exercise): + result.append(line) + continue + nm = _NUMBERED_PREFIX_RE.match(body) + if nm: + result.append(f"**{nm.group(1)}** {nm.group(2)}") + else: + result.append(body) + count += 1 + return "\n".join(result), count +``` + +- [ ] **Step 3: Registra la trasformazione in `_transforms`** + +Nell'orchestratore `apply_transforms`, trova la riga: + +```python + ("n_garbage_headers_rimossi", _t_remove_garbage_headers), +``` + +e aggiungi **dopo**: + +```python + ("n_formula_headers_demotati", _t_math_header_demotion), +``` + +- [ ] **Step 4: Aggiungi la stat key al print in `runner.py`** + +Trova in `conversione/_pipeline/runner.py` il blocco di print delle statistiche (dopo `apply_transforms`) e aggiungi: + +```python + print(f"
Formula-hdr demotati: {t['n_formula_headers_demotati']}") +``` + +- [ ] **Step 5: Verifica su caso sintetico** + +```bash +.venv/bin/python -c " +from conversione._pipeline.transforms import apply_transforms + +# Caso 1: header esercizio lungo → deve essere demotato +testo = '### 3. Si dimostri la formula per le equazioni di secondo grado ax^2 + bx + c = 0 e si analizzi il segno del discriminante b^2 - 4ac per tutti i valori reali.' +pulito, stats = apply_transforms(testo) +assert '###' not in pulito, f'Header non demotato: {pulito!r}' +print('Caso 1 OK:', pulito[:80]) + +# Caso 2: header titolo corto → NON deve essere demotato +testo2 = '### Teorema di Cauchy' +pulito2, _ = apply_transforms(testo2) +assert '###' in pulito2, f'Header legittimo demotato: {pulito2!r}' +print('Caso 2 OK:', pulito2) + +# Caso 3: header con molti simboli math + lungo → demotato +testo3 = '### Sia f: R→R tale che ∀x∈R si abbia f(x) = ∑_{n=0}^{∞} aₙxⁿ con ∫f dx = g(x) + C per ogni x∈[a,b].' +pulito3, stats3 = apply_transforms(testo3) +print('Caso 3:', '###' not in pulito3, stats3.get('n_formula_headers_demotati')) + +print('Stats:', stats.get('n_formula_headers_demotati')) +" +``` + +Atteso: Caso 1 e 3 demotati, Caso 2 intatto. 
+ +- [ ] **Step 6: Commit** + +```bash +git add conversione/_pipeline/transforms.py conversione/_pipeline/runner.py +git commit -m "feat(transforms): aggiungi _t_math_header_demotion per header esercizi e formule" +``` + +--- + +## Task 4: `report.py` — metrica `formula_headers_residui` + +**Files:** +- Modify: `conversione/_pipeline/report.py` + +- [ ] **Step 1: Aggiungi funzione di scan formula-header e integrala nel report** + +Nella funzione `build_report()`, dopo la definizione di `_scan()` (circa riga 53), aggiungi: + +```python + def _scan_formula_headers(max_n: int = 10) -> list[dict]: + _math_sym = re.compile( + r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]" + ) + _ex_trigger = re.compile( + r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that" + r"|Compute|Calculate|Dimostrare|Verificare)\b", + re.IGNORECASE, + ) + hits = [] + for i, line in enumerate(text_lines): + m = re.match(r"^(#{2,3})\s+(.+)$", line) + if not m: + continue + body = m.group(2) + if len(body) <= 100: + continue + has_math = len(_math_sym.findall(body)) >= 3 + has_ex = bool(_ex_trigger.search(body)) + if has_math or has_ex: + hits.append({"riga": i + 1, "testo": line.strip()[:120]}) + if len(hits) >= max_n: + break + return hits +``` + +- [ ] **Step 2: Aggiungi la metrica ai `residui`** + +Trova nel dict `residui` la riga: + +```python + "pua_markers": _scan(r'[-]'), +``` + +e aggiungi **dopo**: + +```python + "formula_headers": _scan_formula_headers(), +``` + +Poi nel dict principale `report["residui"]`, trova la riga: + +```python + "pua_markers_esempi": residui["pua_markers"], +``` + +e aggiungi **dopo**: + +```python + "formula_headers": len(residui["formula_headers"]), + "formula_headers_esempi": residui["formula_headers"], +``` + +- [ ] **Step 3: Verifica** + +```bash +.venv/bin/python -c " +import json +from pathlib import Path +from conversione._pipeline.report import build_report +from conversione._pipeline.transforms import 
apply_transforms + +testo = open('conversione/analisi1/raw.md').read() +clean, t = apply_transforms(testo) +from conversione._pipeline.structure import analyze + +tmp = Path('/tmp/test_report') +tmp.mkdir(exist_ok=True) +(tmp / 'clean.md').write_text(clean) +profile = analyze(tmp / 'clean.md') +rp = build_report('test', tmp, clean, t, profile, 5.0) +r = json.loads(rp.read_text()) +print('formula_headers residui:', r['residui']['formula_headers']) +print('formula_headers esempi:', len(r['residui']['formula_headers_esempi'])) +" +``` + +Atteso: count numerico (può essere 0 se la demozione ha funzionato bene), nessun errore. + +- [ ] **Step 4: Commit** + +```bash +git add conversione/_pipeline/report.py +git commit -m "feat(report): aggiungi metrica formula_headers_residui" +``` + +--- + +## Task 5: `validator.py` — penalità formula headers + +**Files:** +- Modify: `conversione/_pipeline/validator.py` + +- [ ] **Step 1: Aggiungi la penalità in `_score()`** + +Trova in `_score()` la riga: + +```python + _pen("pua_markers", 2, 20, "caratteri PUA font Symbol") +``` + +e aggiungi **dopo**: + +```python + _pen("formula_headers", 3, 15, "formula/esercizio come header") +``` + +- [ ] **Step 2: Aggiungi colonna `fhdr` nell'output tabellare di `validate()`** + +Trova in `validate()` la riga che costruisce `header`: + +```python + header = ( + f"{'stem':<{col}}" + f"{'h2':>4}{'h3':>5} " + f"{'strategia':<18}" + f"{'bare':>5}{'corte':>6}{'lunghe':>7}" + f"{'btk':>5}{'br':>4}{'enc':>4}{'url':>4}" + f"{'med':>6}" + f" {'voto':>4} grade" + ) +``` + +Sostituiscila con: + +```python + header = ( + f"{'stem':<{col}}" + f"{'h2':>4}{'h3':>5} " + f"{'strategia':<18}" + f"{'bare':>5}{'corte':>6}{'lunghe':>7}" + f"{'btk':>5}{'br':>4}{'enc':>4}{'url':>4}{'fhdr':>5}" + f"{'med':>6}" + f" {'voto':>4} grade" + ) +``` + +Trova il `print(...)` dentro il loop `for r in rows:` e aggiungi `fhdr`: + +```python + print( + f"{r['stem']:<{col}}" + f"{st.get('n_h2', 0):>4}" + f"{st.get('n_h3', 0):>5} " 
+ f"{st.get('strategia_chunking','?'):<18}" + f"{an.get('bare_headers', 0):>5}" + f"{an.get('short_sections', 0):>6}" + f"{an.get('long_sections', 0):>7}" + f"{res.get('backtick', 0):>5}" + f"{res.get('br_inline', 0):>4}" + f"{res.get('simboli_encoding', 0):>4}" + f"{res.get('url', 0):>4}" + f"{res.get('formula_headers', 0):>5}" + f"{dist.get('mediana', 0):>6}" + f" {s:>4} {_grade(s)}" + ) +``` + +Aggiorna anche la riga finale `print("\nColonne: ...")`: + +```python + print( + "\nColonne: bare=header vuoti corte=sez<150ch lunghe=sez>1500ch " + "btk=backtick br=
<br> inline enc=simboli encoding fhdr=formula-header med=mediana chars\n" + ) +``` + +- [ ] **Step 3: Verifica** + +```bash +.venv/bin/python -c " +from conversione._pipeline.validator import _score +r = {'structure': {'livello_struttura': 3}, 'anomalie': {}, 'residui': {'formula_headers': 5}} +score, detail = _score(r) +print(score, detail) +assert any('formula' in d for d in detail), 'Penalità formula non applicata' +print('OK') +" +``` + +Atteso: penalità `formula/esercizio come header ×5 −15` nel detail. + +- [ ] **Step 4: Commit** + +```bash +git add conversione/_pipeline/validator.py +git commit -m "feat(validator): aggiungi penalità formula_headers, colonna fhdr nel report" +``` + +--- + +## Task 6: Test di integrazione su analisi1 + +- [ ] **Step 1: Riesegui la pipeline su analisi1** + +```bash +.venv/bin/python conversione/ --stem analisi1 --force 2>&1 +``` + +Atteso: completamento senza errori, print `Formula-hdr demotati: N` visibile. + +- [ ] **Step 2: Valida e confronta con il report precedente** + +```bash +.venv/bin/python conversione/ validate analisi1 --detail +``` + +Confronta con il vecchio voto del `report.json` originale. Il voto deve essere ≥ al precedente. + +- [ ] **Step 3: Verifica riduzione PUA bracket** + +```bash +python3 -c " +import json +r = json.load(open('conversione/analisi1/report.json')) +pua = r['residui']['pua_markers'] +fhdr = r['residui'].get('formula_headers', 'N/A') +print(f'PUA residui: {pua} (era 10+ prima)') +print(f'Formula headers residui: {fhdr}') +" +``` + +Atteso: `pua_markers` ridotto rispetto al run precedente (era 10 nel report originale).
+ +- [ ] **Step 4: Commit finale se tutto OK** + +```bash +git add conversione/analisi1/ +git commit -m "chore: rigenera output analisi1 con pipeline ottimizzata" +``` diff --git a/docs/superpowers/specs/2026-04-30-pipeline-ottimizzazione-design.md b/docs/superpowers/specs/2026-04-30-pipeline-ottimizzazione-design.md new file mode 100644 index 0000000..698a7cb --- /dev/null +++ b/docs/superpowers/specs/2026-04-30-pipeline-ottimizzazione-design.md @@ -0,0 +1,80 @@ +# Pipeline ottimizzazione — Design Spec +*2026-04-30* + +## Obiettivo +Eliminare la necessità di revisione manuale del `clean.md` per tutti i tipi di PDF (accademici/matematici, giuridici, tecnici) ottimizzando i parametri di opendataloader-pdf e aggiungendo trasformazioni mirate. + +## Scope +Nessun hybrid backend. Solo Java + trasformazioni Python. + +--- + +## 1. `converter.py` — Parametri adattivi + +### 1.1 Rilevamento PDF taggato +Funzione `_is_tagged_pdf(pdf_path) -> bool` usando PyMuPDF (`fitz`): +```python +doc = fitz.open(str(pdf_path)) +tagged = "StructTreeRoot" in doc.pdf_catalog() +doc.close() +``` + +### 1.2 Nuovi parametri fissi (tutti i PDF) +- `table_method="cluster"` — tabelle senza bordi visibili +- `content_safety_off=["tiny", "hidden-ocg"]` — evita filtraggio di footnote e layer OCG + +### 1.3 Parametro condizionale +- `use_struct_tree=tagged` — attivo solo se il PDF è taggato + +Una sola conversione Java, zero overhead per PDF non taggati. + +--- + +## 2. `transforms.py` — Due aggiunte + +### 2.1 PUA bracket TeX (U+F8EB–U+F8FE) +Aggiunge al `_SYMBOL_PUA_MAP` i glifoni bracket di Computer Modern font che appaiono come PUA: +`U+F8EB, U+F8EC, U+F8ED, U+F8EE, U+F8EF, U+F8F0, U+F8F1, U+F8F2, U+F8F3, U+F8F4, U+F8F5, U+F8F6, U+F8F7, U+F8F8, U+F8F9, U+F8FA, U+F8FB, U+F8FC, U+F8FD, U+F8FE` +→ tutti mappati a `""` (pezzi di parentesi non ricostruibili come singolo glifo) + +Il `_SYMBOL_PUA_RE` si aggiorna automaticamente essendo costruito dalla mappa.
+ +### 2.2 Nuova trasformazione `_t_math_header_demotion` +Demota a testo semplice gli header `##`/`###` che sono in realtà enunciati di esercizi o formule lunghe. + +**Criteri di demozione** (tutti devono valere): +- Livello `##` o `###` +- Lunghezza testo > 100 caratteri +- Almeno uno tra: + - ≥ 3 simboli matematici (`=`, `+`, `∈`, `∀`, `∃`, `≤`, `≥`, `∞`, lettere greche Unicode, `lim`, `sup`, `inf`, `∑`, `∫`) + - Matcha pattern traccia esercizio: `(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show|Compute|Calculate)\b` + +**Output**: rimuove `#+ ` iniziale. Se numerata (`N. testo`), converte in `**N.** testo`. Altrimenti testo plain. + +**Posizione in `_transforms`**: gruppo "Rifinitura", dopo `_t_garbage_headers`. + +**Stat key**: `n_formula_headers_demotati` + +--- + +## 3. `report.py` — Nuova metrica residua + +`build_report()` aggiunge contatore `formula_headers_residui`: +- Conta header `##`/`###` nel `clean.md` finale che superano ancora i criteri math (sopra) +- Mostra fino a 3 esempi in `formula_headers_esempi` + +--- + +## 4. `validator.py` — Nuova penalità + +| Problema | Penalità | Cap | +|----------|----------|-----| +| Formula/esercizio come header residuo | −3/cad | −15 | + +--- + +## File modificati +1. `conversione/_pipeline/converter.py` — `_is_tagged_pdf()` + nuovi parametri +2. `conversione/_pipeline/transforms.py` — PUA map + `_t_math_header_demotion` +3. `conversione/_pipeline/report.py` — `formula_headers_residui` +4. `conversione/_pipeline/validator.py` — nuova penalità