From b95312172f1d9bf80a152a8291df57e1f37eafd6 Mon Sep 17 00:00:00 2001 From: yeungadrian Date: Fri, 3 Jan 2025 23:49:37 +0000 Subject: [PATCH] combine xlsx and xls to excel, replace openpyxl/xlrd with calamine --- pyproject.toml | 3 +-- src/markitdown/_markitdown.py | 37 ++++++---------------------------- tests/test_files/test.xlsb | Bin 0 -> 10701 bytes tests/test_files/test.xlsm | Bin 0 -> 11624 bytes tests/test_markitdown.py | 12 ++++++++--- 5 files changed, 16 insertions(+), 36 deletions(-) create mode 100644 tests/test_files/test.xlsb create mode 100644 tests/test_files/test.xlsm diff --git a/pyproject.toml b/pyproject.toml index 9c113ad..bbb1141 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,8 +31,7 @@ dependencies = [ "numpy", "python-pptx", "pandas", - "openpyxl", - "xlrd", + "python-calamine", "pdfminer.six", "puremagic", "pydub", diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 50c83b4..ec4f63f 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -715,42 +715,18 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: return result -class XlsxConverter(HtmlConverter): +class ExcelConverter(HtmlConverter): """ - Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. + Converts Excel (XLSX, XLS, XLSM or XLSB) files to Markdown, with each sheet presented as a separate Markdown table. 
""" def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a XLSX + # Bail if not a XLSX, XLS, XLSM or XLSB extension = kwargs.get("file_extension", "") - if extension.lower() != ".xlsx": + if extension.lower() not in [".xlsx", ".xls", ".xlsb", ".xlsm"]: return None - sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") - md_content = "" - for s in sheets: - md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) - md_content += self._convert(html_content).text_content.strip() + "\n\n" - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - -class XlsConverter(HtmlConverter): - """ - Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a XLS - extension = kwargs.get("file_extension", "") - if extension.lower() != ".xls": - return None - - sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") + sheets = pd.read_excel(local_path, sheet_name=None, engine="calamine") md_content = "" for s in sheets: md_content += f"## {s}\n" @@ -1376,8 +1352,7 @@ def __init__( self.register_page_converter(YouTubeConverter()) self.register_page_converter(BingSerpConverter()) self.register_page_converter(DocxConverter()) - self.register_page_converter(XlsxConverter()) - self.register_page_converter(XlsConverter()) + self.register_page_converter(ExcelConverter()) self.register_page_converter(PptxConverter()) self.register_page_converter(WavConverter()) self.register_page_converter(Mp3Converter()) diff --git a/tests/test_files/test.xlsb b/tests/test_files/test.xlsb new file mode 100644 index 0000000000000000000000000000000000000000..c002872546c1d070193943a8225cd7fb5623c916 GIT binary patch literal 10701 zcmeHtWmFv767Jv*g9mpDI=H(9*FbOx?iw6|y95jF4#C|C9taQ!!9BQpfBC0^k7v02yGGZ+vGQ0s!EG1pqJs@Q~VK 
zcDBwYw$A!$?)D~5x=e01)}*=6khIwVNbvRlJN}D3(55DDwZH;w$Gstmtn8bxMJ-6U z*o@5wZcCBU35?0C*M~76M8$bLcn>&YVASr-yw$V*ezKItWyhX&R>3h#0c)Agcvi2u zfv2_M{PN1={`OG&glde|f65@ek5!0tV{1N!gwqu+CIK|3k}?gtsZ@z8t%kuE&D&s3 zL}C>8LH67cS?nR+n6}<(T<{^!V zKQRh8!*Z8p!X=ENxwO*E6U=3toLjz!0stN!pa4q$AOfu(10-)_A;u3H8Q1Png%KLR=)jlRN0-)X=_M?@sQfG8Z==VTuWq_n)f zES${nVhV7X zMitt4TQ@S{eVhqzAJ$N!SuIE4!zrRf8H_<`tPz?9`*e*ewH{2<7L~<_^?~cc9}!5+ z)V}uWR$%e1RIpicnW~B{BGbT_fT&cVZ=o5PhEShnqBg~}&MmllMVov1Z%cpI*SsoU zBE0r8Mp|Bh>r)gYKq4WWmfduzXTMkDoqiY+O_S4$7wlv~UX4XX_Srbc{N297PiJ|1 z2i{lx;9Z9XcBY#(v#Xt>m64sD6_b&LZDd@xteP@beMTix)>q$rEjsJ&%1G>={Kb9pp}j5uxs3 zt$_L+ffofESab!O5t-klbDgnS3QYvWe3g4fo?#c8isRV_x*)R4Me)c7^q*iE8f%_? zLljpuz&krZ-{!&4t$v89(@UL>6&9-1!8x=;9L-&-fyl_X|-?JuT-eAO2VR zDSBzB)?Iiy6mcUhwoUOmVIt?* zvBn|2aiT=_G5&&dJiT82${j(w0_<^t9iuchQ#U~>V%;|vl*cVd`!<-|-2xvm&R=5| z)qUGqIP!z7v%5k0U8Eq~RkPiLBSZ=uDfj?*2ymqQLBfBJm%j)Z0-TY7qw2r=Xp8%( z(8B@@Jd$n;sCQkX{0^VP+fJIVm&5=18E5rWZ$sO#;u0}^Xp(e+4tX>i{g_vcY5n~Z zkCWgx<*ta~3~+uGmr*Vm{#yqInQ;a5QL(?`fdS4-zTpw94H#GEn z>UW}Uf0GXS+8actG=Tb|o3e9+TsE6DhAlyswe7ZvB47|Mx?RGtC1EzgrL!CB0a-QO zzgZ6>I5K*M-$8CYujgoFUzJ58;{EYV(YvEY(fIp{x3$+AS>Q$g*XQ?LtT~-O_}T4< z2mm|*U-2i;IGLN6I6E=_96&z_Cu!Ig6NDA)t0<~UngD^wh+*U>(?>2U+zgMvUZ|w3 z_Cdm?c^`euFjm-~h^u?uco@H$UKS|31o+NZJwSt+6vKW(cpKSyJytE?o^ve1=ymXX z-2T$l+F7tP^&_Njxy$~A%xV;G^YFaE+>!ti@v#J>gI8AjWn}Br%$4`PZ`rZd>UpFn z2i%!O!>$pY5Z~QVt=FqAspAS1mOKEVnb1dxCfYsV%wBDpfbP~wG_GO0567ix`hZ*X z#r}pq8IW687Cw=p33Ml|;Y|WxYa@UmdHLDHCVX@|a7D}s+PV*V;4G%wTtrhAfjEn5 zAp&8hT94E~0{C17GG1;NL!lS?P6?O&=&(vRsTh%QO7{CX*1H~Db0Wv5eQ8Q7?(-F z{NP#;!~(rdlC#W8v+%G*mjs{eIAD|Nb9WTX`(qKA86ikxgv;h+dl6tUEwCyR^=@Arl!p9;EvV6e$__ zMkD{tLvz>k?}F?l!Pk)fz?6=;yV%vgERo z=}S-Y@xN!4L6{xaRylt1jS90%pK)Lc$eEb=a)d4Dsv&hmG{*Cswf;m$NcuslsI+>X z=W1cLvcEAec{$2XBi-A|YxrqSFerjUY^~(&iOMR;Z{t>otHDVV%w)P?*84Yc%lad3 zz2d87`dF}nAM*Qf=v2h3{X+-qLv@&P4;gU@q0LSpdb^eNvk?(nbT4uPV%E0Fo5Ips zLf6Q@!KMmXXo*z|rDuLGqPZ50Cq~o>E%DSR5BZ`H20B+-#$ zu7uE@k=IM8;{9dV#613<86*uP5(T6qavw{Aph}zPCsp{!++n&WpHYG=y4d#;4##yr 
z*msi}%|7=Jr63TYlpP=x-{0ahZ4C-Hcxw*Kr!JQXvj_BO=7IBpciH5|Z92Vm zzG%j~dZ@h@EDX8tiw9-4Z|J^@88MwaNttacy-1$@zI(x)q2aDg1Pr*hOU^PRAt<$y z&?oGr4X0e1-&L$wMCA;zgWJ=09TE$t@YDb8YPbldSTuu60g?DW$^k!F#M#`$#)SFj zH|R$>VE=`7#JnVy5A(SYd^=yqvzqo^0q%oRLYi&p?~-+zbcy~(fztcDpzyAPjG@5DDz*fs z#l?a%;-E=QdmfhDA>o$(;Gp->F&{9xaofZ&x1M||L^_L{9RdpBREVMD+UM*fea_Ne zXx9_xOF_rhw&rN-t%Ety)*sAm4(xdu`YV0-R1eNt6QSbTtAI$tRuS>fy(DQ zZm@cODqdB7m5Exqm8L7$wE2{RbHZ=GXV1N;;0%#_vRM0`pkumHopp;IOHE8q*D)L+ zqOUvj3ju@A2-Jhg?*H_VR0b7uZ9m=uK8Q;Uw9cYans8I_^!r;OMWT|> zY@$eZt0AQ1<0!JBfz`ag5%I}8thN^XuOIiOi^lXa1aB|Sug@mUgv^}iHG&t3!;4xL zTS+=@_FBg>gnX~>7b7YsOQuhB20HI{A{smI?gg)^*3@a2yv|M!%HG;_+?~dd=xeq$ zVmKG~giyX;XpT}$_hj&g(fYRD^VJBa2Z-JS8MHFL)u*Ej56JC3Qt0r&C0p* z@tPuFk5q2cU(>V$TN3v8HE7N>j04d4RYxl@_pse?91YnUz{coxfTBVJC17blh%`&`nK(!Y(t z8k%fM(i)YR+eXr+>~QTRCPE`jF3zc#TolWaxRW1w+AvvoG4ZiCr6|ErH#y!;6ESY= z4qf5MP84X1GmQ_STtoJTQR#{29NIQH>zgoADbB3hP!Q3qiBtwqf{oyn@S4(q&-<8c z@i$Iy5eUA&PH_$858=;tvuYs$TIb(dlQp5Om8xoChqkdQGBme^r6_j>z+{k?tHIVS z$DW7wn(f}E*4#cs)@Xlm;YhW0wKm@Hw_K75y~v!?o;v0cR+XxD5ef3<75MQpR!qzGRCRh{jYRo2cI(!qY%lU=8Z_Kw>( zHZ91W_Q*K7t4kg;-#R@oY%CNWjx4nswsT*GMm4iJ)6uM-d;fN-YO1k!(%reMf9K4y znITJF^<+i5ykFKw5OoZnTmr6sicRxSuR5dDY=a8I!V?{1&X_u^G)UZrhc+NTbA-$s z2gA}t<9)87q4L|qw@;#knV@EaAxI zK^ovzm%mmoB#7JlvN1GOQ1B{Rj)1bxk?mRWU@>vO7<>NLId&hr)Z*`sX40P z%ew>2Qzi(Av30YZmMwF5{Q4Q7gP#rzG#*p)ov)Hqjg+!<=L zp`z2>MN<#s)xMrqb%LYUMOiep50=FCn4h?L(*0;OpZ6A37SJJz=LtaF9J-+;9U^IbVT` zpOLM*4V$6WxP0*KLh_+xkC47l(Xczyx(N;`;SdXO(y>6bBXZ9pa z>1O99T1Y8}#G`op@LD265B1PdlEG2Sj1eV~rU}VwTK!BkHS0x_IBi;=cGLP=8B5;U zzpDag-<(R4L~u*mMs=gl4&P`W>rzSHs+CV%uBG4iXSJ^*>pAg2lZVcZW zmMd3XLS0Zv^7X^dz5;T)aJ#~opk4Q(8iLt7)qlK;6Pj{=xwc!Z$jJN zcM`1wxGe5!%$)Vo$@rHAjOcV>rpzYXsw&cnDm!&iL4%SS#j6!2irWsep=`a7#A=SQ z;%}VQn%7L<_xVf85PZ}_xI&>&E6LozJcjw5>~qb{q%E|{!rB!!&s{wV&U4vUiPFY6PeS?3!8(Plegc5 za_;Mb)E8+ZVyv^Rw3K=|dBWoULUU*@sW8*-{ z((Q;@RU^S9P*bn9q!^)UT8`8zm{)Sg3pO02jmE$tD;DBs`0=-(el}6OI3?k8OUvsm zVdsP;DjJ%9P-}mel?ghc;F~KAaxXMm*>auX;CM~!ad$px5BEC{+)g3*Zh`N-`~T&E 
zpDFxL4+O?`%k;ARQAFIdowy_=Ph%Ra8853q15R2HSW}bX6?l6>)oe3i$FTKyJ#f{| zGyC9VCD$P3=OkgnQxbScWt(n(OM-Z zuji@EDs0Oqk&!130pqxIKCqXX@3-P9bEvyiPT0ZL;mUowz*m{(B2}(F$ z{AQ{Y{7^b9;9s7BKmT#6KbQ3HQw>hykL4nH`hE})&?5ZChx8u4w>uIUS=-O;DJGbL zfAe2sD^hhoLWcnx0H6V10p?(PFex~jII5aBJO5yZ$LBx&v#$oSnl}J?H^lW{Oh6;P zU=55dnfcuBIR6fm6xSl17;C&v6%eKlY>In>Eo!7fdq;3bhR0^Qk2biFlkojD!@;du z5LSG5h7-+UtLM?EnJ?P4wK%tEI5gXhl<`ckEFszGS!B5p^8Pv*{X28TidDgBy`=2S ztf_ofQ?Fpd<`HfND}Khn^rRCx68zIFosw;hYB`M)u7m8YJqbEA$s;jeIE|SpO9DHz zt*nQ4jGJXVC+u1Enwa0Yv6?(9lbR2()4cfHDm|th{CM!)pfVofU{m?|ihFwL>jtNG zz~Z2!wn$>h_zUTmu(U|sT%0)`-YxO5bCh9Gq38+wLRMFH6jdQ!B*PRD0}uNN+Q?dE z94e5`08*O*^K0DExFLs8kW1lHtK2lXxX%jxtIKkCYo28`0I0>Pqr>}myo<9&*SNZQ z%Sq|s_v^j)9<`UYXEk>ld^}~vIq&mXrpk^Z$e*@Jk^7PXj0_tDfEO1#8 zG(7>Otp;9Tz2szm#arGdd-;lI9ZPle>Kfs~+cMtdYp*zjvM?49<~M_;q@U1lqoD3A z+~flMix@Z(@W7+7-Aj2BXG3E{XG3N=Ln9OGXSSwxKa{6h>@b)-ffBxb=FGz?8Je68 zP>a|T>Yc(d$_(Qanv>%yH%E!BmacD+oVZ;#hoz7|LS3#oX~Em^O$f8j-Z*Oy^6@@> zrm93$CaO>st4RB`W|VrH1*U*Z$Gf{wR3#3L;kS_yejm`PYJxiKt3w z+!o|3NmgFBiAy$9SMbbmHbC11m1%ji0gJK3JPM6eM7}5U4z?~rYNtzps`s+RYTJgZ*RyrWtf_jTr1JB}Y4ZC+3%SjwXLL>;G381_zN> z+^}r`IHkBsyg?iEWTB2J!~tI9lxvl!l-$g`xI_`@gPm1C@YRPxpt77Q5O9hyt01gI zt2yrwPRmt6AQuVcS8Bru`ha;JXY(EVG}1(-szxO?8wXjd=q)a&lCF}Sf2S1I28kt; zf=qi^PzNZJ%~OG#njxfbhQH17b%dKLp5VbCguY?U0#usz)Q-kZ>9>24|G z_GuF|AfXrnMgm7NH1vZTqq)UB;dWw}Do(>Y-J+KZblS%vgo|*hk^O}h{-$3B?5`1g zJiXiw%Wi^pU-dy0RIemh88l4nL>WxXt9ic5jye(FPSRw3NwV0{2opO0#oeVl;R@9= zao}6yVUu?h1nPUI?ro*@{I=1#|_xys)@3HWj@!w@lj1@(6J znjgjwW(B>#Qg)QzQe8ucy=)#6lKOqW!hO(9v=khRiePdF{yi2A?d|^;ir^Uh^GJ`E zkD2?C@g&}mkS_-iC5htl=&4(m2{@}$&K%5gV;b2QNoCH$-|X!HA#w!Nc5MXpI(_&X zHhOYO;!z>BjvSNZp=G=9^fepDW`{4eZ&IkF=dfEYQ5|1F8>8*6!!@IJ1PSA?gyEn% zJx8yX9sQ&>Zl2!T5RowvBB?6ZlX+WH2TR=UZr6pLg$UZRC$)#cvhV~M2dvlRJ#a9# z?OY$bcv@3b#p%yAG)SuQ7kD&KyPZ=}jm>FF5tn`4>lS9b)}*G`f*tX!a#j;Pb$H4p zIb9X5U92(4DL^@_dE>nY7v=qN!dD>YjF*cf2!H5gF>}b?we| z4QM$?Z6Tt0u&~zo3IS)w`VHg6tti6;ABCyYOw{zk_+5AHseIWY$Vfl;=u7*mZ0ki# 
z>WUXkqeOFw^K0^@xn^I+!6NQEr7P+(nN|TTvyaPB_d9K<4`L=~VNIAPjFFac`o8|Z z;u`{z5v)>v|8mq{>-pFCn^&cjWd8~9&zCCxGW;-}E<*El}`eyRV*rjI4;FH=Ks$Kprf;IW{6jPO_~{zBjf zXFPu{=YP_Tj{zSGuU~+I;L82K1^gw$9s@pBD8B$r2_6GJ)+>*(9!rQ{Sm=cR0qeKx z{#YXX!t(wD>;FeEJf7g=jPn<009f+;+x+t}%s-Q{UmyTLB{+Ngm(=XB`9GhZe>dkN d|C{-rPg5mX7;s3jq%S0YMH?fG-vK84?0w8wLUb3j+R`mZ+_b zld+AHp0c~0v7-)?o3#~L4)inHEQn{|&;RfGf9!!y19cxH;Qp4Q@JXd!kfmk6GjV^uxEg*PD2$tQ@ZDy@fL%c77!k z@zk{FBEWt9^+byj3hNsfld(LUEL)e|F~oMz-{=ez(ghMR z)CC6mjmj<6v$->emIzWQ3^$T!kZV5pc9PgYy-Ft3SfbOo`ILh8<&pX`KzO)KOCQ7H zAm=tXPuBbJWuuYpTfeCUB>!B<*q$uM<97AZSdEBSHcxIOSf0qUrNpFu`Uu#}ok(@x zK!Z8=g-P#4SM#^WsHqIJcPQe=He79YGRDm^<|SwWGH=-x!wJKM)gC>dwZ0h#8Vk)r zk&;K(icejja>8&2cL-L19N%j5%;$N ziZ6jP7zv!>dJe``j?7FyU;mfd|Bv(e-@bZr+?W&;E4tV+%p>_mGw;00o4om!5P5lY z5GjkSNnN%iHKoFQ>#Gu6oljz;Jw^u$X$|pxR_n!X`5%yPRilbD5l_Nuj=dQ&@mzB) z=@iAw#wrU{ZRq-D&+e>F-rzc#OEF-@(J~3tI3zmOLrPePp(=Ixd4ZgWh&LS~_@R|8 z_YhJVTl}vUMFDUbz)~&zts9ZBF0O>+$2F8FHj6>{uosacj7F@fY~kwqdvtY*RUVAf z=4GD{YXaAWhR}qDKlO%w(WZ0xs$)7~Gg$Dx5g&lq50ol^zd+E`jw6oaBeFYmt!X&C z!XCT2#PHT%1n1tyY3xfce^P9{Yb(Ykdu`^A(}n6xkMTiVADgJXJq(_=`@zGuI^3ZC zDWx6~8IyvcARrJCARy4desr^9cC~e|1lrnK{$$Kz6&>5P*Vx{A=8vKYmlH0%cnUl4 z@UtZoQS|-QRdR`v$Q2U6DHp^^vxi$l9Kjs6Bw4}y)3iChBe)Tcze{edzwVfs;S92s+#f6x#(8tp(GYnK# zEf4$5P+`Kk#~4`r={-oBq9!R~^RvE;8kGYps=S!H^&vXY9pSw?qgyv-EGJIUIqI9r zIE$<@#1>Uz%|xANY6=Y&QvmAEoI-={edk(87FsH}%S@K!J5A7ab)9v!m_Cqq2=BZ1 z0sbXbx%3&5*_|SKkQ{Td0;ie!Z+gF=Ta6gn;l~A2dVZ%4hErv`A}d;XheOl`|6t7V z?3+amrs$DC$R=rh!^P0KfAXF=jZS#!bf&p6o64N3WMamKc9oZYFbfW$i#9}=Q56Yi zQ=pEBHqqSnCfODo0#@&R`Gm}ra#h7AP6=zddo|-pU^EgWMQy>};d^OVS2#-O10m_E z55tpClR=Zc4*I#whEW`G9QB~Ni>^-bY^=y{ zr%1=7%2MoDl+b>&vCEOZ$e34T>wr)+O-+^-*#k1d=rbG#D8$}NwM$%o(q=Eh%Vva& z_6Y>ZQAhvmEnWql2%@wbBkVj%__$TqBN148YO~yM?-rEJuNPzurdpRy^ZV#TulNKr z9#IZSd(H$o7MEPtK_Dx+H0uckd=m@nOm~!Q8y)xEZUg8j>$ynNj9sAq7yfD9;`_d< zN{HG#JbY34Rs+uRqDC6x&SJRJ-_Ke)#$MZCeJ-lFo3y3_v5OTbnSDZ} z+DTWbI(~@{0m+>m9q5<*aB?0k>tf#1p-maP5zU?HSF4mCBfR2)r}&|d(ZIQ-y!u-K 
z{3Mt}4{GyT|FXG+m*UEP&*$sORy?I#Hr~EifC|%}WNKFI&Xe)`M!4t~b~Em;<1G_4r2`lmZnP z+<>3F1?MdR1Uw`-Xa8ctf6v^1u_7c`@PaAx|M%4tH)PStf*g1l)Dke(Y#;X^-_PSo z7t~xk564o087ymM2oiL^O-VT5D$h$E&86fxWgRrR@e&M`CM_Wk*-8h#zT!W~4 zDW`Tx7E;6Bc!`P=AyHn!#LP7^V+oKm`XWRDdabDcu_GWsS&N9nSRhL}xw}W$k-(54 zpcJi9Ok?vpJ))H}E5$r4T>;&7lc3WFx$oN6dYMBfGP=cwaP$}-sV@{MJ}c^FrN8-| zF0RHi825~9?fBJ>9w?1B0`^2|l!ej_&`NjggFv=A%l4Q_j3J~RWdWXib}eQq$0~Z{ zXzw346$~6hf)@^|k3W$9wA&Z&-x0inhJZN3fPi=o4vD{adq*>4V<$)EUkmHcZ9gqu z);5;~{rzF$g`j#1oMD_Q9U@3IGUr0o`O6nm!}t1Kq!TVJ%IB?zLR@)@ObQqu4ETJO z@I629??qB;9SL)5$!Fvxyh|)qbWs^ywjCHth(ix533S0l^mr#kx+!R5!n-w{4H*KB zz=d5MS}}x=R0;V)WKFaDwGyxGn@+h_VD>rLXP?+HM(J^0lfcR|iQaooWVLCj_V&I= zNWy4zG|e}If*hYwXi{RQS0avfWqIRK* zUoT5oTnGaoIh~pBxXW~euR}*6N;*m&6k&lBo#iZ?Ow`fgS@vy~whcD2>_XCZ_VO&8 zCLjK2`%F)z3s&rS8I*-fNIWuZ&PHsy^TD*PW20!E-%pbhBRQ+?Fsq4&kxatMe2}A$ zCD<4I>plLpJtQPAMy0Z#2kB9*K@xPsK4ZWIt>A@zlW3k(l}@O&)mz60+7~48xE9IL z4ENm;dG?&&-ao)k_c#*>`rh$)x2P$nPi0O=b3U9;DHkqJXxrK@KLp--eDUR5D!K_T zDmbo{Y_*v?3+ z(T`)D zSJc|sV!cs2!jFnQq91sHuQYt0Q&k!{VHfhE7C7UUYW6r-nWMX_(Z^F2hO*ZNISj^T z>L#;M$9eUxP9ZQ`5of;Gg?6+2lW^2= zKD|a<>T4Ic4`~{1Mbu&EukVD5)IY8qEq_@V5Q?8lx~tS1JvsTzd@-f{$1I{!Kd{ z_qeE*KM@u^Nu|5>F(U54>MEv-_iEe5&yRutS97>at9<)J%P{SrjlQ@Iz|xf4DuP;Q zgJaXth2x3#$x`}htA{U`QvMFVN;(M{xpP@YsgXej)m}+ih6)w7QFb*&+J2dlcNNkn zNMCabq!1;qsi>|SSaiiLfKnD1A*RK7F-e(v&(^o+XSb8L@Qm?H^9qt?eiGGxmVHou z&<}86uA_$gue%o8U%S@4&97Zc?FTQ}asamp7Pex|sD)?U=L@_259I0)O($Di`*;0%Kf5HVrMkF(YraTg271bv%s9M?2@8dD?}g@pn7 zed|9~2TrnvrfMOEMMX@JzVS?NC1@`B1Gz@|oz%^%6$V$oS$*0^Dfd{&qm1mZ6UV_K z=EibxL=$}#xgu}gqBPpT-ucc5^?Y9S>_SB8jwB6e@@3b%R3$p3CZ;`Ioc0)dWSJ|S z%j%{5fnfsM0WpdIYfE4L(1+HYJm^<*Ke}Ng3QULU)|t2c*#u&3wpx*}@+ZFPBEz|f zb!En58)GrFw8zP{n_yX8=}<3+Hj??~GDcR-?)CXH#ZSl$h;1-eCU-uj z;mr|BXMuA)OjWMI$&Iq#_icS;g3eE?g8#OC#Yv+EkYS6}0o8?$1MZ(wg32*OBzY=j zS!)-)W8YE=5KnyMZg}6(Z0B0%BJVNk-%4`0HyP(|bO7?)Y9$b_lxwRtx6w0BuecwL zyOYOk8^Ue|%pL`A4Ua2qq(%8FVPP%uC@^9|Tf|*klkKj?pD8$4g zoaD8#n1RPx7E4P(^Xv^#i0&tS-3nMhmmPTo2kGCL!4cVp*v^9f#+PajPF?IxOo1y} 
z#+`qnRlL~H`~8qsTZRoTTerc8$f|{zswt`_Bk{cZ1DQ6?>~@GLQMML9mE0V0`Z9?@kAv&C|4bfdfsf|# zun-Ut1V7uNzbN0y%-GtP`Pb_&w%^mx3ZECp_F+B~gm30;k*sX)B($0Z%A$sqM(xGsSjwU~R2nyu7$KYp=EBSc-I(49+n#xi= zZ_IVr%W-(@6`oF$6LY~Z27uc@a=t|OQ#~r6+e#!(W4@V_C?Tb^14fEEq!PMrk!tfG ze=%CHF~sFfpyA#-?{-)D`N+TzhY15-$U`A|`i^N|tLUT+DOb4C6KUvpU|U&VPSYo5 z5xt132lxZ+JiU3S4EK=prCnXWH_5GXVQ6zjn0t`v76WkKITUCFzkYOGOdY@xn z*=p@3(q|NH>H!-fvWMkcMEJq@VQnM7V2?%C-nX?c)7z6pu)3#u%PR}V-$%YO$Zsys ze?@~Oe=`Pt|NinNtNDb;&wsy1<2s6zD%|2kZh9t`Kp9N99Y#9QF=;j(Lzk;oe3W#E zLQu2d?&oizu9mz9?8mGqnF2Z`=R-kAkcs(IMyU@UV@GC(iN5Khr5YyrC?@abn81QM zl&EPe59;u}KAlJmgP&`Ys7h*jhp=oG=S3s{=+mxVlxr(063!6xbU8WN>FdywBjM+9 zzj9ovb1U(=D%N8v20Gvw{Rk@@-96BhXnH<)K?%mfljZy_-%7 znX}WedVl1CNN)(GtGmEiPx59*1YLSWEq4aufL;6xV-4)e=RN80BfHqaz4SsPVppu} z7h22YQW4OXc#bh@Ku3J>nJo8!oHtqcfO5MHc9)O}V<%U5*0n~tZn||V`ZV6o9SR^y zUra=V2z^9p%=44VYK%Z-!iw3}eJA{y$j0Dk8OkQy5Y1jRR?c=e{Fl&S)=a^)1s-27 zV?wMb(UNx5d(Io)$&9cV1AP6t>t?m(44O2*mHD^NjJL=oSA@#Ia93Y1h(_%6lJFwG zz#b_{lf>Mak-H*KpmnfeF$g$hKShbXjDbtAlD)L{k95j>30)_o?JOnc*Gm&V5jQaK z;p5?Y8~{U=9%{~3!b2vG{rosh-2~KN@t=e9&?x1`Fp~9+m3s<#4o|P zeJIGx1Dk{A!u5jib+@f}2iAut*8|Skqhw26nmQ#5>m^8+blGP@jC4!u7W=~nx+%z6 z3fA-!!w$`{sc-E*+F~IuFpBkeOMbkH2-zkRaa3PXo8)rpcAREE7{m{DZnaP)1bX$? 
z&G9p?d2{t$6it3!NHtk&i=~}ocxWWPAfOP1iv*(8x|F%pu-B;RV#L(iiiqqEQYzHN zm=zO{Bb8BPi@Pu}YU1L`?)W5rR>7UlhOpQz{@UK>_g!`Vym^lZqE{wAGi!;~md4=DN8uPt^rs%SpGWy2kQNy^*0c{WSVl`eCb9?~fML#If6Yf2 z&f&SbEUvLhhsWQU%eW%t#KsvC`V}N9x#05sXnIlzE1x_}s|MEpsElg*nZ|WpY(QSB zWcN572@?OlW_W~?3ld_cdC)nC&DXQM+6YPFQka9=McgTN>5V+lCzl@TH91j~?`I7T z8t7s6X%W?lWl$(xvd+Db!le9E)Ha{Ubwp0MJFFT-V?pl06zk~039G2kE78{}eFzun zri)dD7Wmw<9O2yK@;Ka1V4bI@4(;f zYz5m*7)A{g$c#qkFQ(@bmDUH_L~;41MLdwfG&Zi*vvtk2seCgyUuyg?xv5c&j`JSS zJaK&QQFExq&}_30DZ@w+R^)RrW()zR;PlbgLt&pVQ*UijV?KyAd_cxqmyTiW$EkEp z?fq%EsF2eET#azi(ht~t)n;~D_hy@_3B&*!5ME&9QWoCL(A0|GE)iY%G$>ei)HccA z;(1b27RAmht^D_DMb1s)g|nADXeY+V5q$B{e7NYkU6z-&e-wbEHzKJ0;GRDsm|Ic) z;xI=icPnGZpQ^7~W!`3<1=*Xl+!y`qpx+#6$%r84Ss7-EU~ zd$hr|qPz2=iN*kZSFLWMXYj`CP0pnn=)i%O@e{h+@9AEHYGNnarmRA&k#iHzlk>>z zp94bkv{y+C$Oi-=V<-6bQnDs0v!luyRt`oD3@8V>_QlObr=NYQ9ScnQ6j?wmUcwE# zQ!KY8p-W)xyw3!Yyxq7-ECxc%l>Hxm9-J{`}pN0K5Wb8>1tV4jd|uaX%~PXzr6Hp=d;C=`f( z`$q;F(fyRC;Wl|#{20tPm_WuF&Ubyt6LYzXv zl21pK#J0#t5`{)03)jgP4L*U3Qxf||)ccII6d$!mz(#7=RQvea_01**JLgz zx=r(PxV`Ef^BjK0s_m={3@#tag+#}{uik}_?x^7sfXhS^oL zW8@x>-526{(teB^NBaXVpC*%$C|SnNxp=Zd#E=0Sp|(}RpNK(t*cW*Hv+w?&!VPJ- z+VTqgqy$_PVgDWUW(E$%MoLZ&<~F9kV&7_1+9rSn8)no!{{Yhc;CaZk%6nRJi&vve zD}JPqeWYQslLsval!;^=o_o6t8fh&xVJ&v{@tmBJsH~x*XR3fr)%fU3HO;tQcNWY9 zPW5%9;Tg{?j|7-E`RGg=<+V4oAhSCDc+`> zx#d8EZ6t&^6VM*r;paG29bBNlJGC^eaQA$d*K;>TBHn(bURq`;5*|r0INkGAex{<_ zyTkIM#UbWDdWzSYO}Y*ILEN7(ZJ}ouyz(~u}i22 zQ8&-{sUT!grB@X7DqB#%n>pC|$^8R^;Pv>k!6apwlU~^V7 z)Sad8I-GE{__Kxei5^@vGoa>>al>0JfQMgE<6$q#o0QKcw{=+JWF%0MJ4Rn1M}oUo zZ2#mPPzzz!TFPR0&O#!gN@ zqlNj&Ju4H?+-6UCNNPd;IX`>F{&oQ>_JWf_9)tZhay)2J~gmX*k_WmBUZ$(4!(a`9^vPHbs6KuMd2kJ293YZ z;Lzkhz$c>J$zAjrxB^uW`7Q0@jAj9EE){d4Nn3&*>Sk+(K6$6EpFj**h zN7}Id#5m>T6;Ud&-eY{yRG!B;9s=!G@&TcBW|gYB`0DG$7~@8ZM#`q8?6#wIA34Y9D{YO= zWR}Yroz`hl*l`-}6+YaI3BwN{KN}B-7dJ?|Jf#g?<jCtxF0r8E? 
z>p&hk=gKrr{G4~RWQZZp{;&+@e0}-HM5sOuGd>4D5e0_=K6o{4%3XG#K)W=5^Zw?c`Shyx5 zIlgw?9C(X71a-0Is0nY&`%Q>#_Qpx8mzRfLQc0ewNCZ$GBS*VYIZVCH0+Ua!?cG)& zqWCe&ft+y78(91SC5PVn%w&X$*Ge{uk#`G)Wh^vG%(s;!P)*1fhyDa_-t9{a^E6U^ zhKKO?(R6d>yu3LFfUu=IL5~tm-nBqPBC0&vYjcW~BulT`#3k#=OL%5DYjm3g#c5g7 z9`ljJ+!tyoh`i6|?QNU|mBB)be6^~lGUg_l80bTEmNLsz^*BFG`bS@R^YP}Wg8gI; z))g4wAqv=6@(#9kj?9L(4#q!y1U^ylzu5`cbzX4;KRM>jW#SE5uO|z2v^XyMd3G@Y zND*{1qj7;E+yy%eK=9RrLZGsk%;$HEHZ3J8i>^Ft5lYQbM4%83;gfH|4C==^i?hDN zIf*cqDz8+G$-+g}ESy$jEu$;r;M@5OYmLN`@q$j&Jw2URqjQczpTCcrf%2I$)dG$rpmHo z{W(B&wA+osNt*Fvg;w364&+*?v#t)y_vQ}a@2n3cyf%7`?DjbaDj(E(>nsQ5dVrqaZmhi@@`sAh zv1VT0K4)7GD?Ka{pv0)vXGT@WwE4rtm($}P-&cuN>I&y5UtVRU4@q(vQbHDKXYaGp zMBiraa7sRt(vnzb+}J?&TL>%DyPEm}%Vz;ESzS&}g9EH8;P`}Sfp1b_ZCL2klJs23 zH?Nej#-F!P*RotHTgS3w|Ew1S9_TJhImSNvKq}K-zqSuRET8%K>}cb%}Fu zZCcuOS)d5M!ZY6lDL`SfNXYid7x5;c>X%lKYJ!vX`f>X ziS-bMwI16A7D~cOo8o|z0KRpgozOy8ffdJ`F z0I-}oY@{UV8Mr2US~13-UbyNTV#RH!#@G@h899ELN%jEVQU8)Ckk4L%rR$#$X#VRb z{nz@3>G+lnY!@J~{I8)cBv%_TQ!u5I+!q z8UL5r`%|2!v)A89l;E27zfEGFqCB02{6?7r&vgHQ@=NnRMS0p3{*8hMR*+znr!C^A z08cCO-vIN3zW{#M=TA+a)-}IPM@jxNeOl!_MR= None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) validate_strings(result, XLSX_TEST_STRINGS) + # Test XLSB processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsb")) + validate_strings(result, XLSX_TEST_STRINGS) + + # Test XLSM processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsm")) + validate_strings(result, XLSX_TEST_STRINGS) + # Test XLS processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls")) - for test_string in XLS_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content + validate_strings(result, XLSX_TEST_STRINGS) # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))