From da035d69cfca915318eaf485770a467ca3c2a238 Mon Sep 17 00:00:00 2001 From: Aidan Do Date: Sat, 30 Nov 2024 10:14:05 +0000 Subject: [PATCH 1/2] [#342] RAG - fix PDF format in vector database --- .../providers/tests/memory/fixtures/dummy.pdf | Bin 0 -> 13264 bytes .../tests/memory/test_vector_store.py | 80 ++++++++++++++++++ .../providers/utils/memory/vector_store.py | 18 +++- 3 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 llama_stack/providers/tests/memory/fixtures/dummy.pdf create mode 100644 llama_stack/providers/tests/memory/test_vector_store.py diff --git a/llama_stack/providers/tests/memory/fixtures/dummy.pdf b/llama_stack/providers/tests/memory/fixtures/dummy.pdf new file mode 100644 index 0000000000000000000000000000000000000000..774c2ea70c55104973794121eae56bcad918da97 GIT binary patch literal 13264 zcmaibWmsIxvUW%|5FkJZ7A&~y%m9Oj;I6>~WPrgfxD$eVfZ*=#?hsspJHa(bATYRn zGueBev(G*EKHr+BrK+pDs^6;aH9u<6Dv3$30@ygwX}fZ|TDt1G($Rqw927PN=I8~c_R69-cY5S*jJE@5Wr0JUS6u!J~3#h`{ZMo=LkbbALoD8vfgB}Fh|2>mhOnfS$3 zNV5}8Ox=$fj;C0=UKy*{myZZPRVS|0mqr-HxZAy;()@wxQ}MN`QWAZTXb3Z&Om9W2 zbnA^OWoQbAW|3W^fw#J;YzDato8*`rHQs+@W70D&SyT{wb`SN*3nI z5G%$wJlq932=n{60Eii*9H8dFih2ks?QY=>nAFL=5g^P@#b{YUEHt0S$D7WbX zx%TzvzIK%zpvzLEd9LNr0ch#LFf_(9 zEGt0C9v~%b54vynAc{~;v&2?S(-sTTft@9CABMNFZHtY1W0-99CEbUNfp_yu{LDBz z@8z^$LPN$wX4Hi+dZQs6K3QiKKF0}Nme@EII;;F}IplC(YvT*C3-Oh#(A}e5pIz01 zyR}D2|ftBF0T=1moHZy}$wS*PSCmSzHQ%x z2tCQQCx4jt7w1cuhY69~eH`31KC4)ZZJ^)f=IabocAkBPa zEeg25yPX&9-i_N(Qiq!I3RDrfx&0t^i)&MSQ1D(w%|%#LTNr>1cPiltAYO;6kBn(B?r11c^Bz~#)z5~~V+*`U)lDFtKbZ|;? z&4wTUtK=KE&uQIWUQv1mDE;LIhXXgx44PMa@%Z<7a& zx45^oYSnei^~%}`?!O-+cgfSmn_c?`=Gmm*Z^I(96ve&$zDs|)r84)IEEiE1kfQ$q zm3km*m1)PjdU9nkk9BTlidI1~M|O~WfP7AUu2T}d>5is9l$<%;7r2&Re06w>W$KM~ zqITBTd=Ln>^crw`_N?{ z;2d_=E0n!*NisQ|XYuX9q3+UcqdA(MC45|>2tz^c6HdZOmXTB?X2Elx@_0f)1z&-gS;UxN`>Ll-kWb0X0 zTrQis=w9sJ(q7k|@|k3SA~DJ@uMXP@4(Mgn+LJC+3F~3NHW71pIzY(aHg~{O+squi zWO_|F>78)L5*gcRXXRD9IzQ(ddSxh}E7(8sC~EYrOz$9BkSMBCkGGO9FuZ{#*mW+h zvwE7d)6Ag=a*R5URs>}qdqb_E6g)kN2Wel;pWe9=hZ)XvRZR!RQg&gxAPGj8J0!gR zrdV<2@MZQ?_Ocbd5@0zI?t>$z3eD80_h^{DI)H5lk`T4lbn8kteH3%fOBH^g26#lLN2&P^s zr&d05GDs)u_8OKzCgNxllk5pLC<2wKmghL{zW%}5^}%S$?d=3OzjaSzT3>uWYikZN z2ZcR7*L|%UMs|u)wMi7#vkN?cxlBcyAM80Tyzzv&zHMF1TH9?Mx5&E57P^)^zE5N| z^foq}!--if$Uj=U6Tc>EM!Pv)e^_SZSdvtQ=@>)(ONejQ!XW8u6>ESl<*s^6cH;Q1 z#n}nL{#|{l}}@td^zNSA;R{`3A&Jjr8L9(3^2FSyZ1W9$%;!XP#N2 z-SAzyRfxtgq^py7_3*GJFO%x_v<`xJ46`~S*IukgQDKfLxzFnS&GYL!1LA{I z!c#{A90{k(b*tUfbgjOH>}{#V;%^O+LUU<*#QkLtWzjho*Kb?Cr&wC38%wxpn}^Wy zG6EpV9x3xioCWA6H6=aE3)%jmZePu#Ji7wy0CmkDZNG`a{J1i-2`Bt&UrFb&<~V$^ zy9i`R1<35M&{mtCz144%v#7LKBTPPApjoV}#W-gDc5cn;A@Mbt#zXUK@J9^vj*ME( zo8(%K{c-KDr8n1-I&Mjn)*i|pF|7l*`fXvo8-z&j{$NOfUPM-xILbX1D29IHp|__B zL*JQ8*7-VrZVY*&$!PiE%zv@osg`qx0M8+w9iy7Az7;HYezs;5NRvrdNM~t@o}5Gc zjagk3Y_>6!Ct;ITqhu3FojJO^(^SG-($M4|frkp?4y-QoSmFcw9Z%(z?eC0kGi9@? zm(vAgXU|%!6_)CrnqYL-Hj@B5hA?#8C3G^cjd?0dMSZ!wbe%O4bWvlIG=nwOEInVj zhjzd`Bry8sXBTfIUr+juZH5JyE#7~UQiwR!gmG@wm}aNyo`13xEo)tzP64MWWG|j8 z8u8a2_=C2FdRZ9(eG&Au`@$mY9vvWldP-@wj5@38H0W2V8wnaQO?!)qoS_J=(ieoI zOvH}mkBRh_p1oTW66+?3u-GH2Ex~c=BQiwpJ zJlF7O2PBaCojRRL_mp44*Iq}vcRFpBD>V9M7do5{w&b;4^<_V~Vr{+O_&hz9k5Sm` zq3|%Z(6B5~wz2k0iH-QlafAa>1%ZebdxkR;6SdA?@dK|4Jf8PIO%64Fpw$6RYG2R# zX>Iq(xf`5Xk)79-@;BAQjlWu|w@Ss3sJv3Ew&%lBu-H?vYsC8XPJD!lkv*A~z_-k= zLOaM?B5}$Sf-KF5BWHoB51WFA{GlweQna618{*tqVn)YKUVq?khU_=QER9uW?N17xgAponbjg0W`=>f;sulH3?st)Y_@k$We2-__a>^{E78lUiI13qq!3# zwxMEl75MK1q`~J>ST#?`mUx#vr%-jwpZ+DV;W!0KNkZmO#sK)zt)H@`EQl6RRWhwb z0&E7|fG~@z)wlK1-RsxN#8Gr)D5=xpv=b}=CWPbwz@(9bIhD0Crd-Q>qEo>~Gh{X7 z77AK5>TfF0wK!?7Nx!<5uDy?D{Qg$SEc_R3J9EuH!Z@qmEJ*QRRHd3BPirM6783nv zAnab$>rhdDJ6pO@%Ox(}BYw{Ba<3|=A%Fg5_Hfxj{%CfzZCFO{?%h&=?%CNBvi&p; z(otqN>+5giLLa^*G?xzN30=IgQrV+r7dW4bX;zKtuD)O$UnwAKC?CpkPt{77nUArH ze-jKcCfRrOlp(Q^b&W}mrgt4n%wikNxeSBBE_n>K-IOIzi6!<)xGRYA)wGgqp^s@d46N#krDHPc#9SOgXhI7Vbj?B z%c6@8dCOGPYBoNE#3N7HD^ihbC9*xGm6chu;?fcuv)s01keHHZ1vXl5D;29O7wZBr zyPzyLZHKMtUI%PK+*X2zTFtaDzU1qn(H=hRRj-SoJw7I5i%4b0u=&InEAKgoae-lp zXk0SkjlJ52HruS*1QykTZ&aCN`PbcKuw$1st{peJ@&aF^aR@~{XA@L&YvK%+VU}G4 ze5iuesu&i6=*#nvHbm_v-ZLr5^Ij#|YSAper4XpsH;0x(2h1-tIobIy;0~2a( z!G($SB!iu#P;;hGeI~C`O=-3|d~zoB0!`*JrU-)Ko_X5#kSpy5o^z49RG;{j#l~45 zF?X9Ih4IdviT(8@+q|`BveLTprbESZ6^2I&ew|V3pDXRe9gSyXT)zzqKQ;gCD;p+( zM)2(;YJ%P5)X(N3ZSn>dn6UIcEcvQOXZBn}uD!7V0yXr$f+d@eTSYoquPit2S8cPW zA8t3dX)Cv{0cKF`@e|PP(xS0|z2_R0(P6)#+kC$0^5- z$7Hs|bOQanE z1oJ;uh(dYiDt}mVmtC3&HaGT6-dY429v#ySHJ7V)C8ow=PSmnEI)=b3_RJsU(S*+J zV$p3>RkK?DFvTc;(-T=h!1u~CP!pE=0eSSu#c@N7S0Z57CPg}!5z{QL#`2v?DJDt^ zCGN{0p-&&=)Sb28Xlo;ZXc^CGdwL9prf30uu$y5aPeWD6WIk4%%~DEhTiwOvy!rS% z&3z#DWo2qBA*=M2xIu=_R0sbrmP;Y?_rRa^k}3WYU6n9H^(})Zi-woMKKXfgbab@J zWx3DUr0MLpdDYk_LO8As}d*Z=x^K+uIv#T&SnY6&C$9 zBn1u`G#TBt+n5b%a;Cr0h^sm5Fl^OdxJ^8IebW);DWATq#Ba=#rggj*wNKy5NMzz& zBm`bk9bcSVPJbC`dHrI>o^=LSvTFpT`VAK`x_naOpvS~*l2$1vIk$avBA!|aeZ+7c z$_9Zzh>fc4$uX&w@-$VORCscG(B)OA@SPj>BNY3gxkkcPgNi9bE=?&3A4`3ekrdsb zn~`M;p8I>4?@@ZI{9Afv(tC@pp@Oe5BYUw-%&J_WaTBGls)&d8q?t$i<<@=_CNfH! z4H!ww7#gkp_^`bxZaJI9@C+A9x7@E1ZRoG5PL?w3GDi>`8Qq%I+0ygfT78%{Zt#mP zqX0CzaHKn@hAOQsv=^8UbfpuyFnT8Ht++Vmmx$~09!e{5t8fMkEjr~tfIxMlIpr4zGwvEIWKC2`Q#C)c7QF9wet?hE zLKoU?t@nqm=iBc` z8_((*(i(g}7z)3{%SJ!uya{?Ir-2^Fiap*VC4pF@N zpL5F*DG+(taLhdu4DbyAP(0&60n@%?G~hHugBI^-X6@_YOu}8UqwbQ8V`2vwDRLMz z)aRFo+r1f?5idT9xRF`cjgx$a-IpH3AH|bs$emw}d23*3aU0hYNh4(D0o-Z+wIX{d zeann?lzjgsAt62`er@<$`G755?i7tl%CHNgXp}#j>j&S1n5wZ;ofNbI>B2*4L1}@3 zq(LzPqn()w{KBsX!5*a&=dv<}t=R%II;TcQatbnKM7S4Q1PQIoT=^$#=>Y(m{mBYtl5W z6}|l4kxikOcJ`C3o{TSxIi?8|N6sH7Lkhq5qttl@uBTA|-cBluU$hU0&xYKvNidrL z4q>|j76}G1Db23Fa|XlFm%W&jW0h#7B$_FD-ZhqJ5#7i!0ZmCrereX z|Jlf`<1zR2akFe|boWv-r=}kM03o|%$mZA7Of2T99u~e56~6sh$P=yk9f!H6msn)n zvFOLF?W?iqi6fK9C)a42Sgt0kz4#M6 z-UY6451Er~=V;ITs1O-q*>}{;bs74MMZ(Z&=Z{5#q+i@cw^vI#0|Dh~-Dh-tn2I(S zTXXp-bLEG{p0#BbIqIcTM|DWZmr`&br8u)jQ`CR*^+g_fIX%=K+)x}F%Oak-Uh$6nIHUavnNV5M7YffU80QPRD%y>T{bIzn<6Rsy zb6cW6`?0EwSn;uJddPn@`?^Cry2s(6ccP1ykKr!kmDg2~zbTJq@+e(z5N>ZNr|8$j zPi-~ofp7E|Xx1#H+f@UR@AS}iLP!}}dRwf{u!avAq-_hNw#uaoOD{2jo*eRn8$~bDK`h1&ssOC6ekGV38+hU!KR z+kpnSzT;y#o|V2h|F?SY4-z1MFxz0;)@Lk`H>Cj zSl@fR%*@F79;HJcsX%L8_d!%TwmQyi$|n&C{oBMJ9~Xm!@@#lZdz(WB9SgJ#NIC%@ zy+~ZnI|4E`7f@W0Y9I@N7UTs1fTPD-ZiU%Lr2MnP+2h8AGh?(WGVf>h@W-_M>jRkD z(KNxvo(UJ7)o+*t%fCcM10;2XM$1NAFKwhp(c917^io_ynn-yv58IFIF*UJUw*2Ma zm?a-a1yp9B?WxpLzap-c^$HKkX_IfT_W8Lqaltl*A%vZSZWAe`Kv}vjz}>Tc;Hw9T zA+Nc49X&{WDmxY~ReV0YceXdL!$9mTL$Q@_vXIW6I{G=`$KR7jFcE&IsHwnKX;KldV#YL z(xwKAB5cFiz+r6m*5iJvo&E)XQqVWjmA}BfyVS&dm9&Y%$Sp^sW!JE3iI0v(kQHdo zmhWk|gC!e@CFKPv4BE*U;mYo0y}J0J-Fhu!c%v+paQf9+3Ed2EkfPt(D7|Ok#t)^PGr3Y)RGfvO=k;@Xry=Cf3fLCQ# zi`%oCt+vyB-t{iEgI&+2dczmnMXj>EOmSpMuuL8Ob`1$D;fc$wM6j2HH4Q$ zqaoj&M$2sLhpptdJMbs!krJId=iOd}HdP4Lt@yf42OZ{pOoQ4_gShz_sMoWYX}yQd zDQ8(tc7UvTt%`0#?9K!C^J>GpucEnBhnsWg102Z=uzOlwez^q^j7nV$krID#wC}A$ zcRfc2)T5Y~({6@1`{yL-Lzs;miT@C9|1SIFBMK7cz*E;v2H|EStZphjfb5mGMpw{q z!pl;Vw772tuvDH4o$;j4u8)@=m+&BIf4Ix(u75P?Q{4Y8^uvpq)mCW(enuQc)hx$B zOY{`_*%~bm%k*x6y;)D8_-yYbMsC8y#1H}89X;M=a#*HT>d*NFf}x$pQ&X?nFtvzA zKH|l8y;frsm|&}<%&*}Yu}Yn0M=Jy8qe%<1qXRR%Nut}Aqr+1pQS*D7Cp`+8Y`RO02p14DyVOmSYlEzZ;9&JzYhtybMZ%e4s zlks=V(+aJ!LK-()3ox`%9c)lx#3#y4{ulL6KpG|&>9`n?Uh#m3G-mZy-3h98Scyja zH^3Pb7?P z+2hAkyvg}g$#)n$Gs2fL19JNOZ|~>Nx(|}lmwesC!>?Y~72mpf4XZ8t^TIwbCk;i0 z+a2ymSZ^=OrtrSH!(y#Vn!8KWk#O7<1-!if+`dDDy18U7wS3k$lIeM}Z0fhYqI)+x zo*o4*S$S|hGf6vL>PaQ(OQ_%eskx-G-FV|dXHbTH<#w@RbeIx9I$d$xqHh`{*&d3y zevlYNk)}w@cuu4A$^DYJsOvO7VBaom@Rx@gb$V5IKJ{Xue16H-1H0j=U0brW-aVRG znWCQRkESBmD^4?a7mB@!jf2>(Hs=Bd-;XX1oEilevb9axB^NhIPLO>jl03S+Rw|fx z&oIsIk(~W!4$zzKF|uSR<@S#;{r;fKup)iDaxz_9JouroY>XHcrN(Mm@UHV?-8bCh zXGfY~7U`rCasv(h-R*ava)^ zF1`BMT*n3xQBTdM?`n&h2Ecf*XXuLo7Zyl_El(v~oh>}mK01$%0a@#uzyiX_g>Bav2XWwH%YekAxU%pBT!p*?%cS#zA zv;^eDC#KZP@7o=^GDc_V8<3w>`*L(+=A#(fcH)dGjqM}Vk_el+c>B`{9xm<>IZ-Zm zLL!-Yf*3nju_(8ZGUd9*K`iofWW+BYFnZF&+a|=yxqV?oUOcG#ulnSR$DMs|e5Tph%WW zVjzE3nMh7+rG!}av)+~;o$#+EHyPX zzOUO?^#)Jh*t^b7pTW+I%f;xy&JMPCO&5RR``BmHX-Mw{qoJp9BjKea$;A9%>-iEZ zvuUBm%0j5UWax~`ue!K6dDdip+zs3f{+qQKqH;9C(1Z@95()-Ew=`BdLh2VS3zI8qYGH&&7m9+vpUc+x8l!i-ATXKhw34XL2;ya_VIQz!OL^)8mtqnb?q=~&^h-$;Zn^HRZ2p(gH z39An;`AWT=i&VP0u&CUe7OYW51Icv=q%Vc7%Zm z_uAp9n}osEUdk2*pV)*i`WRSa-FWtCwGqS-75@K#V0)r;+0(0XVp9vnb7lWiMj!q= z>Zf(ioa@gSwA55Jil$lh)%4U<)$j@HTQU2KwuUUsZA*2O^QTKobak8g0Qb~ROMTW7 zfTF2yF*na6i(lQ*Nq^rPen^0>$$b`K!Kp{FVa-VF`kCiXZg0Vtr}i*rcpny_YOR!} z+?Jiv?dWlT`}o$s9Fxt%%684d7ek-q-Q~jS*I5+8HtvSw+Rp!D=+gVr!gqcYy9K74 z&eClx6f6{1Din;ynjz?XZlJ~W7^A@0wiHIt8$aou;f>MYpU%gUlDwAK*nX0#vHtyl z_C=B+ZkOffY|oR^2>(+IlZCTMFirZMhn>bqzR=38hvJpcM4-@gUYY7_k^G*FW9;5r zc9q4c>C?hd{uS3{MThN*(w!3e05e?bI#SNlo$U&%>((Dz0_JeqbG|}!wI$& z%q2JQ)Vas;i0RYqNXW!CC~QK%u$K$beGI zT2KuzMjus26(zmofK;m2gY%d*o~sHBKA#`RBNc9c*-GLmbgh?*9V;^TBSot2E%~Q5 zl+R!WA_h_JT;+irbJ#Z-tSy-;B^t&&dOSwPV(T!CB)no8Y4sP%k(MD^0P!NL1vK&7 z`3luW2$gkI#Zf>IZT2=m4R&e@d zeo#B=Q|9`w8}%|)f%GBjYO01&Dk5qjm$+#1yia#CE=Sh~88Vdp%|VU}0a6mF@JkhUY&~W3f#rHK-1Qdo z>0*z5?#-hQUY}k^X7~1bkI?($-~3#c3mF4Cl@2%|0@1=ARZ z^qlNaN63&>;O_~mmto}?tAhznb}p;GpyIq1Z^yf<_6Ui~cpbbP;uV7W!+ke>wYG-f zPPz2~%UgSs(>vsKFle%uo=WIDYz;BR!doAy)aQ0QCpE_Wz1XK+3Kpr=V_H8w zqzaizn9ALx#?fo-N)_CtENYH*1|ID|x=xa9d#;9~1Wgrcx^8=evrfky*Xj`269~A;kh^O|ewZnM}=SmM7NX=?h#jjLh&1kIT+A z)If4luYo@s+e_L&eRJ$gw1`)>u#efOq=M0iYIPS$GII0z`T56eNxK@~Y%*^~Q&w$1b)jM9Z~kuRc~YX`6r#ySCskW5cq|#a39s;ZiaL~OdEpgu z1k*sKkLZ&?6fAi=)77yKI1xii%)@DG8r}663xkJcwLTj?s`h{GP@_2}`A|;w7zrzk4QOQ*O$(e|M^<`vLD*1^i>Nr*= z+A`y@f{!zLi)ys9OrFM5`Qw0292Ciyq>zC>8(TkG1O;#UUh?#I08kuwpS_vhufJ0v&p^Yr`=^WG7!qVG(8n9u7=J64fr zQq7B|9rzl7s)I_|8UeVp?=cqGILQ}0O(n+^vJz=vFBU9JmG$=DWzi+qCHw@D0a7`M zA`%pmU8+8W{u0{2*^tg&3;I&i`4`{YJe_n8 z{viTJZL?$}#l9w${3mydrW>Z%nY!WXf$HJv5$Zw4F%7^mXWsZ-s&olv31;C*KlH)j z?j?Eika^cI`l>)WJ*ga?%>0HwJm{%<)OP8pdvwMG@fm;Ca`jfy7ixY-sic42*f&ld zJg3(O0~;=Zsp@cdUj@&Zj~#~LX=F5Ws@!Ik0-~(wlbJO6&)S~s6WrAW9lrQ%6+S03 z&P&xJ{;BC%2s%J#uxZy3=Fc}fkwE9(T}QAK9b{FT!L3^PQ~;#X$T|9v&JFq)ru$h|ls zvPxYyWT}V&Dol3#)t6pVE4nIClEq=r++eGcG-tkOW4{n$Ra~3z?`@_gXRUiR`SrhY4K z#>C+t>pNtm>!Zw*;p^qI0|g<)Ob`r0jaN6asw2ZGLT}bMbHnQ$OH8cR7{Rq?=4%&x z2Qe&O`w$~b%fuo>fkgT`PVx=uto@&SdDpIXL)<da|A*x(b?o zdUj^iN+B9%;2{1URo7=%m@r*RJi3fQNO_`AZY;b#tClm;A}NQF#!Y;pMMdh=^fO@9 z>J>Xv^joKJM>M7x=xh!oSLO3JlxVwTn$DPHdGsnkAvB)9d)IE6ZHgd1vd+Z;W1d682CBy4zti z&6;T6!rzSKIy&zKKfAx9J%7q-=Mac{u-_GIYEaZt*`h25Ne?ch`E_c2{pGA<;nVkx z102u6#||N$g5MhA{!rFwaI(;8$S{1DePGc^L~j6?Q$2QMIO09 zPdma#_kX(|;oOau(pX877ac9V4O8x3g{Mdbr6oS)7 zN0v#H_j!bhUNl;q>GrkeA~){;lCg@&Mg5(z%E1HV`d7{>_}@9JZ(VJn>=HKC4q{My zLpw8D2OD@&E}T?=SV7rE-XI?4H+E(aOI8sZOC$NW=!leE6MG6ycn2;fB4XpB!^#Z= zQ?P=-+!R0#4h{+c2LPbUF6{uZG&6i-ZDI+f;6P`8V{ZtxcA((p;6i6ds6r4x005m` z6k;m{H8U}FK+J;+syaZe)G2u2J;eI(G+`)^0+C~@0#BIzJLi_?-}e8NR15?I|34|k zx>2LneiYApj|7nW4k1sp9h-vz^G);Jq7ONB*clw!(IJ2QT3sYWS)>yb_Ual2Um3r5 zw706UJD48HLY73$&Gm=sl|EYND&Uk>VT!eN_p49f6HS<{TU>u{4&#WYh1dwy^E8il ziH`_=$2m8k)y$Q2yDZQluP+AZbND!Yi7Co@fwHnw2pV1bo*=wGx2n7Urt$y1@imz1&#&nK47Nw zT-dLY@^1NHY?5B#-Qf9?`lA_={@NnLpmwJGQG7&oU}0>) ziZ`GdjY(jIKi2Q?e+d=de}nq3pkP;ZG;lyf$Xh!{=x?qF#2$)p%>NM^W_I=tqNWf# zgv;e1fAtY=)-W@2FtyhKb8%3Bfj|mw00#vR4=)857d&XdU z(4fLD4>dA_AWjHkeJ)-u3LZ|NF1w_ijiW6*A6^xXD#Y5}7O{k(E4!#F{9rhl8A4Sg zMcAb&9N>rx39*a9v4(4~r$8jq|MLt0{*hTPYU2nu0sub&aQG~$!9>qU@%LGVw1{ZAdD5crj3WAdl2KV62-uIT7sX=aUZ*>8aV1F3(c z_P=p-FtxG!8!9*^U<3>RcoByeFaipAK|lhB5)AqaI)n^@hmeEwxOw0OKK@%C0pZ{C z5o^F{FbEE(DEt!$_$B<8DlYiaV7ME855ql#Py+_S#o(c8`L;d6lqRR~$cn(zq-4};(pf)4`xt=`PWS`7YO27?$MdgtpDP{`vCa4 z{2x3Z5bm@8-~oUj5Zv+q!Gl}N`CoDX0N4M*gTIpgb1nb?;)Y)s|FIqb0Ot6gw!m#h zTnhg~j+YZ2)c?r?0yzIm4hZ1=FTFrc;D6}=a`OJeW(PY6{AFi{I1;L6ZcsR+>?$@k z@FNVDLEL!K*2XpzfZwk|I3Y%%Lm?mm76XGtKw?0k2(JV$kO#;s#>p!o!6gRf5#f;l j@(7{-|3%=32kuUL2Z)`+Z(jm{U>-0!Ev>ks1p5C2Hj`#V literal 0 HcmV?d00001 diff --git a/llama_stack/providers/tests/memory/test_vector_store.py b/llama_stack/providers/tests/memory/test_vector_store.py new file mode 100644 index 000000000..4afd65dbf --- /dev/null +++ b/llama_stack/providers/tests/memory/test_vector_store.py @@ -0,0 +1,80 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import base64 +import mimetypes +import os +from pathlib import Path + +import pytest +from pytest_httpx import HTTPXMock + +from llama_stack.apis.memory.memory import MemoryBankDocument, URL +from llama_stack.providers.utils.memory.vector_store import content_from_doc + +DUMMY_PDF_PATH = Path(os.path.abspath(__file__)).parent / "fixtures" / "dummy.pdf" + + +def read_file(file_path: str) -> bytes: + with open(file_path, "rb") as file: + return file.read() + + +def data_url_from_file(file_path: str) -> str: + with open(file_path, "rb") as file: + file_content = file.read() + + base64_content = base64.b64encode(file_content).decode("utf-8") + mime_type, _ = mimetypes.guess_type(file_path) + + data_url = f"data:{mime_type};base64,{base64_content}" + + return data_url + + +# Requires pytest-httpx - pip install pytest-httpx +class TestVectorStore: + @pytest.mark.asyncio + async def test_returns_content_from_pdf_data_uri(self): + data_uri = data_url_from_file(DUMMY_PDF_PATH) + doc = MemoryBankDocument( + document_id="dummy", + content=data_uri, + mime_type="application/pdf", + metadata={}, + ) + content = await content_from_doc(doc) + assert content == "Dummy PDF file" + + @pytest.mark.asyncio + async def test_downloads_pdf_and_returns_content(self, httpx_mock: HTTPXMock): + url = "https://example.com/dummy.pdf" + httpx_mock.add_response(url=url, content=read_file(DUMMY_PDF_PATH)) + doc = MemoryBankDocument( + document_id="dummy", + content=url, + mime_type="application/pdf", + metadata={}, + ) + content = await content_from_doc(doc) + assert content == "Dummy PDF file" + + @pytest.mark.asyncio + async def test_downloads_pdf_and_returns_content_with_url_object( + self, httpx_mock: HTTPXMock + ): + url = "https://example.com/dummy.pdf" + httpx_mock.add_response(url=url, content=read_file(DUMMY_PDF_PATH)) + doc = MemoryBankDocument( + document_id="dummy", + content=URL( + uri=url, + ), + mime_type="application/pdf", + metadata={}, + ) + content = await content_from_doc(doc) + assert content == "Dummy PDF file" diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py index 48cb8a99d..eb83aa671 100644 --- a/llama_stack/providers/utils/memory/vector_store.py +++ b/llama_stack/providers/utils/memory/vector_store.py @@ -45,6 +45,13 @@ def get_embedding_model(model: str) -> "SentenceTransformer": return loaded_model +def parse_pdf(data: bytes) -> str: + # For PDF and DOC/DOCX files, we can't reliably convert to string + pdf_bytes = io.BytesIO(data) + pdf_reader = PdfReader(pdf_bytes) + return "\n".join([page.extract_text() for page in pdf_reader.pages]) + + def parse_data_url(data_url: str): data_url_pattern = re.compile( r"^" @@ -88,10 +95,7 @@ def content_from_data(data_url: str) -> str: return data.decode(encoding) elif mime_type == "application/pdf": - # For PDF and DOC/DOCX files, we can't reliably convert to string) - pdf_bytes = io.BytesIO(data) - pdf_reader = PdfReader(pdf_bytes) - return "\n".join([page.extract_text() for page in pdf_reader.pages]) + return parse_pdf(data) else: log.error("Could not extract content from data_url properly.") @@ -105,6 +109,9 @@ async def content_from_doc(doc: MemoryBankDocument) -> str: else: async with httpx.AsyncClient() as client: r = await client.get(doc.content.uri) + if doc.mime_type == "application/pdf": + return parse_pdf(r.content) + else: return r.text pattern = re.compile("^(https?://|file://|data:)") @@ -114,6 +121,9 @@ async def content_from_doc(doc: MemoryBankDocument) -> str: else: async with httpx.AsyncClient() as client: r = await client.get(doc.content) + if doc.mime_type == "application/pdf": + return parse_pdf(r.content) + else: return r.text return interleaved_text_media_as_str(doc.content) From acdf4cd73a3fcd87d4d326ae24bcd36c773c2168 Mon Sep 17 00:00:00 2001 From: Aidan Do Date: Tue, 10 Dec 2024 11:05:50 +0000 Subject: [PATCH 2/2] Use actual http request for test --- .../providers/tests/memory/test_vector_store.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/llama_stack/providers/tests/memory/test_vector_store.py b/llama_stack/providers/tests/memory/test_vector_store.py index 4afd65dbf..1ad7abf0c 100644 --- a/llama_stack/providers/tests/memory/test_vector_store.py +++ b/llama_stack/providers/tests/memory/test_vector_store.py @@ -10,7 +10,6 @@ from pathlib import Path import pytest -from pytest_httpx import HTTPXMock from llama_stack.apis.memory.memory import MemoryBankDocument, URL from llama_stack.providers.utils.memory.vector_store import content_from_doc @@ -35,7 +34,6 @@ def data_url_from_file(file_path: str) -> str: return data_url -# Requires pytest-httpx - pip install pytest-httpx class TestVectorStore: @pytest.mark.asyncio async def test_returns_content_from_pdf_data_uri(self): @@ -50,9 +48,9 @@ async def test_returns_content_from_pdf_data_uri(self): assert content == "Dummy PDF file" @pytest.mark.asyncio - async def test_downloads_pdf_and_returns_content(self, httpx_mock: HTTPXMock): - url = "https://example.com/dummy.pdf" - httpx_mock.add_response(url=url, content=read_file(DUMMY_PDF_PATH)) + async def test_downloads_pdf_and_returns_content(self): + # Using GitHub to host the PDF file + url = "https://raw.githubusercontent.com/meta-llama/llama-stack/da035d69cfca915318eaf485770a467ca3c2a238/llama_stack/providers/tests/memory/fixtures/dummy.pdf" doc = MemoryBankDocument( document_id="dummy", content=url, @@ -63,11 +61,9 @@ async def test_downloads_pdf_and_returns_content(self, httpx_mock: HTTPXMock): assert content == "Dummy PDF file" @pytest.mark.asyncio - async def test_downloads_pdf_and_returns_content_with_url_object( - self, httpx_mock: HTTPXMock - ): - url = "https://example.com/dummy.pdf" - httpx_mock.add_response(url=url, content=read_file(DUMMY_PDF_PATH)) + async def test_downloads_pdf_and_returns_content_with_url_object(self): + # Using GitHub to host the PDF file + url = "https://raw.githubusercontent.com/meta-llama/llama-stack/da035d69cfca915318eaf485770a467ca3c2a238/llama_stack/providers/tests/memory/fixtures/dummy.pdf" doc = MemoryBankDocument( document_id="dummy", content=URL(