From 202fad85ec4240fc995e736e6a9a98b75a6abc5d Mon Sep 17 00:00:00 2001 From: eust-w Date: Mon, 22 Sep 2025 10:13:11 +0000 Subject: [PATCH] :memo: tem push --- __pycache__/baidu_vdb_backend.cpython-310.pyc | Bin 12503 -> 12423 bytes .../faiss_vector_store.cpython-310.pyc | Bin 0 -> 4790 bytes ...multimodal_retrieval_faiss.cpython-310.pyc | Bin 0 -> 10939 bytes ...multimodal_retrieval_local.cpython-310.pyc | Bin 0 -> 17857 bytes .../multimodal_retrieval_vdb.cpython-310.pyc | Bin 13400 -> 15085 bytes .../optimized_file_handler.cpython-310.pyc | Bin 9361 -> 10059 bytes __pycache__/proxy_utils.cpython-310.pyc | Bin 0 -> 1947 bytes app_log.txt | 78 ++ baidu_vdb_backend.py | 26 +- faiss_index_local.index | Bin 0 -> 45 bytes faiss_index_local_metadata.json | 1 + faiss_index_test.index | Bin 0 -> 45 bytes faiss_index_test_metadata.json | 1 + faiss_vector_store.py | 147 +++ local_faiss_index.index | Bin 0 -> 45 bytes local_faiss_index_metadata.json | 1 + local_file_handler.py | 135 +++ model_download_guide.md | 108 ++ multimodal_retrieval_faiss.py | 370 +++++++ multimodal_retrieval_local.py | 607 +++++++++++ multimodal_retrieval_vdb.py | 150 ++- nohup.out | 49 + optimized_file_handler.py | 95 +- templates/local_index.html | 995 ++++++++++++++++++ test_faiss_local.log | 0 test_faiss_simple.py | 58 + test_faiss_with_proxy.py | 164 +++ test_fixes.py | 79 ++ test_local_model.py | 98 ++ test_local_retrieval.py | 229 ++++ web_app.log | 63 ++ web_app_local.py | 466 ++++++++ web_app_vdb.py | 51 + 33 files changed, 3897 insertions(+), 74 deletions(-) create mode 100644 __pycache__/faiss_vector_store.cpython-310.pyc create mode 100644 __pycache__/multimodal_retrieval_faiss.cpython-310.pyc create mode 100644 __pycache__/multimodal_retrieval_local.cpython-310.pyc create mode 100644 __pycache__/proxy_utils.cpython-310.pyc create mode 100644 app_log.txt create mode 100644 faiss_index_local.index create mode 100644 faiss_index_local_metadata.json create mode 100644 faiss_index_test.index create mode 100644 faiss_index_test_metadata.json create mode 100644 faiss_vector_store.py create mode 100644 local_faiss_index.index create mode 100644 local_faiss_index_metadata.json create mode 100644 local_file_handler.py create mode 100644 model_download_guide.md create mode 100644 multimodal_retrieval_faiss.py create mode 100644 multimodal_retrieval_local.py create mode 100644 nohup.out create mode 100644 templates/local_index.html create mode 100644 test_faiss_local.log create mode 100644 test_faiss_simple.py create mode 100644 test_faiss_with_proxy.py create mode 100644 test_fixes.py create mode 100644 test_local_model.py create mode 100644 test_local_retrieval.py create mode 100644 web_app.log create mode 100644 web_app_local.py diff --git a/__pycache__/baidu_vdb_backend.cpython-310.pyc b/__pycache__/baidu_vdb_backend.cpython-310.pyc index 2c09ce01c3ba44a1658b3acb8f0fe5d0f05c2695..4b40d8958fac992fd9521d552ec2ec3cbec56542 100644 GIT binary patch delta 1459 zcmZ{kYiv|S6vyw}yWKvPrTb{RWlP^(+9mB4(`rmWMU73Uv@TjCNX+)yd%JAkPPcZu zZmCpji_$7NDq0|j;A5#;VWVGc6akIagkUskZld996BEKW6Jv1xv&sjfyU8#AIdkUB znKNhZPx`Fx9CkVj6!H7?*Gv1&bIyC$H(V5>^Dc^f6xckN8Elxh^ttA=Y{ODPezT;_)R9>wrJqS%zUXQG=alNwW!`C`^Naw8}f^OktJXC5&qZ ztrqU9XF{e`%ibF^2aHiGH#TIL33D{9@%ymS!Of9@SdUJFu7_C>U2yeLv%AdxgqY(! z6n0nVw+gAM9_FH?yV733^8R$$>6E)ecwcvSZhr`#VPKbl}kDxG1e%wlOJE1wjzZTx^>P~+`z9Rg%Yo}nK%-1ERejHiKT0c^*g z17?5(@H{XJ%mGf|5Fk(af>4z5Z4AB!Q#Ku_7lFgTJg@*{H>ZRCQtEPhY1LosxDNZP z0?AqmmzJC_Yf-C_-rCm!X=bTrZ#u_X_xGL2MiG_iD&NdHX{vmR9ip4%HQSJ)mD4*g ztjBqa&>W4y<=JlCLU0{#gjE)O5$YIlh90j7*k6Y1q`8U~wlT9((XWPNjgG^70yqhr z24orEhLXp52kKqgUER#yqnYZ~Vp$p&tg=37sIzpXy3~+`T*CN&b9DZ9j#%?P@B#24 zATve=RzbF{hjh^*%i>x1NUtfew~5C!8lo>MDl^AxEXK~#cwHr1p`&%%*^`-{>lkBi zkrnuaEo6>tYGE7j;rMrtUKX~K^hNiiT>Y7GN zjXGN-XH$*JyXm&3-`G{E2-W)KMlz_pY1y{fptLX49+DR=tG0m9vS2SkEdxh@72r<5 zqVGd>Y?*!!-S4?33|3wwGN{LfB9UMNzd{|&KRB+!PridOYTObEjADEazWGrzzH>Ml ZH97wlMtQv@P}jtHcIEo>S_+u$=3mLMi?oH?t+Ju-k7A9gC5Z@*$z(%vzhDdQ%lWNhNL zNGr;k{tG@Oe8x=^rlahr67iG2u&KmkIEHz0Fu}*-{rZ^U98U0bGM!8f(Q}0%SF6a< zupH65(<#27@>V)kSYfFU##^hjTzD|3LYf2?*j=B-heij<e?pqOhP^U;i}&s-o}=r>oXrAG+fOKx|i?Li?`)x$ir%N?>>nVnhxa^=(W?x&ca zmfWo^?eOgY_6Zm+Q9-XB-$*o`)Q<6|arX?+2|NoVfG(gL(1AnLUn<1rkz1NA^2 zU;_q#K_CoR0L+pP0S+JqXJNXXKwGdlcqjJjxOoG36L?EZI6#@8&+$8sC_7XAYOz0eH5gMXWj2_b z*eF%+S;#XAZcnu*sSjwP5$3Y3tc@DWCfRAaQdZS;N;n&NJp-vk`mQO;QZ~CDDc}uw z>^*4rvP+9lOEghgtRY=~1-P?v_ay85|77vaRbUzT7?7zW6~n&q3m8g> z)yT;XyQ}<`Cz+y*MCovOD7UxDU~Gx%YeH;=o~U`0jpSx)7-JvM$=c7@E4k*n7%TZ2 zLGM}4RbiW_x%w=-MD}pI^;>a{FuqD%;fL8G&4+hwm{fj_*2A4_oSGu{J@hsry$i@y z@<7R|KY{uj@Bv=|-vD@rUk7ebU39P^!Oqi{4OQwbMn5f7*lSQZKyKv<)K1_hIv%ZI%XBgNxMy7$ z3~M|-m`D!A4`f&PxJIo9C!GLWGk8>6+yAU}n#tyfR)P4)KboPQ6a9Iqd0UDVT& MUuuonnfaf80oo3EW&i*H diff --git a/__pycache__/faiss_vector_store.cpython-310.pyc b/__pycache__/faiss_vector_store.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f974137569bb3bafc465c99a02415df8a86dba5 GIT binary patch literal 4790 zcma)9TXP)66`r2k&aPInEXx-e2iD+FY|6-7QV9`;g2b3|ic%g#fNVl-hV7BO@?NB# zS&UVyia{b)68QqLoq(|;QQ_hTuq&Zr$1*sTH~v5#`Zb!B^ptlhgnXxGcchg?D!bJ^ zeL3CJea`otbEewanbPokwCnT3Z}w=~KdH0!XQ1;0ivJucu5nhQ#X=yBCo9LK^P{n#Sgp7vX+2vTD|rQyT>I-l=Lr=5Z>T(t&Xm*nUEJ{S7S9|u zxy9{iD`)Wpcc!(R&67NZXM%U|G@cIc&?t^a8_n-3$K~)T)c(P-dM)Ji9_g-1ix-?CuR69TB28U&m7t3D7 zhmHzls@9L%I?4}F{0C6gSzXI(bsaV1)Ygj{YNO6Bz-Sj~+^lPJ*1QH&kx`7=Su9Is zn!j-7V)(&t!rAi&pFA`)6rOow<;3amqu+2&`+bqwC#uz8U%BkP zv@f#6%gQ);pRChA28`yhM={9flaTk*sBFh(^iQ+2G10f)qWvxT0)uSpM{N^DS!@)I zKpSOqu*;|c3$cs(JQMc>=A6ZwV>H6Fz@AI2m4O`{*fFW>Xbx#=g#n4w1ub6DLH&6` zfx{t_wQ76t!tmE~%b$L_bn0s3onJLgG;=+qjeKGEEB)4bH(W4~sqLhR z_C^1?IUAX9ATyXvOTFq#Q-Rm^UIE{IrF5AnRYyjOl@VzcE5lW3DP_wPjKm`w2~w@> zitN`iYGm||L&T1QB{z8Hq03BkLr_^Ss^Hf&u?ZEOO&O?H?_s?#PY<3G_ug)xjUduy zE#j7gqOA2S8mrcVo$d?3rBQ=3u5;t0aeC7v3#>W2&Xmp*&%^ndF!q!Iu0j6~YX0t{ zuqFL}Lj1#6(=K%C$S&dQQy^92;LRf60YCBbM z6`C}wV_rp?L=$P!wL~X|FY8K2k`1p{(=N;s7<^(>7u3_m?*7XBYT~Z1QBuneU$31X zVxs`QW)yy>sgEtthdxTEH0z8j7k$!X8uXHg^^#C}xwB0#7|S}f9oTU3)3tIISzWKDQ^*Dn-{(k28~0>m3}4~deV1l?JhyjC7l3KI9y zAUS|l2myM&`?44No^N5+w^8-z+n}V4RMaNlrXr%dg`xm-01YSuG_Ns{2_o9l2vHx- zd0uDe&6z}M0um?HMRNu@0dm;NX`5@56A?Cib2iq}pdi-PwxDWGUi(v1kuz5_*@PmP zAd<+AqoUaIjge2hAH#xFFZ!suld5(4NvZ29|F93!3PLI9#$h{{`Q^kNw?SxCUM3>1 z>dRycQNFZ=7t}<1U`t?Q(z(*xG#Y>wn_{T-DR#sV=>RaBVz1GNHj1%HBQQWmqi#f! z+qEegC4wqS&Lub;sdA-G-JD_vv==*nrcYXvHiD(qr*;XFb$g2K*T%E3LQ1(-mBZA{ z1KO0zU~2!sQQ`>IxV8Y&HiP6E1H=MxW1~>n)I=-#f%QuZ?GZ9o5oE6XHN0_l>Fw#* z%HhoKmgiTaat6Qnb<4NTh8HfiN9O3Tw1W!jhYqaUp!Q8%t$8M(DSr3Lxm$?J;jPzK z<`Y;UOviw1it3@uF~JU@K|6esFBelS!{qD|-T2gZMUHEePeHjS4rEWQhk+ zNyiUd5#)bJr@`R*J9Vr%>vyo+r)br^$zjBt9;4gnWscFMPwe_%ms}xN zBGcnFPaJDrT(To=86o-e*x~#g9T0#VjwX^LFv&2C5U@HU%5eRNF1FR#oI(D2UY|s! zFbemB5sRCMVQ@&Gh%sr7Xp>f8V;y{WAE=<{vlrA+{kq+!O(yCIZXMCY_r~`IXoGrA z9U01~^Rgyhpl99bQ~AndGDy`8+K1Lnv3io*`?b0?*HKRrJ;(Rft!REOJ+IE2P1HWe zMPhUVg^M?DHjaPLIP(r1Ts$5(He%aaoeg9fe|T^C%DWL0;$=jkBLkU;JWC6|ZCrW> zq(lPTCJ}cwBH4Y>vWQxHGt1X5F1^1H^|JB8!4--MD_`L!9!5)0S2`S+bSnif+Zj3N zM`+aY54%8QLU>Wb_B~e=4vWWV-J?_~gi^4VW}%8S)l%dHmdB700MkL0z@AM-@O_Gw z93*LcQAF?aWdgpuSn$xmYigyym#iqBB+2iSB>Xdtu0ynV26NA&_;;g90m)rPn(Z)5 zAUTa#pfg8LA+6|TDW)Iq)+e^#hV1ArZC->_L3CGspK_cge3?j6Ym_1&AdJrz;?MG_ zYvJkhkr^XX4Ju`3^@!DWC8!2&X~QViJ>msT4&@8gS|#`eHdL-=n?AAowzIC6mZ_NP zet}Nh`jHPbP}B`5DdKEWuT#9)G>Pa$w*V!!(6eqlhIq#H?`o6!xQP}!Vv5~}JxpZ) zI*6&_el&wyo3&T5VGN{jW^U!JKQ?Ahgde|s=s@h+C*!l^-;HY{bb|p zmB#F=OBX(AoO&@>+@7>3n{?=9Z!vMN`CxN+2jNQ=ch@_A!jT|A2||;ggh&ihMcflaK54#Gt(L@3slS@0sh0X2qCARD?Dwk% zg3M+o<=Bp;Crn-cPnV&8X`88k>wC;Esjs8I6aB9HcAA3L!WpU0jXh{f3t@5WnD{x` tQTk{jT*4;n4@>bUI;=`0X`hV_C!PI~vK}Ya`;?zM8d=0)yHI-c{{kdSS3dv% literal 0 HcmV?d00001 diff --git a/__pycache__/multimodal_retrieval_faiss.cpython-310.pyc b/__pycache__/multimodal_retrieval_faiss.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..593e6445fed7875aec1a4d3e82acc465417974c2 GIT binary patch literal 10939 zcmdT~X>c4>e(!5~W?G|>EZ-N5jR6l}c{z8ZCB)_oyC7U^L$Z@4v*UR!dE~kLy4zr7 zv=}5`;FAT`U>hXX3${7Tax9K*;B!9YYd-X+RIPg?)mDmBQP~fKBftN9b4ixKNu?^4 zp6dU+`}n{A{kqiNZYg;F<-UK~@~0mt${#7y`sm24!|N!jqA(?;FqLU}HKmGwt)QoL zmCE$IQ3$0%BCX}kR2XGOK2nILqM|I6w+d~kwn8iwE3~KDMVXn87dlcMh0auGVNPmJ zp)1u@n46kgn3tMY8)trMzL;l0Y61Sk`Gti=sYSKA#i_-jZb@nh{v-LNg=MK_h2^Q` zwfd`4SD`$bzq*h}B~-;;Jg@@qN@g8UDK~nJy=qKNUHgE-+E{EvVX%`AYTFSas*q)!*(M z|M2zk4=#*fc+IN5aJIVt>FP^+s)vsQ{t?w89US`38vQXaMZs)VK`qn3g z%sRZzhoB}~0Tn4Jjj1U;szr^c$!=j`+h7qp#G zpUL8^W5h^BMcYo)n5ShHF-J?jFTz%{B>F9?^;<&gzrvn3up7~H zF~$t{Y3wGp2D@Csy4hM_Y$^KzyBYt>n6Xhw-tI4Y*vq@QLWyPaKeJt)v$tpRf)%`L zaI#mx8egt@q-+;A+8LhRvS$5*Yab~)YaV`hja}GmGnOm%ui1X{njhZ1sdtj7#c%5) z>PqK|%zo0hFs+?W7c&Lhk9m%r20Ntt%bw$h0n<6=`0a&UF%^(UjS6pijr0;AwZ-?wTf2JE5;b48*=&>qyYX6`gcQ>5J{c`8rqs} zcr?QFih4{rp^ZVZh`dn=)$=R_teM{JAb2ZLdxE_uf;XQF_P;c<=%_bqv|rErot{RH zs+)M|Nsy>`^||2if$=xro6@KiL1=dLY;gFEOP_o;{=%v1zMa+M`+`Gfk$zyqqf>g- z5Qz@^_XO|0B0E6Lp`5DpIMDDx$ zt9uWBesO1-Y(Tc@}hiTR7u`BH|Z<#}X^EM2b4Up|Lc9NRABV%j0W=$rXcf4|LrBUkJzkwkXv ze4m_j5zV}qz>@kJPAhyB(qFD#%S$D9ZJ}UqUR!7|Uz*psFD+Q4y9^PY2BYNMbUJ~4 z&H@0l-Bjax1V2sJR85V1(;1ql4z8GuiM#6~WDWGLKa?hpA#1?-Gk~j%s;)Ai5ohoy zQ;Xn-F?BC^XRk7#4j4?`sv#FbF2r~2RYuKYq8QV#z5~j)i)G2PA@%dwe77 zv5c>0%brK)RegV~!CaDWs(W8TYsn0k&K;Qe!v`J_#{+kRID_|gOZvQL!=v|SAtf4* zh7=)wE?$R-q8}pY@#v6RAtWo?6|Pm(Bic3?CxT9(cH^1tnOr`zId2Q82nDWn|G?M# zbL_OrNj!^&{92@Z>&`sP0aE05ZJ-%wx+Tisk7uWhDGbqlXTFrpD@Y<-akXOK7Phv1cc<$*<#{3YtDZ9ltC3EuY z(a?|OoOEORzTxgD+kTitAOxB}H{EZ$&1!)iosyRWjqSdav^MydhoW4Pv?hD66H)kWn!z>edin zUeP2os>-%#1+M!H@61PIo(INP1&W|=)N*WcI7D&DoEl5KG4B|{3)gaC~MM|iA*BA|v%4aGB0AApV1VHuN zuJOaW8!X+MG@E$g@Sn*;Xw~Ipl_#t6UaH3A%AoJSSu#vUYSBRcz z)N?f|$SoB*qDzbGan;ZbwNss^M!t@RyHpE;DXtCPGCMuZX2U>?+E2Qd$sZz9oo4(L zz=i4Ts>2$ORMdWTh;$S55)5Qe3DXZ4Lz+JHd zpgIJzU?2>IZ|>4q?Y~c7tc*s+l+oxIB-!66%rqbbhhdx-BR4C~Jx_po==XEH6@%z? zAXb4RNdbTZI!QlJrwY1_z^Q*6HE=CqT#!T4=FP_@#xK4!J}Nx)<8NO&cV3!@n|!V8 z!o?1kGr}?PEth-6Y=*pKKQ3i++J!w@;?5NABizvWlji<{s0({j9&h`FHiUh{(a=W8 zL{Z1oUpKY)X^lUOmg1OC>-+|!eRZp^Zf`kr5{J;+3@*FS+xw9Jg2?15W$;I{o~~WY z!fV1ppLAQ4c4LrUp=|j&40Boo?>w1v{K!_@F2mM#q+t<<8|CDRj+-fFZC`g>&hJNa zKa_KAv>~0~hl{0jKhH4!Q_9WBXB;=ZCC8XugzcMgZB8oE4;8%v>{P8-hAo^!tfAjG ziX~p)KS6C>FZ5DNGgmBoE*!g(=ZZuJZ6Mw766MmGx)>t2xzAyY;EI6wt` z1o{E|PO;4IAi-FT1Vy_S&_&Q-3^W)QG}xtf)@ktCD?$kjT@bzFM+>%_VTb@aS&}XF zhm;V>nS1a$F9B3k$LJ@?hM=0K@VO*`M?j@6e1>8CYLZvvP^Jy5TXlZ5t1)9(eTKpw z2xN>w0=uvkg(Vq>%rb^T2!mSOz>&p7r6s? zD{>K+g++R&Fb_#+IyT~)FFFB)BWDb$Iiu62jhTVW_5yb`81<3cB_@-9| zUE4fC1kUWDGM!#%^#`X+8J&GNx6vs21{}dY_t%L zGc_yp8FAmgQn?2(_`~m{5s8o)nUC06R(}uok>*M2g7N&&6FKH0*s;aV^>1EdV|;Zu1p*5p0@1X}YFfhP&U~w{pk^Im$&*nS$W0daf%jF?25OyfC z99iO10+nqvFp7c~*Q z(IJaGn2NPfS*lc`OuJL5grWbU7EITq{&7?AK_LHcYuw;M&fccCiVn1 z_S)|z&b}^%Xrnc0pshWLiO*gLj=eg5;n&qyk3)|kVm+nBbWBi(J=F^*A(RnEpVG9E znOU8QMay|+)Re*gUBThEpcyV*yf``bk09P)?{2K9G3+={jpyJ0$`3hNx&kzXq?cmd zbi`ecpg>(_=nW%?ArS)E$cL@FXXH54+9Kh=8(hYqHJZ8jQwC@Pd)n={rpOx_VFb17Y z9+FkBoIC_K)yYG4ZeaWW9eMB(V1(}?w3t#FC8%IG0U?+6P^y(Ggs35g7gWLbQi|MN z{w#s#2ndn1pHc@1OcO~R&BGF?oDW{*htP0v^{jGdT9+wC9J)8-g%&c9C%Cd0gH@MQ zy)G2gBB1(fmiW6B5M~eHxry);9;8eHRo`g{zGaEFA4{MfoMUDhSomtr1E!lrA9W zg-V!?P%Yl1;oX34)mIN!UmXede1a2!`9D7PE_OBX(Ma|D;fdGZs=o7EluYhv&$(==j*wm%dbO9TvqT~DJ8Z;VM87zkOe^ehGjU~*f5aCMfkk8w~h)vp_iwe3damd(+D2A$O@-`CiTN5wx-)Nfk&61T1 zmPDcvGqGsZ9BrQQwTbdBUHjTJtgo~ijjtoE`I|J=_)7Fdi@2eE6%SeesjZ|^gT`wq zjb9xxVj)iJpzSsu;$+&L;Fbv_C~TQmhUj0^P%4451tl(f0wN=P`wu@k!JYp{0uXK}d92J5Jp6p)ww1|k)cb+S1K z9fjR!B`i#m*6#&iK^_jkU>hA8A zvCx`m<(TRQUkMDmgc>YXq{8aV* zmxDt`E}eUJ{KVMAoBO=$X$h|#49-0hoIGBA8UFh_=rsA((dzFm1}EPP4!!}~Wa5)I zsvkY=-GR1~M?VO5?W-O=7@T>9M*ryJ>V?x74x-@F`SX`9yp1`6-S1aFI)i@Im!A(# zo; zTvFo}g+weu2d`0;5J5sTMd^Y?d}qYV)HEU@!Su0@C5=VOOY5L%6A0PU5J3JMqCxqg zMB3TmAZ~p>N$Z<1UIcfx5fvXuh(Lg*nqNo4k*G!2C%U2H>I8=2ZAeGdn8x1_3xF8L zg-<%2T+NTtTD#@4=u*17SVVZchzV=C?sDy`J#nGa5_24-p0rEclHzjJ=|ddc<_;&9 zM_eCqIxpYE?G;^3)2T{%P3P1il1W1QTFLQ^0S6ZoxUF!v^zO5gx$nRO&HfNk%_MTF)FNe_mgT41Z z;zx0E4~g$u_=_|ai3xemgshp=fgib3sHEQ+!(nLm+RmD7sXog7XSbN literal 0 HcmV?d00001 diff --git a/__pycache__/multimodal_retrieval_local.cpython-310.pyc b/__pycache__/multimodal_retrieval_local.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8093d61621f5af9ac55e89d486c2137bccd78271 GIT binary patch literal 17857 zcmdsfYj7M@mS$yEX4X@sk}S*e15+3S)-<*;O#^LMnpd0ggpnESZcl5Qtu5=OX0V*@4HGZ;zn*GkXRasS%Z0Lynvnxm4%)F12_nv#t`=kqPZP5V!{?|YK{jS+30)gMrL-St{ z5BKBe{4EL*=nn`@=)+pS##g;Rh-+}z7%}@zjoOT1Yb4Yk8VUD@MR)xt`B$gcQCxp{!U!2;YA}|{av-T#r=!9t-HUQuS@!u z@U^GE$6lOUir+F3I-+R-dvSTWy<%GHzfOehj@(KSxi|1gKtx4se?Y`0_5SOHzAexj zFG?&Wdil~PGcTU39)G8D{A~5)bmjFQ&%C<7a^h9oS0|pS{(NHQ{WoWRb7AJf8`0{E zXR3#vslI%$I&~c7@6gQmm17^{9;IKe?on2b9h-Ukh=M}UFCL9v!CM1;y_#ok9?1;Z zy+JSXK+!EcS`hZI*G9LG$-;o`I0dO5w-4AV9zzzrcb)oeKBYGqlF(PKf68DObZ2-Hbh}f1LS_6SYyXNObKF^vA?v(T!_dED=4p zwuz;7!Zy&`SQ;4A@usBRo=alfGO--vI>ZXJcjh|L1Ld4P*NK(rvnXPSe?U~h4=@#cOWYt4R;kPehgl$xvOf@PzmO>0idHOrRJ zO4_l&Yx*>J4xU3o-G%G*liH6kx*RF#rC`aJ4r&2BMW+LNjn%Gky7K7Y9B4+#tV0(j z@NE_E(ZH`#wZF=t*D7!SxN`XAg?UxIo8SKdIpl2gpQyQkwT-D}{x|#KnaaoeTe>zQ z6`g;2rZV-5+WtN_Q9b@#<>(nO(MKNpUdyN&@-g7>!OH1Z)c|l;^h<4ixN_)~+3Axl z16nqTCN)txdsM-$JbR$>?oZb|@?dYvh{ir{Ol^8#^R{i^)|HDdfiGXa@WGsg6qjH_ z6^mCbm(Lvm%*yeT8&ky_&^CMK!^(j}m(Trp_R=NQuX5uH~g6@gX{ zy^P+9>Od#6AN;1+fsT(nh@)LO`>ZBIDqc zm~vomK>|Wi|*WKm?hqB_7CTVWmQQnL@FM97V z$YG!`GJ4f8kna656wtu zUcKs|Mr}+<>K#mv6&Vddz{&D*79PZn+W_l7)EX7b>!>+x8_xI%w=0G!qJ788bRHDHvZ zwsf=Lt#P;DLJ?6)g=b|(NA0}mH5GQ5MH22jE$c=6p6VG$?Ji+Su5$E@^70nI`?sbgibRkYeL*8113S zh6YE>J?KW})JEx97?Zl(!sAReE*G0K(o!Q^>f%oYnJgXSmmeyC#HE|lT z143sTNXlKd@3a;eN6pwGv@REI?uwGuw1VrJYe;Vu;Mq@WPijvFdJ{}TmFLS1t>~{0 zS5EB5F{%9QWHE)Us5Lg((3Pex(0fR9_*U)Adlz73&#kS#{89BcFKtYD?O8I58!8E_ zZEPy1d^N=fL3B2!tq$l4k%4DqU{Lo#s>GPHrUkRlXpNjlGNVNzJ*AV)TzJ27{uh`S z@yVJ3*tCw?Di2Lno_~Wixmp*-Oy$ITv+uvPG1a@08LuP|N^c^r=Cu#X!U)y`jVP1N z+k$vP&8nBg^1O(@Hc9fw3yQ4tLc20f#&xB9fS{3PBfm)naVjq~2qn(laA)o2o40$x zVLQ)^-ZNRX^K`8FHtK7TY35l2h5TT4$cydD3SsBd4x9>JbT3G^Ak!i{;zhpu)PT(% z1bH0|3p%d!;*Wy~W=Cw+Fi+{s-#pEp%gR<^fI~l;ey6VM2}q6ZU^kSz9>@wKXlO~T zOACLIuo93J@nENxM4v8Vc@z2{#%{klOSqb{+QJqH%|2V$i@gU)KtB@NYx=mRLqUEBf~X{P@jn?jpc$ zr`d~y(r;-Lvy14ybG3DL@H*RD*V$MP^>3_*o_dqY$f;YWO-9*g);c!VR0pP-nHSDN z?^HG$=x#aoP(3u$8mUQ#4^*apRg9qh^0{9sYGp9Zz3U`c>_oT9p<|T`=c_NDRXx_N zTh~`4g>+6mVArYsvjk+hn-Li|vo7bry{5^}%%VHE_U==f7q;^Q1#(6(Z>SuYtiE)% zdgS!gr&)b%diM0$%B7c>ea#tApJ{Iwn^AHpCMIcik}%L>tv)bKb)Ggjm*Q_kuVeT*ccO?xKTK*m#QQQBa$QhjmJwZl(P=_=48~EfqdlRm z_;Mxq!YXR}()j(@>Z?IE!zFD<8z*xEG!6$b zEH)85VvOr-qZz^^8;xv2veTemvKn+W#TJ<*qM$ncg!Y0q4myM2w}f>-7q#nPaA}|% zo`y?j8cHXq#cH9khni`uWVq2>EEg}qO^P{0js!_jb=yenituh-K7kfEPGLWiGc8sm zeSGHP%QIzmDj$F6^11WsoNxE^Q5Rm~&}fF;VP4dg#r!~q9KBvbsnKcI&ck2qv~Wbi z4OzsC@cWeH5pk3!Vm|~w90zWI1+qfH_kPka$iUyXq9;p@Q$hI++vws@e!<0UD8!LTqQMRac za4EV*j+EEyxgD)J->InRBp>CLpldacgim6Fsu2dx>UYp5ButiU5jh%~s)Lk(2`EsJFuSrg0W~KSG#WkYy!g6l72f934`LVC{K`XA|Zgb+{X9a7jUj zSqFI)1#QN0jp5pcYaCZv7muSl(Y8A%%M2^QBid^{ldthAO2KkMz*9n0aBpBwVM-gj z%WcQ1lA;6Hy2v9tX%i{P~c;bkj>l`=4;*yco#?5@3dv(`))%%Q_B~$)AuxEnO z348w){X$~N{8jJZH8YeRNMp0yH4UCr*9MqhEfveaugvn|8ox5R_i~0s_*mh>hn4GY zjNLP-1R*EBlTQPO$WqHyG_jGBcA=ur7+IN zElR&>=|i*qMXjdsmJU^Ot(ichvZk~>Pr5gVHDoG54y^X&Kx6}O>-_s>Nx<{yEroKc z>N^W<@@8n0H%@8$zJ#8I;OH2caR?H*t4on4T({834lr}*k*miePe+JW)E#M(rH~uW z-ITkz6rx~~Sj(~|RHNT-?AE7rc{Awk7PKP-H3W`7uBd)pDcYD-DazDLr)qukY)c`k z=lSaUzF(Vf56}0uuRGu7+4kPvBZuO-@a1}nbd5b?ae-R73OPSaF#@Ua5QaSJzp@n!WT+ z^~jr8Zwe#X4lRX;`L3+bNP@ zUI?mh7dOLW9JluN`^|pxV&&NDm(M?o9RkK{=v!d>6^^G~@H<_3@$|;jRaGdt2QySU zu%tUOcIRkCaxd!sH`PChV(i;rr3^7P7Se{;tJ1o@*-xfsrw=QaL-mD|@V>C?`caCR z`X{LkujQJ`d#_g~-iA9ulh32BNID=l5IPQ?_Xo_WuMdOInVm4K=OB7IS}4FQNGl~s zJ`G@fS4iV4l76XnpRKOTmyQ!HD@x%w7l(qYuax22!J$&qS97?0?%kSWwz>NP4B+Ee z5QeL~@CJoD877hSr8j4u|6pOVZlay)D<5FBS6=)X{PUz*ZkJEdtjUi#7rS(5Dowm& z{;RG!>cp!)$C*EhB4(}cs$Be0<=i>Uh|Xm1nufW@o*((IP%8({dimWZ6*Lv0<^w*5*8RYpPmu!A7{M7Q#Xz-s>!7d~^3!Xv256{?b?{j2{ znn+ico2ekumg7|Xmt968WN;1Cak5nr!_eb!On1U& zjlwof;=W7kVRgM*Bc(lx_(n3=6$Cs4D#^}`LyeCIS7QwQTcsPCF%j1FiIuB+#@omV}6@G!82?lcLdMyT{gz+nu+yYHV#r8ZSQIdEnA z!pyOAm8rvW3%!}l=U$yJ%-4!>R3J41&U1wVRzu(>t8YfjX3j$bImsCbWKS~^EVy-X z<|muu$p{$}>#3tc&x(a`v9pegF1Rb@T^OL4>`&?LSt>GA?4;sF6kZT`krVVtj0Lei zhoT5_ifRNE%D8B^W`%A=@TGN|oQwL*ubn}Ssk6r3wD#I`M1kpWh82kJHD zKj170c%aoNVi?Sn*l&S4#gw+9zX{e*^Ja*MYQb9#W;O8B5zAtzF{(jg6?^b06r2kS zr1kN9b}<|dh1KR0$VGC|anp^tNUtiH>J2s4rtIA|O1lz?Mw@8wp*#!qW?Dy*-r-{a z5X)d{@Z0dGT7ncV27dc@Ys6M36={R|O zEIIa)><-{QNG+v^=(-mrSmz?Qs}v^IiB&Cz9q-R!M_|u1?6dBDpTate)`EPd=w{n@ z2}S{1^%b!1-WZQQ9o;i6_{y;eS{LmkTl!*YjTIxI7OdE!J*wG&-LP42s=cL?vAf)d z&*^W3XSP!rHtatC9la}xcj7$z+);6Xny&cAvvNMe;M2ZEwNO6U{|$EDL@baB$|U0q zIp1f7b>FI@iQHA?D&s#=#E{!;%4si0mkPb#o^M0}yXY-JWV?>DesKlmXjUykTx14x3!L@YC7TC;WBHW5#ufB;UmFCieC7)<;s#Bnh7h?7Msm zH)D5QlSiq=Ehu{kR`X(%*nxEoF+6+l{nXl29W!HXx2AmQ1Pg)qr)s;l*kXCq{EB9u zLy^L$&5dfGYF#+?3;Zk76Lu#yV&=mBB1weiK{GGD0STxgQkx$}dQcGaX2XO{lIE_p zomIJlht4*I(VDCuC22{Y?hUH2vH4_HAPd&oWoL(Wxg1j}u9-87+Ci zV829TF@<6QJgTE#GrW42G}&BiUVV$;vLe8tG*$%Opt_$?k)guZ2IQ~kae@lc2IMh=%GV5JtErMu%)yi3JtD#$QWY5@cJ3wq?m`YZOL zjRguU+@U^3#|n+C97_wMgr*Q&NvTA*PtA*mALyLpmfn1rg273bna5@OhUn{Q1;(Q*e)I@w?0rK%H{hRr!K2HX!{u^)2o_SLZ z_IndQ-n}tZBi9Q*slI+(vCftrO-rD@2dfuOD8C6&;L$TzCZ4NIT-=yiMJn;CTT`o; z$>651khj-R&Lj_M%~$qcu&`~wtDE5o`O>A?>`-xVsM(h5oOsNdKOdn!9wFE z1=b585jcZP!g?_4161Ir8L{3e`9HCe1(4oz73+bS6Jdlp!V#we@woWMs zH%NNVMm*zVjB#>MAgvUdH{eZbrML(|<8DI~A91*P{gS|P)rL>YN+Ia}Y-3<%aBY-y zq@q3tn*`Y~fJ!+r(PC!{D|_y3a^?PUtgJz-{-I~(;Qzrx?@zfA^S|;X{;rT-sdO4j zBJ;umj!Fvl58(-a|MNSZ=?oO4-P_(IlUab1oc2O+50Vek3*ti(&r%0i$!C`BY^9S_ zMS8gn59mMvNdP9N(aC#IjgK%^Jvgj0Mc;y6AOANe3HtZ zC(HgRFH%dXgQag!Zi0NBpth2vBntj5PK9#^ZsHvV9l);9qHQ2f2 z0}x?kts$CDSu%Jp<;frkR>Y1NoVIHC5mm}w+tCtFr7Sx{{2Ozl&(U~xleO7V)>;TD ztyh^H^>56tmf)(i;#xi#Ko8%Fch=Rdi?Gz*R<4ONZk}1Q)311B{$& zuwaZJPql>~g5Jk1FkzEZ_f*KK*>8+5duLBiU;bpe`plV@>0q%qR#-89pRPjj3?}kK zoIYkL2Si|6NjhEV-$(Y!9!NI4#85FDa(=R1*9VW}zs5t&BPom2FbBbrdxz}2{nV(u zZ!KQQdX_$s3XFa0s`O3y|KN;nvzxXK9XKb78A-l6<#k?dt+WK9Fo{aiVSr)(cVt!8)looj<>~IvC3Tjjt;ZmF900 zIlfMxThuGSO*-(p8VmaH*Q#w}YI|>_$zJt|T@Ii##{m@v1wH-)6>F)OgR7YOu7Z44e>rjKMR&@R+E z9ZVkXOOOwJ537M_;IswN=$zaDg9owNDt}L^Z~zO3Enys5UR(*UkpG4Xmew`(muex7 zA!p%QR~`S3`piKgeX2oY)g8%Pd@TGOOsM+$RQ2`!m4laHBEo2$nLZ8KG5a_BtLLX?-~3he z-Jhf8%E9NWFMeG4=!4mVzpOleQVAD@e-#+c0FZJNs{aO|a^N>tUU|3j+>!YMisU^} zFGb|N8I+ϟFdK6I2fP(k3EjyE6R?NrB}7*^SK(Bqu#45jf&)^Iw#o%WJwLWin^ zAPJ`)C$#t<5%D~z|T zrs>1kBEDBld+~I74^+ci3(cu8kWMSlkFgU;6Os%hXoY!K-a^1rRNO%YFMbu>-ADz= zJ$5R2p}gR5xr#*5A(L$KQD8DlSlQi3-vw6$|(+-TgNzen-V+DjFE% zyPK&liGoxF=LCvyz|eGNp3FdhkMc`=$bkrx{)KMnpX+Ay_siCFUl)#qt?-)E^3@%> z10NE4KtqKuDf+p78yN5Bh=g-Z6k{xm(f^!k2@;^ApZdr)o3?rp zNSH1B<+iL)Z^E<}vh#bh5>k5vdYLQNr%gp{MS9cWrQ(;2!9DP%OT|C2@KQRMXD<3k^v+FL$4a(eXi zVB+Kt68%S#j}C#>D#@Ra`8@ zgduTQfSt1N7HM(1WL+%u&W;J2EULU&qH|Q6YGi zLVP8f*hhM1rHm}%MvBIbz~shOq|kB=-KKcTQwc|ZhY`IO9)nU5!Q zd@b(on=n&QlC74Qi>o?Xg_M!Csp;iHzgsQUVHNHt>2*jM08MIED=u(~wYQADn0>w1 zAk9X>n%kGOw63`2_M4aA-s-z$#fqD5XjyK!R;}=@TDvq|XP;(IU-vPdBM3^uUDiGV zt|%~Zi`Xh0C5E+m!OGB5eiDv~hS=C-kkEv~VJM5$P(TkvI=imz0X-kXp%C(f-udj_ zh7z{ZRc%?}mD#hd)p5h!5efJ^d>ca@LCqHlcl3BAx*DYEO$aw5+zJrIMVJ2~M;L?c$$^K^&{U=Ayu*C7l7L^AGNaKe& zT<$ns3MGoC*QBw0tTcYwS3%oBY`*~@S^+RNnM~D}O=%iRW!N=4FTOOGj+iZ85x6c$ zQ^dftCbBsg4)q4<*K%1@&T?}aua3@!QMb}aAgE~(YMPs12o~Yu*>j_h9MRLtVeEA| zO>9d}A(_i|=6GhJX~VX$BjVSa=hGQbqw^5(TA;{-dJt+5s@b`mlDI|FsaK{LA!!SM zksS`|-4VLkXNKu4b65iN@779~+4BEhfu zDF2qS%oe~nv485ayF@n%Lu8*sSQ!%@la(;~f8qQBFez3@Ddo=3T~^SLsug^WGNL_Q zcYf}MUYv+I6|-Jez*QC8BmPTq%ET=ezy*T?dn?h$M-IAOVjV8wQ43uGKL*)skgZ3F zxnP(3)!*6Mc{h+HtSEmvX<+m6ua;-R2)cmL{3@9*@Ez<>e$6yoDT;n>Ly-(RLRu)S>HcsaNY^sCv|gTr zeY4s8(&_8NSVTXgoe12=B3SA|*n&V2Gz1-C%;gL#9EgZ!ShaS4SCCdir`K(+WDh%5 zTL0aZ&|)|Pk?__a)%=((&^xe6!X}e;6g6%|=*AvfC>#z_U(m=7c5Vo&DkKy>HPp#H z)m#a4wb`t)V)7YVS$6I^7qEv|E~q3bP^V6YM3rY?Z9=O6H4EJDQjB0YR;2+8&$X<5 zmNvMwbgL@s5^J1N5x+Vn#;ovIiiI6wsSqYA>9h6LK#m~CjdmY{Qm5 zM@$;B#~i4Q+pxu~U`4*46boR-K4#G|U^Op`Emd$;IC^^EqEO&Aa=|DCatXz}B<5mD z3D*$mQ?@Hxs?1-!hPd)9P{;2R)oBulNti4$aq4ReTcWs^kjO!{me_j&q6wyO2X{im z$xP%j^Ee2#iTjSQH>NhTPo_pHvS<}9KMyH9knp%-dI)xGWA{!gZ|0lMcR0)3VcbZ{ z01SnDx=(}fXxRK69gvi1bS3-!v{}s7zJ<#}K~H4$oK>{}Jl`I?DS&$jCS6 z8S3=61$|qC0hqN$522(J<+0GA3K!rozo$yUdiVO8sf z<)J?Y(cUVEUd8N0`NE}mUa#!#v9Zhp+!tKGjSLf2Duon~JF%5Ll^nY^As;S^y z9TT|9RrBN)`V1Ig*HnEql}J@t4X2XwdXozBc^2UW!bybZ*omsCg(Fy;$dP?eRhIL@ z1bP-vU~xN0Uzr}pj&Hb$p429u@r)7sho@Yfe2>t<>2ZY2eR>HSjv-7WhiyMan4Dl& zb!pC96A13FUb}b`4Edi4PTaXP!Ee`$5!7qS<5wW~Gn{ee?p>MS3n>2LSF(f-qeu(- zX`tO_9%$c&9$x`3elUUj)Y|T;56EN6=A}#!c4wCJz^3U;kCh>O|GY!)z??C!&lkM`2FipMb#1_t$f>u z$S~Z&UWwgRKkYj$JU@s!!AfRyfm9o>AH&ra)P0bn=pjuH1+3W6;jjHr^ID>LAe=n4WAy0D$p`OA_CILe zDlPm~B6FU2tC~^J6y|@DK<_i1nBmTq zX0r9FuCg}NdHGJzH=!+G+~2USe3D+n;qwKrj5wyS1v7K)sZmGSZ8PV@*Fv8mZHPoVD6Y3*359j)fD)}o zz&~avzWwpH1&Z0GVGFB%O8q@_8@AgJ&;sS*<_RnvK|tDumA}Xyh1~&V& z9!(E+`aW9b6IAK2kJMR~{-QfoL8nwfFbu{Y;<3(Q?y+!;}$9l>kiKnvO$ QW|>1$`D@K4_nF(yONUkfkzA7D-Jp}$cId4nQc?OHO~re0$# z3Lln4)n%xT*OFdPOFvRt=`#kOly1cFM4-wYUxM6SFc8>0>OqPubF8Rhf>0RIm?;b~ zO@XMq+ z?w|LB)~vv<+rvYVU#0eF&>Z0MTWi$AReOBFFFHkaY`^d;+@=eg}MOpZ21X%^Lu&GdP$ z@d&c_;?LM7JR%69sER(1>IsRGC^3=Ij~?}NNp-&^=KHVYBieUYlqE-^^lnfJvBk{y zqSo0;y&rZ~x76>Mo@JNmxpZFBlcs*mOzumW>A|y;&sRU|Q8CXp$)cYg#(mZi$wRk0&fVE!GG*rpZNbp#@u&WGNG@=)*Q6k1Lj9`7G~* zr|7q&aj)ekZO$5sCF3f}Nfs7x9q-JFzVou%Ck}I|iMRk(nK#A*sC1E!xg%6s34}V&kz|G1c`8w6vTh5l^ZVD+{ zBLvOxT%;-Cq{hiZiJJ!sH~%hztvZF7G_nQV_NIo1lUdE+eeiLlqkT5!Geu^X7-pWk zC2gQL!oR8Vt4d%)^N2ErB--{`^B+b24BU)H0{#4Zw9sjSgYd8D*GIokG7H;#ilmwm z>c`}1fFj-LeYc8;u)8q*wR=bTVBhz-#Jtc zEwNMH~OuoczV!n7}I`RID!##I?M&6`8ed4Z%#NqJ9(K+ z&OP(@9e4EXL`RGu&Vx^v?`Zob5?|4UQ>{xK7yR>Fc(b(|K5LDXf6)c#h}Gi0L35|7 z%r%#6{wcxB1V1DAIpkw?l^2Pe?+1*>YW^8H#huW?#r)8{1t))9RhXb693pt9#*eB)B8&l#)EQBCDQCK zt3fmG)ZmL};@j9&7>LK${+gy-TVUQj!NYk@@Gju`d=jM#?YIk@;ji%s>bV<_FMSo6 z)3(g|ImZ^*+EJst?97MZjz@9Y>mBjRE41=!^OlFtJ8DZV&MdV84s|StMV&Hi?yL=| zD2$sLoAHD|7cn^1S^u?|=r*cgMmLjqf|^;w%%+VwC-Tgu4BP%>QqP$>PbP0L@OSN8^d`#V zU6H2YMI{Qku2{&qdHPWDSk_D?-(qmFtJ^mv_!IbBSJ&tk6t|^)`TQ_fi1cK0Cif%7 zTL|cX3-@{%A!;vz?agT^&Qr(TL(EHj3LkfN?vIE%PvAqaW#=Z!sXDn+UQ`?0$@FiC z>iaF>?+9Kepo}_M;S?sv8U7Yg?-0zEkefpI^Xj(dm~XMqU)>#vMhBuT_>{%*iIzvD zI6hT)&-6)?P2C$ATMk-JHq=UrVgzjjE8wG^S{8@@^gPV^;Qlqy8YgCD=|MexAZZ*o zOnoGo&F#;>!=Sh(!rq@8U-KRd{ek8>sxn0Fg%fMHR6dBLEe#p@ob4UXXEJn^?tzcj z_Ex@2LUjZIg7*n-L87;+oGK76V_2q!dw_0*-Mz8JZsOLB(el}3S%)FKhu^+VO>tyQYFNW8V`Lusp4Y9D%@dU-^B=mV-mQR)BB*{s(Kk-FCT zJePm|`Df<;zklYPCq61in~_LRg5L+HetT*B=ue^_5ptG$q}VP|jp`d@O1BJpfSQ&` z16F_rt>COqLo~dht&^z`jnL?ZG!>>XD>5t7I8AJ*Q&Fe|p_Zg6sKsa+#>K3-m9PR< za#qeud1+Rq8JdOO)NOi)+B65<>DzSYX#sjOblA#TfxHCjvjggjbOa=ZXo-%(^gFsH zfGn6lMt5##Q#r9H?ENwW51Vus-3==8y>1+=(mix9v=-zR(VxHMac%uCdpGu3*Y9BH1oGAu@>eE-Z1G zup^Gv(Os1)oAR|tOLt*IT!Q_R5|vfhvvq|?4fBX(-?;*<`1c}w4Xq9XO;<(HwbQP4 z0U9SIS7DbOZ9dQm;v{TsEelei?u0mb*e^SgPLzTn?@8~=wim)7u2-lGhXEf<^7Fn)ezBJMeEsGb&@x^#R>+Eke2caPl3ZLd-BROkL%@oO9__8R%3C&#*-JP$#O!lN+~O1NuQ<7C9C0SE3D>N z{0;rd2Te8#J!}^Os%3jn&_ETHS?UEuVX5j|^2n@5EVct>1dQ>|_3X%GNlw9)GMP`-VOK zB{oHBcEw&qgHYWR&xTH?!XSs7Dhz{jm)1VyjnGEhiOfekF^Gq_tAm4St_-dSG0-v` z#WkqDj?rMcW+&k)T?bXSMT6BJ4X*CW)NqonvQDT8;{(_AR=`QQ5)H1)>|K~A&EBP< z-dt_zeYFKl)5*As8+7GOr4@2RAQ*H5G<-sGfz9(0l%ppkp(X}ueh6x^KF$w8U;HY> zo?~`$G=b*Rq;CL}Qt$gA4rYNdCQV;gCxOWzzKm55oL>uy05dxQn&BUY3cM4Jku3jl z_=&y3bv#mG`|u33+9EKRr!QKIO}2VN_moPFvUB{d$e|NKpXJa~Sq6mHqC(g{>s zW4dWq7OYiIS!Q*=1_hP{SzeEv7`mZ)hB$as35{$L;R63CvQNpgGyIdtfv1FHqMv%2 z#aNSh3OKZfzKFg%Dul)BYmkD*eukDrVd+hTSvh16{MUbkpD zRa$kbo?2_nHcR0?|3ag@urRSpyzCj&CIaFjmW0JevGg*+3jkgam(47Dx$X=`%@F@X zbVz>%r0fg)pV5)~3J|~|cr}^GM28RX(|9lrIY5O>5GHXsN{TZ4M4$Ot)Zj;Br!q+P zA^3z}zX93pZL&7T-xvTIOc(Xt5D@Cx%MyMe9{S0 zLF5j2DAmCCV=coqfUHR;vMF`_U*G2ckM{fj+OQJ^n%6hUwRTv9O>9%-1G;_C%l(5U zrN&r3(`x}pNj-g7hIAm>k*$U4fpxik zZBHu*6|K7t2rF1e@V4vt!FH3jCJ;IAr~SQ-)Jb9bYK2`D;yvhw=hG0=8Vy6VN1O~q zdt|*WvyVlbeoUi%+pe~v*xDHas)z#3#M~IjMcoLELA1wc9HKo!6A8HUFOTi&!rNH?}CtxD_7;T999YnUr3D|&uMXSJt#^+RaTKl)Gy;1xJDFuJfnVg;7{>; zZ+?)vgOtZ{YH?M-g|b%>+5n~Q9YLgG!QTV+rTXlHYcQl{=6E*`Wqw2^o<${i$R(4V z$Fdkdfu-jWE+8PA3McTw4(w&+76-os?Qom8h}|0qLU4(fh7Rh!kMNg2%3UTe-27GUBq=aZaR#n8yHOQh36~N6e*Mic{?~kt zKTvpP2xF-f+)8+C8D)?2?-g=qw+;xc7A{>feAC7Dd<$FG5YYR;yO(FscfN4BCtN)hU?V@bv~y9u1P4G#~U+ z6J(gXAfbifse|_>sS^RdRN>jD*?SGSFd!XlAtTy%Wvkk{3b?F4KkXh;$Yqyort4L zu)PSdT3}WFx6v2;qsR<8!Pih=fCKZ`RzeU&n8VTm1i^;*jOb%S;Xhk%fkW6U{6Dar z*Y50k(M1&a3PSH#zvxT9+fj@k$Fn>Lg0h?@VI|G~y<>trz{kgaz3?Co$Cr%A66I~K zz4b~Fb{|2h0UPhfwucc0h$nIDangI3O!C&w(*`NSJH9-@f3x#$@)ZBi&XM*`91XW8 zxL!iYLd4=0dg^quS@)g&5v(hgci2Q@aAi!*}Ge@#KgY!A|l& z?_yKiH424Zy!WQMRBcd5wiX@&T5Bpg&_BBl#nKM;9r)*a(hQc+VLYSWoSUmP=GY~^ fx%;gW#}2XaXeE0O0Y?w4gpKj%_Z(BsCFTDCxLqU? delta 4167 zcma)8YitzP6`niqeee3c>-Pi0Qotk?0jh=w67y5>y)&-yYL-o~ z(dh12vOJl75XP_X`@P2)l^k0QGQhF&rziQL8R)Bs*P0j!4G=QoyA@oP-|4P@(L}e?c6fb9)di59(rG8{ zTG7UuX}8rPcAw^;NqcDTN?ILDuDO89D($0Nz;KH8TPA$tR=N#ltyJf|%JfT<1P`(L zhhrI@m*KJ7032yqa-}SBWbvyws`%BYG%VdFnY7LyDf`1|;;FWkpSmrv6n{Pj#CZz6)i16??DP`2V`F=sWOk)q%wSasXe7j^OL@Kdmo( z5zg-Z&wu*hdmle|kN;5Lp%c~)OZC2>}p6OV#^HaIJh#01{XBsD zT3iVeSq_pro#q6&*ad5ubjVZlIolB)us&E~sKB-&Y(vX1gd zD`Og!3|Fc|T*Wn9c~vQyu4#`#0;-TaAVCc$>MF~Gny?Q<2CNahWt0MtzOk9O8=#>Z zkg$$fNzgFbqY+;oM$r#6aT~yj&^2l7mNqI4H)3KLOO>_EHY))d=YNl+NrJaVkL~FK zpEJd1ZgzayvIl0Z*#cY2dL-*0g3#nEYmnke&PyiFg5Mf)Vb6>r}a zC#s?o^^vOTL{?=Y6ZjRV|2rCNli^RyWnw;zn*0wvojjcAD@VXW2p$pPKVpP`k0e!* zA(TK=NvWc&N+kt?rMe20Z;+cG0d0|rMg>thx~o!kS-u%7X|4t|MJnd1^q7HEh|{t{ zq&f3BV8BHPGyb24r`YNQDxh0lB{#z*-PJ3>Rai#a+Msl_R%uk4+X8_PE!hxw1B@h{ zFi@JvLTb3iCQ37|8MXFsEC>$Pkzt&u6RVWttI{$lnUI_&BwV1H(3qTwN}a2^fmOK_ zbfii>l3yw0nvl>zH#E~gLvFx@It=L?fpiXoM&nGA8>Uf_&f!Wkjfr%Qqcqq~RFfGj z2+a_jTwVE=J_^NZ{i`9nB{;0#Z2ayjnTMACT>bJo@#Xd61yB9*jo+GXSU?uK|I1%} z_U?y|`Si{EzxbGcTHku46Sf#uO;28&$gn}2b^#Q_)e9tukR8T#%+D%qV&1Z2xOg1l z6hGgvvmY~(jR2%|CNg;Ky7+0F!TkaLPQ%V}%Q{DTrahm}A;o%HzOZO9_8iK+h;RE2 zMyDg2uciVHF+s_fQRrEB(+l`a3&x+krgs~=BX5DWJq94l4Kjb&*tILH#Yt2ND*mlh zYi??)=BCO-{m-^fqdV`MY3e6?@QV5+1ifVz!q)-P)!MN}DH~|wN6jtdlRI~tKO`eZ zQ3C^(HrZEjEMoIj9GyTAu|Z~ICjq=L#=|dtKuo7#395~V={HbD2%hD?P3{mh#76i( zllz7?#=z69xyb^x*d^2$M_4;^PamJ3x8`WgCy97sDAOT76gIz>YT2`Zdg8@C?Fzpp zmK@8jp~feQ=Dk!axywIGT_rUVJ$ zmW67;kSfr!QgszehTwhJ3q7M`>|G?~^_o#7d+c0AkLyA!fOH8ZA-XhHB}N0dI#%tE zjq3y&$|~IpsjD^g8HbXA?FO_ZPZo|f`XmPL>K=(>?yxnUWYc`5`eG9(>UGaA-1jT1%%pmq`R+) ze2MSt8#Lw22y1|CzS6h5d*cH-cX!R+Co(oT>;Vgx3K)O%TKdpLyQEUayDUc7 z=rinr@N4Zp08>(-lQI=kmbE5gCZeHexI^uDp<{1PplzVPr72#Q5*lnDzqy6QHyh-M z{>D8=@#xV>6MHhkz74~4Cwm)y{B${wqiz7t$QP!kb92+|B7e32Tj@7(LmW7I#cm>? P_7jft*w+2Z?mGFu_6dRx diff --git a/__pycache__/proxy_utils.cpython-310.pyc b/__pycache__/proxy_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38cffb91efc2a3ad455a4d21c9f974b0c2f53f5f GIT binary patch literal 1947 zcmZ`(&2Jk;6rY*>@cJWd8kImwp++2#55y@#Ay@$cg%B=jgo~gSXykY{iG%HRW@keZ z*-}LVNG%`^2vjLZjXwv<@zFc~!d!6@dg7i#;k{YMI00rgZ{K_KX6AkT-UI^! zDFUNC@!R6v1R=kp&_8StUV<4;0TD#d8Y#tpTB0Dc8mn6+ixQC*wr|ff;RtubDLEn` zybV%vUne3dQX531uCr1?&?3oY?!#GRCP%e*xv8o_qg*>Yx2ROJkk4Oy{+ZE>;}bAP zpBpb+oERG$)5)d`R@ZnWYZZz+yTgLN2lEola0N(!;D}n%iceWlIU}b^pn?iSUZVCMva7paj3GM)Lv5C8IW6p31`b zW9Qz%uRGIMuFU3<6gxkEd+^1_X33D30;?#!6KN4ed#PIr2jVs-ho;P%52NG71$33!T7YtA40YFGdRs)yBenx#IK1 zXig_9K_gV<25dYtJ;kTrdWH8nIMk;q9y$F+y%|XLnv{XeSrTEV6JdO(JOk1;kq^V~ za3n8-K;`Roe=grmpsLow(dL>y75a+*6R!j2VHQX#k)rr{mP;99)O)~Ok9zbJ%}~(% zB}c%h0<$+T83-=}tt>EtzDnLpx4XbvA#wnr)*>&EWhNM4+KPt2Y=8Ho`?tP}iy32K z+m`5ubdf-V5s8qU$uXHmAp->Zzq3oLdK%NKj znCBn^O1nM#EHFO7J}ogpOO{9p%eQ@Jgg~F#!ht?@g)0)!rvQ7AguiE^N$bok_S~hQ zq5M@9Jq-tSc5ifcZ|;Bf$-&MRfErK=_&oUXPUol3JG*!Hzy2`}I1YZsRItKe-_bxG zr^)FcFfGU*CsoKP2M>#6n_^l_138X&Z~{8;yec@hPc^< zO0PfE@BZkK&9UbLeL;KU&loT*vP9%?x6)M6C3q%UbPK#(vRm+|EIZ0Uo9mQm z!nwqHYb#VzVk5~XksxUL{4&MC=97;Bc?xFSZ^Wi&sb^=BMxUe7f~A}nXJP<&?LzqQ n0GdnWC}>ROO?#R>|8CS#vp#OEf&r@$i^{~+_u54nE>o?_fp literal 0 HcmV?d00001 diff --git a/app_log.txt b/app_log.txt new file mode 100644 index 0000000..2f676af --- /dev/null +++ b/app_log.txt @@ -0,0 +1,78 @@ +nohup: ignoring input +INFO:baidu_bos_manager:✅ BOS连接测试成功 +INFO:baidu_bos_manager:✅ BOS客户端初始化成功: dmtyz-demo +INFO:mongodb_manager:✅ MongoDB连接成功: mmeb +INFO:mongodb_manager:✅ MongoDB索引创建完成 +INFO:__main__:初始化多模态检索系统... +INFO:multimodal_retrieval_local:使用GPU: [0, 1] +INFO:multimodal_retrieval_local:加载本地模型和处理器: /root/models/Ops-MM-embedding-v1-7B +The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release. +You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0. +INFO:multimodal_retrieval_local:Processor类型: +INFO:multimodal_retrieval_local:Processor方法: ['__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_auto_class', '_check_special_mm_tokens', '_create_repo', '_get_arguments_from_pretrained', '_get_files_timestamps', '_get_num_multimodal_tokens', '_merge_kwargs', '_upload_modified_files', 'apply_chat_template', 'attributes', 'audio_tokenizer', 'batch_decode', 'chat_template', 'check_argument_for_proper_class', 'decode', 'feature_extractor_class', 'from_args_and_dict', 'from_pretrained', 'get_possibly_dynamic_module', 'get_processor_dict', 'image_processor', 'image_processor_class', 'image_token', 'image_token_id', 'model_input_names', 'optional_attributes', 'optional_call_args', 'post_process_image_text_to_text', 'push_to_hub', 'register_for_auto_class', 'save_pretrained', 'to_dict', 'to_json_file', 'to_json_string', 'tokenizer', 'tokenizer_class', 'validate_init_kwargs', 'video_processor', 'video_processor_class', 'video_token', 'video_token_id'] +INFO:multimodal_retrieval_local:Image processor类型: +INFO:multimodal_retrieval_local:Image processor方法: ['__backends', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slotnames__', '__str__', '__subclasshook__', '__weakref__', '_auto_class', '_create_repo', '_further_process_kwargs', '_fuse_mean_std_and_rescale_factor', '_get_files_timestamps', '_prepare_image_like_inputs', '_prepare_images_structure', '_preprocess', '_preprocess_image_like_inputs', '_process_image', '_processor_class', '_set_processor_class', '_upload_modified_files', '_valid_kwargs_names', '_validate_preprocess_kwargs', 'center_crop', 'compile_friendly_resize', 'convert_to_rgb', 'crop_size', 'data_format', 'default_to_square', 'device', 'disable_grouping', 'do_center_crop', 'do_convert_rgb', 'do_normalize', 'do_rescale', 'do_resize', 'fetch_images', 'filter_out_unused_kwargs', 'from_dict', 'from_json_file', 'from_pretrained', 'get_image_processor_dict', 'get_number_of_image_patches', 'image_mean', 'image_processor_type', 'image_std', 'input_data_format', 'max_pixels', 'merge_size', 'min_pixels', 'model_input_names', 'normalize', 'patch_size', 'preprocess', 'push_to_hub', 'register_for_auto_class', 'resample', 'rescale', 'rescale_and_normalize', 'rescale_factor', 'resize', 'return_tensors', 'save_pretrained', 'size', 'temporal_patch_size', 'to_dict', 'to_json_file', 'to_json_string', 'unused_kwargs', 'valid_kwargs'] + Loading checkpoint shards: 0%| | 0/4 [00:00 +INFO:multimodal_retrieval_local:encode_image: 图像列表,长度: 1 +INFO:multimodal_retrieval_local:encode_image: 处理图像输入 +INFO:multimodal_retrieval_local:encode_image: 图像 0 格式: JPEG, 模式: RGB, 大小: (939, 940) +INFO:multimodal_retrieval_local:encode_image: 使用image_processor处理图像 +INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:40] "GET / HTTP/1.1" 200 - +INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:41] "GET /api/system_info HTTP/1.1" 200 - +INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:41] "GET /api/system_info HTTP/1.1" 200 - +INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:42] "GET /favicon.ico HTTP/1.1" 404 - +INFO:multimodal_retrieval_local:encode_image: 处理后的输入键: ['pixel_values'] +INFO:__main__:处理图像: 微信图片_20250910164839_1_13.jpg (99396 字节) +INFO:__main__:成功加载图像: 20250910164839_1_13.jpg, 格式: JPEG, 模式: RGB, 大小: (939, 940) +INFO:multimodal_retrieval_local:add_images: 开始添加图像,数量: 1 +INFO:multimodal_retrieval_local:add_images: 编码图像 +INFO:multimodal_retrieval_local:encode_image: 开始编码图像,类型: +INFO:multimodal_retrieval_local:encode_image: 图像列表,长度: 1 +INFO:multimodal_retrieval_local:encode_image: 处理图像输入 +INFO:multimodal_retrieval_local:encode_image: 图像 0 格式: JPEG, 模式: RGB, 大小: (939, 940) +INFO:multimodal_retrieval_local:encode_image: 使用image_processor处理图像 +INFO:multimodal_retrieval_local:encode_image: 运行模型推理 +INFO:multimodal_retrieval_local:Model类型: +INFO:multimodal_retrieval_local:Model属性: ['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_auto_class', '_backward_compatibility_gradient_checkpointing', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_can_compile_fullgraph', '_can_record_outputs', '_can_set_attn_implementation', '_check_and_adjust_attn_implementation', '_checkpoint_conversion_mapping', '_compiled_call_impl', '_convert_head_mask_to_5d', '_copy_lm_head_original_to_resized', '_create_repo', '_dispatch_accelerate_model', '_fix_state_dict_key_on_load', '_fix_state_dict_key_on_save', '_fix_state_dict_keys_on_save', '_flash_attn_2_can_dispatch', '_flash_attn_3_can_dispatch', '_flex_attn_can_dispatch', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_from_config', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_files_timestamps', '_get_key_renaming_mapping', '_get_name', '_get_no_split_modules', '_get_resized_embeddings', '_get_resized_lm_head', '_hf_hook', '_hf_peft_config_loaded', '_hook_rss_memory_post_forward', '_hook_rss_memory_pre_forward', '_init_added_embeddings_weights_with_mean', '_init_added_lm_head_bias_with_mean', '_init_added_lm_head_weights_with_mean', '_init_weights', '_initialize_missing_keys', '_initialize_weights', '_input_embed_layer', '_is_full_backward_hook', '_is_hf_initialized', '_is_stateful', '_keep_in_fp32_modules', '_keep_in_fp32_modules', '_keep_in_fp32_modules_strict', '_keep_in_fp32_modules_strict', '_keys_to_ignore_on_load_missing', '_keys_to_ignore_on_load_unexpected', '_keys_to_ignore_on_save', '_load_from_flax', '_load_from_state_dict', '_load_from_tf', '_load_pretrained_model', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_move_missing_keys_from_meta_to_cpu', '_named_members', '_no_split_modules', '_no_split_modules', '_non_persistent_buffers_set', '_old_forward', '_parameters', '_pp_plan', '_pp_plan', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_replicate_for_data_parallel', '_resize_token_embeddings', '_save_to_state_dict', '_sdpa_can_dispatch', '_set_default_torch_dtype', '_set_gradient_checkpointing', '_skip_keys_device_placement', '_slow_forward', '_state_dict_hooks', '_state_dict_pre_hooks', '_supports_attention_backend', '_supports_flash_attn', '_supports_flex_attn', '_supports_sdpa', '_tie_encoder_decoder_weights', '_tie_or_clone_weights', '_tied_weights_keys', '_tp_plan', '_tp_size', '_upload_modified_files', '_version', '_wrapped_call_impl', 'active_adapter', 'active_adapters', 'add_adapter', 'add_memory_hooks', 'add_model_tags', 'add_module', 'apply', 'base_model', 'base_model_prefix', 'bfloat16', 'buffers', 'call_super_init', 'can_generate', 'can_record_outputs', 'children', 'compile', 'config', 'config_class', 'cpu', 'create_extended_attention_mask_for_decoder', 'cuda', 'cuda', 'delete_adapter', 'dequantize', 'device', 'disable_adapters', 'disable_input_require_grads', 'double', 'dtype', 'dummy_inputs', 'dump_patches', 'enable_adapters', 'enable_input_require_grads', 'estimate_tokens', 'eval', 'extra_repr', 'float', 'floating_point_ops', 'forward', 'forward', 'framework', 'from_pretrained', 'generation_config', 'get_adapter_state_dict', 'get_buffer', 'get_compiled_call', 'get_correct_attn_implementation', 'get_decoder', 'get_extended_attention_mask', 'get_extra_state', 'get_head_mask', 'get_image_features', 'get_init_context', 'get_input_embeddings', 'get_memory_footprint', 'get_output_embeddings', 'get_parameter', 'get_parameter_or_buffer', 'get_placeholder_mask', 'get_position_embeddings', 'get_rope_index', 'get_submodule', 'get_video_features', 'gradient_checkpointing_disable', 'gradient_checkpointing_enable', 'half', 'hf_device_map', 'init_weights', 'initialize_weights', 'invert_attention_mask', 'ipu', 'is_backend_compatible', 'is_gradient_checkpointing', 'is_parallelizable', 'language_model', 'load_adapter', 'load_state_dict', 'loss_function', 'loss_type', 'main_input_name', 'model_tags', 'modules', 'mtia', 'name_or_path', 'named_buffers', 'named_children', 'named_modules', 'named_parameters', 'num_parameters', 'parameters', 'post_init', 'prune_heads', 'push_to_hub', 'register_backward_hook', 'register_buffer', 'register_for_auto_class', 'register_forward_hook', 'register_forward_pre_hook', 'register_full_backward_hook', 'register_full_backward_pre_hook', 'register_load_state_dict_post_hook', 'register_load_state_dict_pre_hook', 'register_module', 'register_parameter', 'register_state_dict_post_hook', 'register_state_dict_pre_hook', 'requires_grad_', 'reset_memory_hooks_state', 'resize_position_embeddings', 'resize_token_embeddings', 'retrieve_modules_from_names', 'reverse_bettertransformer', 'rope_deltas', 'save_pretrained', 'set_adapter', 'set_attn_implementation', 'set_decoder', 'set_extra_state', 'set_input_embeddings', 'set_output_embeddings', 'set_submodule', 'share_memory', 'smart_apply', 'state_dict', 'supports_gradient_checkpointing', 'supports_pp_plan', 'supports_tp_plan', 'tie_weights', 'to', 'to', 'to_bettertransformer', 'to_empty', 'tp_size', 'train', 'training', 'type', 'visual', 'warn_if_padding_and_no_attention_mask', 'warnings_issued', 'xpu', 'zero_grad'] +ERROR:multimodal_retrieval_local:encode_image: 处理图像时出错: embedding(): argument 'indices' (position 2) must be Tensor, not NoneType +ERROR:multimodal_retrieval_local:add_images: 图像编码失败,返回空数组 +INFO:multimodal_retrieval_local:索引保存成功: /root/mmeb/local_faiss_index.index +INFO:multimodal_retrieval_local:元数据保存成功: /root/mmeb/local_faiss_index_metadata.json +INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:46] "POST /api/add_image HTTP/1.1" 200 - +INFO:multimodal_retrieval_local:encode_image: 处理后的输入键: ['pixel_values'] +INFO:multimodal_retrieval_local:encode_image: 运行模型推理 +INFO:multimodal_retrieval_local:Model类型: +INFO:multimodal_retrieval_local:Model属性: ['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_auto_class', '_backward_compatibility_gradient_checkpointing', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_can_compile_fullgraph', '_can_record_outputs', '_can_set_attn_implementation', '_check_and_adjust_attn_implementation', '_checkpoint_conversion_mapping', '_compiled_call_impl', '_convert_head_mask_to_5d', '_copy_lm_head_original_to_resized', '_create_repo', '_dispatch_accelerate_model', '_fix_state_dict_key_on_load', '_fix_state_dict_key_on_save', '_fix_state_dict_keys_on_save', '_flash_attn_2_can_dispatch', '_flash_attn_3_can_dispatch', '_flex_attn_can_dispatch', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_from_config', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_files_timestamps', '_get_key_renaming_mapping', '_get_name', '_get_no_split_modules', '_get_resized_embeddings', '_get_resized_lm_head', '_hf_hook', '_hf_peft_config_loaded', '_hook_rss_memory_post_forward', '_hook_rss_memory_pre_forward', '_init_added_embeddings_weights_with_mean', '_init_added_lm_head_bias_with_mean', '_init_added_lm_head_weights_with_mean', '_init_weights', '_initialize_missing_keys', '_initialize_weights', '_input_embed_layer', '_is_full_backward_hook', '_is_hf_initialized', '_is_stateful', '_keep_in_fp32_modules', '_keep_in_fp32_modules', '_keep_in_fp32_modules_strict', '_keep_in_fp32_modules_strict', '_keys_to_ignore_on_load_missing', '_keys_to_ignore_on_load_unexpected', '_keys_to_ignore_on_save', '_load_from_flax', '_load_from_state_dict', '_load_from_tf', '_load_pretrained_model', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_move_missing_keys_from_meta_to_cpu', '_named_members', '_no_split_modules', '_no_split_modules', '_non_persistent_buffers_set', '_old_forward', '_parameters', '_pp_plan', '_pp_plan', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_replicate_for_data_parallel', '_resize_token_embeddings', '_save_to_state_dict', '_sdpa_can_dispatch', '_set_default_torch_dtype', '_set_gradient_checkpointing', '_skip_keys_device_placement', '_slow_forward', '_state_dict_hooks', '_state_dict_pre_hooks', '_supports_attention_backend', '_supports_flash_attn', '_supports_flex_attn', '_supports_sdpa', '_tie_encoder_decoder_weights', '_tie_or_clone_weights', '_tied_weights_keys', '_tp_plan', '_tp_size', '_upload_modified_files', '_version', '_wrapped_call_impl', 'active_adapter', 'active_adapters', 'add_adapter', 'add_memory_hooks', 'add_model_tags', 'add_module', 'apply', 'base_model', 'base_model_prefix', 'bfloat16', 'buffers', 'call_super_init', 'can_generate', 'can_record_outputs', 'children', 'compile', 'config', 'config_class', 'cpu', 'create_extended_attention_mask_for_decoder', 'cuda', 'cuda', 'delete_adapter', 'dequantize', 'device', 'disable_adapters', 'disable_input_require_grads', 'double', 'dtype', 'dummy_inputs', 'dump_patches', 'enable_adapters', 'enable_input_require_grads', 'estimate_tokens', 'eval', 'extra_repr', 'float', 'floating_point_ops', 'forward', 'forward', 'framework', 'from_pretrained', 'generation_config', 'get_adapter_state_dict', 'get_buffer', 'get_compiled_call', 'get_correct_attn_implementation', 'get_decoder', 'get_extended_attention_mask', 'get_extra_state', 'get_head_mask', 'get_image_features', 'get_init_context', 'get_input_embeddings', 'get_memory_footprint', 'get_output_embeddings', 'get_parameter', 'get_parameter_or_buffer', 'get_placeholder_mask', 'get_position_embeddings', 'get_rope_index', 'get_submodule', 'get_video_features', 'gradient_checkpointing_disable', 'gradient_checkpointing_enable', 'half', 'hf_device_map', 'init_weights', 'initialize_weights', 'invert_attention_mask', 'ipu', 'is_backend_compatible', 'is_gradient_checkpointing', 'is_parallelizable', 'language_model', 'load_adapter', 'load_state_dict', 'loss_function', 'loss_type', 'main_input_name', 'model_tags', 'modules', 'mtia', 'name_or_path', 'named_buffers', 'named_children', 'named_modules', 'named_parameters', 'num_parameters', 'parameters', 'post_init', 'prune_heads', 'push_to_hub', 'register_backward_hook', 'register_buffer', 'register_for_auto_class', 'register_forward_hook', 'register_forward_pre_hook', 'register_full_backward_hook', 'register_full_backward_pre_hook', 'register_load_state_dict_post_hook', 'register_load_state_dict_pre_hook', 'register_module', 'register_parameter', 'register_state_dict_post_hook', 'register_state_dict_pre_hook', 'requires_grad_', 'reset_memory_hooks_state', 'resize_position_embeddings', 'resize_token_embeddings', 'retrieve_modules_from_names', 'reverse_bettertransformer', 'rope_deltas', 'save_pretrained', 'set_adapter', 'set_attn_implementation', 'set_decoder', 'set_extra_state', 'set_input_embeddings', 'set_output_embeddings', 'set_submodule', 'share_memory', 'smart_apply', 'state_dict', 'supports_gradient_checkpointing', 'supports_pp_plan', 'supports_tp_plan', 'tie_weights', 'to', 'to', 'to_bettertransformer', 'to_empty', 'tp_size', 'train', 'training', 'type', 'visual', 'warn_if_padding_and_no_attention_mask', 'warnings_issued', 'xpu', 'zero_grad'] +ERROR:multimodal_retrieval_local:encode_image: 处理图像时出错: embedding(): argument 'indices' (position 2) must be Tensor, not NoneType +ERROR:multimodal_retrieval_local:add_images: 图像编码失败,返回空数组 +INFO:multimodal_retrieval_local:索引保存成功: /root/mmeb/local_faiss_index.index +INFO:multimodal_retrieval_local:元数据保存成功: /root/mmeb/local_faiss_index_metadata.json +INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:59] "POST /api/add_image HTTP/1.1" 200 - +INFO:multimodal_retrieval_local:索引保存成功: /root/mmeb/local_faiss_index.index +INFO:multimodal_retrieval_local:元数据保存成功: /root/mmeb/local_faiss_index_metadata.json +INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:53:00] "POST /api/save_index HTTP/1.1" 200 - +INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:53:01] "GET /api/system_info HTTP/1.1" 200 - diff --git a/baidu_vdb_backend.py b/baidu_vdb_backend.py index f17975e..d293724 100644 --- a/baidu_vdb_backend.py +++ b/baidu_vdb_backend.py @@ -118,30 +118,29 @@ class BaiduVDBBackend: try: logger.info(f"创建文本向量表: {self.text_table_name}") - # 定义字段 - 使用最简单的配置 + # 定义字段 - 移除可能导致问题的复杂配置 fields = [ - Field("id", FieldType.STRING, primary_key=True, partition_key=True, not_null=True), + Field("id", FieldType.STRING, primary_key=True, not_null=True), Field("text_content", FieldType.STRING, not_null=True), Field("vector", FieldType.FLOAT_VECTOR, not_null=True, dimension=self.vector_dimension) ] - # 定义索引 + # 定义索引 - 简化配置 indexes = [ VectorIndex( index_name="text_vector_idx", index_type=IndexType.HNSW, field="vector", metric_type=MetricType.COSINE, - params=HNSWParams(m=32, efconstruction=200), + params=HNSWParams(m=16, efconstruction=100), auto_build=True ) ] - # 创建表 + # 创建表 - 简化配置 self.text_table = self.db.create_table( table_name=self.text_table_name, - replication=2, # 双副本 - partition=Partition(partition_num=3), # 3个分区 + replication=1, # 单副本 schema=Schema(fields=fields, indexes=indexes) ) @@ -156,30 +155,29 @@ class BaiduVDBBackend: try: logger.info(f"创建图像向量表: {self.image_table_name}") - # 定义字段 - 使用最简单的配置 + # 定义字段 - 移除可能导致问题的复杂配置 fields = [ - Field("id", FieldType.STRING, primary_key=True, partition_key=True, not_null=True), + Field("id", FieldType.STRING, primary_key=True, not_null=True), Field("image_path", FieldType.STRING, not_null=True), Field("vector", FieldType.FLOAT_VECTOR, not_null=True, dimension=self.vector_dimension) ] - # 定义索引 + # 定义索引 - 简化配置 indexes = [ VectorIndex( index_name="image_vector_idx", index_type=IndexType.HNSW, field="vector", metric_type=MetricType.COSINE, - params=HNSWParams(m=32, efconstruction=200), + params=HNSWParams(m=16, efconstruction=100), auto_build=True ) ] - # 创建表 + # 创建表 - 简化配置 self.image_table = self.db.create_table( table_name=self.image_table_name, - replication=2, # 双副本 - partition=Partition(partition_num=3), # 3个分区 + replication=1, # 单副本 schema=Schema(fields=fields, indexes=indexes) ) diff --git a/faiss_index_local.index b/faiss_index_local.index new file mode 100644 index 0000000000000000000000000000000000000000..27dba4eb5bb23b61900e75e1e8e2a7d95b8beb95 GIT binary patch literal 45 YcmeaQa5G}yV?Y7|P&R~SWJHnz06M_|Y5)KL literal 0 HcmV?d00001 diff --git a/faiss_index_local_metadata.json b/faiss_index_local_metadata.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/faiss_index_local_metadata.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/faiss_index_test.index b/faiss_index_test.index new file mode 100644 index 0000000000000000000000000000000000000000..27dba4eb5bb23b61900e75e1e8e2a7d95b8beb95 GIT binary patch literal 45 YcmeaQa5G}yV?Y7|P&R~SWJHnz06M_|Y5)KL literal 0 HcmV?d00001 diff --git a/faiss_index_test_metadata.json b/faiss_index_test_metadata.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/faiss_index_test_metadata.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/faiss_vector_store.py b/faiss_vector_store.py new file mode 100644 index 0000000..cacfcf0 --- /dev/null +++ b/faiss_vector_store.py @@ -0,0 +1,147 @@ +import os +import json +import numpy as np +import faiss +from typing import List, Dict, Any, Optional, Tuple +import logging + +class FaissVectorStore: + def __init__(self, index_path: str = "faiss_index", dimension: int = 3584): + """ + 初始化FAISS向量存储 + + 参数: + index_path: 索引文件路径 + dimension: 向量维度 + """ + self.index_path = index_path + self.dimension = dimension + self.index = None + self.metadata = {} + self.metadata_path = f"{index_path}_metadata.json" + + # 加载现有索引或创建新索引 + self._load_or_create_index() + + def _load_or_create_index(self): + """加载现有索引或创建新索引""" + if os.path.exists(f"{self.index_path}.index"): + logging.info(f"加载现有索引: {self.index_path}") + self.index = faiss.read_index(f"{self.index_path}.index") + self._load_metadata() + else: + logging.info(f"创建新索引,维度: {self.dimension}") + self.index = faiss.IndexFlatL2(self.dimension) # 使用L2距离 + + def _load_metadata(self): + """加载元数据""" + if os.path.exists(self.metadata_path): + with open(self.metadata_path, 'r', encoding='utf-8') as f: + self.metadata = json.load(f) + + def _save_metadata(self): + """保存元数据到文件""" + with open(self.metadata_path, 'w', encoding='utf-8') as f: + json.dump(self.metadata, f, ensure_ascii=False, indent=2) + + def save_index(self): + """保存索引和元数据""" + if self.index is not None: + faiss.write_index(self.index, f"{self.index_path}.index") + self._save_metadata() + logging.info(f"索引已保存到 {self.index_path}.index") + + def add_vectors( + self, + vectors: np.ndarray, + metadatas: List[Dict[str, Any]] + ) -> List[str]: + """ + 添加向量和元数据 + + 参数: + vectors: 向量数组 + metadatas: 对应的元数据列表 + + 返回: + 添加的向量ID列表 + """ + if len(vectors) != len(metadatas): + raise ValueError("vectors和metadatas长度必须相同") + + start_id = len(self.metadata) + ids = list(range(start_id, start_id + len(vectors))) + + # 添加向量到索引 + self.index.add(vectors.astype('float32')) + + # 保存元数据 + for idx, vector_id in enumerate(ids): + self.metadata[str(vector_id)] = metadatas[idx] + + # 保存索引和元数据 + self.save_index() + + return [str(id) for id in ids] + + def search( + self, + query_vector: np.ndarray, + k: int = 5 + ) -> Tuple[List[Dict[str, Any]], List[float]]: + """ + 相似性搜索 + + 参数: + query_vector: 查询向量 + k: 返回结果数量 + + 返回: + (结果列表, 距离列表) + """ + if self.index is None: + return [], [] + + # 确保输入是2D数组 + if len(query_vector.shape) == 1: + query_vector = query_vector.reshape(1, -1) + + # 执行搜索 + distances, indices = self.index.search(query_vector.astype('float32'), k) + + # 处理结果 + results = [] + for i in range(len(indices[0])): + idx = indices[0][i] + if idx < 0: # FAISS可能返回-1表示无效索引 + continue + + vector_id = str(idx) + if vector_id in self.metadata: + result = self.metadata[vector_id].copy() + result['distance'] = float(distances[0][i]) + results.append(result) + + return results, distances[0].tolist() + + def get_vector_count(self) -> int: + """获取向量数量""" + return self.index.ntotal if self.index is not None else 0 + + def delete_vectors(self, vector_ids: List[str]) -> bool: + """ + 删除指定ID的向量 + + 注意: FAISS不支持直接删除向量,这里实现为逻辑删除 + """ + deleted_count = 0 + for vector_id in vector_ids: + if vector_id in self.metadata: + del self.metadata[vector_id] + deleted_count += 1 + + if deleted_count > 0: + self._save_metadata() + logging.warning("FAISS不支持直接删除向量,已从元数据中移除,但索引中仍保留") + + return deleted_count > 0 diff --git a/local_faiss_index.index b/local_faiss_index.index new file mode 100644 index 0000000000000000000000000000000000000000..27dba4eb5bb23b61900e75e1e8e2a7d95b8beb95 GIT binary patch literal 45 YcmeaQa5G}yV?Y7|P&R~SWJHnz06M_|Y5)KL literal 0 HcmV?d00001 diff --git a/local_faiss_index_metadata.json b/local_faiss_index_metadata.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/local_faiss_index_metadata.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/local_file_handler.py b/local_file_handler.py new file mode 100644 index 0000000..00039c2 --- /dev/null +++ b/local_file_handler.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +本地文件处理器 +简化版的文件处理器,不依赖外部服务 +""" + +import os +import io +import tempfile +import logging +from contextlib import contextmanager +from typing import Dict, List, Optional, Any, Union, BinaryIO +from pathlib import Path + +logger = logging.getLogger(__name__) + +class LocalFileHandler: + """本地文件处理器""" + + # 小文件阈值 (5MB) + SMALL_FILE_THRESHOLD = 5 * 1024 * 1024 + + # 支持的图像格式 + SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'} + + def __init__(self, temp_dir: str = None): + """ + 初始化本地文件处理器 + + Args: + temp_dir: 临时文件目录 + """ + self.temp_dir = temp_dir or tempfile.gettempdir() + self.temp_files = set() # 跟踪临时文件 + + # 确保临时目录存在 + os.makedirs(self.temp_dir, exist_ok=True) + + @contextmanager + def temp_file_context(self, content: bytes = None, suffix: str = None, delete_on_exit: bool = True): + """临时文件上下文管理器,确保自动清理""" + temp_fd, temp_path = tempfile.mkstemp(suffix=suffix, dir=self.temp_dir) + self.temp_files.add(temp_path) + + try: + os.close(temp_fd) # 关闭文件描述符 + + # 如果提供了内容,写入文件 + if content is not None: + with open(temp_path, 'wb') as f: + f.write(content) + + yield temp_path + finally: + if delete_on_exit and os.path.exists(temp_path): + try: + os.unlink(temp_path) + self.temp_files.discard(temp_path) + logger.debug(f"🗑️ 临时文件已清理: {temp_path}") + except Exception as e: + logger.warning(f"⚠️ 临时文件清理失败: {temp_path}, {e}") + + def cleanup_all_temp_files(self): + """清理所有跟踪的临时文件""" + for temp_path in list(self.temp_files): + if os.path.exists(temp_path): + try: + os.unlink(temp_path) + logger.debug(f"🗑️ 清理临时文件: {temp_path}") + except Exception as e: + logger.warning(f"⚠️ 清理临时文件失败: {temp_path}, {e}") + self.temp_files.clear() + + def get_file_size(self, file_obj) -> int: + """获取文件大小""" + if hasattr(file_obj, 'content_length') and file_obj.content_length: + return file_obj.content_length + + # 通过读取内容获取大小 + current_pos = file_obj.tell() + file_obj.seek(0, 2) # 移动到文件末尾 + size = file_obj.tell() + file_obj.seek(current_pos) # 恢复原位置 + return size + + def is_small_file(self, file_obj) -> bool: + """判断是否为小文件""" + return self.get_file_size(file_obj) <= self.SMALL_FILE_THRESHOLD + + def get_temp_file_for_model(self, file_obj, filename: str) -> Optional[str]: + """为模型处理获取临时文件路径(确保文件存在于本地)""" + try: + ext = os.path.splitext(filename)[1].lower() + + # 创建临时文件(不自动删除,供模型使用) + temp_fd, temp_path = tempfile.mkstemp(suffix=ext, dir=self.temp_dir) + self.temp_files.add(temp_path) + + try: + # 写入文件内容 + file_obj.seek(0) + with os.fdopen(temp_fd, 'wb') as temp_file: + temp_file.write(file_obj.read()) + + logger.debug(f"📁 为模型创建临时文件: {temp_path}") + return temp_path + + except Exception as e: + os.close(temp_fd) + raise e + + except Exception as e: + logger.error(f"❌ 为模型创建临时文件失败: {filename}, {e}") + return None + + def cleanup_temp_file(self, temp_path: str): + """清理指定的临时文件""" + if temp_path and os.path.exists(temp_path): + try: + os.unlink(temp_path) + self.temp_files.discard(temp_path) + logger.debug(f"🗑️ 清理临时文件: {temp_path}") + except Exception as e: + logger.warning(f"⚠️ 清理临时文件失败: {temp_path}, {e}") + +# 全局实例 +file_handler = None + +def get_file_handler(temp_dir: str = None) -> LocalFileHandler: + """获取文件处理器实例""" + global file_handler + if file_handler is None: + file_handler = LocalFileHandler(temp_dir=temp_dir) + return file_handler diff --git a/model_download_guide.md b/model_download_guide.md new file mode 100644 index 0000000..f2a2f20 --- /dev/null +++ b/model_download_guide.md @@ -0,0 +1,108 @@ +# 多模态模型下载指南 + +## 下载 OpenSearch-AI/Ops-MM-embedding-v1-7B 模型 + +### 方法1:使用 git-lfs + +```bash +# 安装 git-lfs +apt-get install git-lfs +# 或 +curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash +apt-get install git-lfs + +# 初始化 git-lfs +git lfs install + +# 克隆模型仓库 +mkdir -p ~/models +git clone https://huggingface.co/OpenSearch-AI/Ops-MM-embedding-v1-7B ~/models/Ops-MM-embedding-v1-7B +``` + +### 方法2:使用 huggingface-cli + +```bash +# 安装 huggingface-hub +pip install huggingface-hub + +# 下载模型 +mkdir -p ~/models +huggingface-cli download OpenSearch-AI/Ops-MM-embedding-v1-7B --local-dir ~/models/Ops-MM-embedding-v1-7B +``` + +### 方法3:手动下载关键文件 + +如果上述方法不可行,可以手动下载以下关键文件: + +1. 访问 https://huggingface.co/OpenSearch-AI/Ops-MM-embedding-v1-7B/tree/main +2. 下载以下文件: + - `config.json` + - `pytorch_model.bin` (或分片文件 `pytorch_model-00001-of-00002.bin` 等) + - `tokenizer.json` + - `tokenizer_config.json` + - `special_tokens_map.json` + - `vocab.txt` + +## 下载替代轻量级模型 + +如果主模型太大,可以下载这些较小的替代模型: + +### CLIP 模型 + +```bash +mkdir -p ~/models/clip-ViT-B-32 +huggingface-cli download openai/clip-vit-base-patch32 --local-dir ~/models/clip-ViT-B-32 +``` + +### 多语言CLIP模型 + +```bash +mkdir -p ~/models/clip-multilingual +huggingface-cli download sentence-transformers/clip-ViT-B-32-multilingual-v1 --local-dir ~/models/clip-multilingual +``` + +## 传输模型文件 + +下载完成后,使用以下方法将模型传输到目标服务器: + +### 使用 scp + +```bash +# 从当前机器传输到目标服务器 +scp -r ~/models/Ops-MM-embedding-v1-7B user@target-server:/root/models/ +``` + +### 使用压缩文件 + +```bash +# 压缩 +tar -czvf model.tar.gz ~/models/Ops-MM-embedding-v1-7B + +# 传输压缩文件 +scp model.tar.gz user@target-server:/root/ + +# 在目标服务器上解压 +ssh user@target-server +mkdir -p /root/models +tar -xzvf /root/model.tar.gz -C /root/models +``` + +## 验证模型文件 + +模型下载完成后,目录结构应类似于: + +``` +/root/models/Ops-MM-embedding-v1-7B/ +├── config.json +├── pytorch_model.bin (或分片文件) +├── tokenizer.json +├── tokenizer_config.json +├── special_tokens_map.json +└── vocab.txt +``` + +使用以下命令验证文件完整性: + +```bash +ls -la /root/models/Ops-MM-embedding-v1-7B/ +``` diff --git a/multimodal_retrieval_faiss.py b/multimodal_retrieval_faiss.py new file mode 100644 index 0000000..f5949bc --- /dev/null +++ b/multimodal_retrieval_faiss.py @@ -0,0 +1,370 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +基于FAISS的多模态检索系统 +支持文搜文、文搜图、图搜文、图搜图四种检索模式 +""" + +import torch +import torch.nn as nn +from torch.nn.parallel import DataParallel +import numpy as np +from PIL import Image +from transformers import AutoModel, AutoProcessor, AutoTokenizer +from typing import List, Union, Tuple, Dict, Any, Optional +import os +import json +from pathlib import Path +import logging +import gc +from concurrent.futures import ThreadPoolExecutor, as_completed +import threading + +from faiss_vector_store import FaissVectorStore + +# 设置日志 +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class MultimodalRetrievalFAISS: + """基于FAISS的多模态检索系统""" + + def __init__(self, model_name: str = "OpenSearch-AI/Ops-MM-embedding-v1-7B", + use_all_gpus: bool = True, gpu_ids: List[int] = None, + min_memory_gb: int = 12, index_path: str = "faiss_index"): + """ + 初始化多模态检索系统 + + Args: + model_name: 模型名称 + use_all_gpus: 是否使用所有可用GPU + gpu_ids: 指定使用的GPU ID列表 + min_memory_gb: 最小可用内存(GB) + index_path: FAISS索引文件路径 + """ + self.model_name = model_name + self.index_path = index_path + + # 设置GPU设备 + self._setup_devices(use_all_gpus, gpu_ids, min_memory_gb) + + # 清理GPU内存 + self._clear_all_gpu_memory() + + # 加载模型和处理器 + self._load_model_and_processor() + + # 初始化FAISS向量存储 + self.vector_store = FaissVectorStore( + index_path=index_path, + dimension=3584 # OpenSearch-AI/Ops-MM-embedding-v1-7B的向量维度 + ) + + logger.info(f"多模态检索系统初始化完成,使用模型: {model_name}") + logger.info(f"向量存储路径: {index_path}") + + def _setup_devices(self, use_all_gpus: bool, gpu_ids: List[int], min_memory_gb: int): + """设置GPU设备""" + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.use_gpu = self.device.type == "cuda" + + if self.use_gpu: + self.available_gpus = self._get_available_gpus(min_memory_gb) + + if not self.available_gpus: + logger.warning(f"没有可用的GPU或GPU内存不足{min_memory_gb}GB,将使用CPU") + self.device = torch.device("cpu") + self.use_gpu = False + else: + if gpu_ids: + self.gpu_ids = [gid for gid in gpu_ids if gid in self.available_gpus] + if not self.gpu_ids: + logger.warning(f"指定的GPU {gpu_ids}不可用或内存不足,将使用可用的GPU: {self.available_gpus}") + self.gpu_ids = self.available_gpus + elif use_all_gpus: + self.gpu_ids = self.available_gpus + else: + self.gpu_ids = [self.available_gpus[0]] + + logger.info(f"使用GPU: {self.gpu_ids}") + self.device = torch.device(f"cuda:{self.gpu_ids[0]}") + + def _get_available_gpus(self, min_memory_gb: int) -> List[int]: + """获取可用的GPU列表""" + available_gpus = [] + for i in range(torch.cuda.device_count()): + total_mem = torch.cuda.get_device_properties(i).total_memory / (1024 ** 3) # GB + if total_mem >= min_memory_gb: + available_gpus.append(i) + return available_gpus + + def _clear_all_gpu_memory(self): + """清理GPU内存""" + if torch.cuda.is_available(): + torch.cuda.empty_cache() + gc.collect() + + def _load_model_and_processor(self): + """加载模型和处理器""" + logger.info(f"加载模型和处理器: {self.model_name}") + + # 加载tokenizer和processor + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.processor = AutoProcessor.from_pretrained(self.model_name) + + # 加载模型 + self.model = AutoModel.from_pretrained( + self.model_name, + torch_dtype=torch.float16 if self.use_gpu else torch.float32, + device_map="auto" if len(self.gpu_ids) > 1 else None + ) + + # 如果使用多GPU,包装模型 + if len(self.gpu_ids) > 1: + self.model = DataParallel(self.model, device_ids=self.gpu_ids) + + self.model.eval() + self.model.to(self.device) + + logger.info("模型和处理器加载完成") + + def encode_text(self, text: Union[str, List[str]]) -> np.ndarray: + """编码文本为向量""" + if isinstance(text, str): + text = [text] + + inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt") + inputs = {k: v.to(self.device) for k, v in inputs.items()} + + with torch.no_grad(): + outputs = self.model(**inputs) + # 获取[CLS]标记的隐藏状态作为句子表示 + text_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy() + + # 归一化向量 + text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, axis=1, keepdims=True) + return text_embeddings[0] if len(text) == 1 else text_embeddings + + def encode_image(self, image: Union[Image.Image, List[Image.Image]]) -> np.ndarray: + """编码图像为向量""" + if isinstance(image, Image.Image): + image = [image] + + inputs = self.processor(images=image, return_tensors="pt") + inputs = {k: v.to(self.device) for k, v in inputs.items()} + + with torch.no_grad(): + outputs = self.model.vision_model(**inputs) + # 获取[CLS]标记的隐藏状态作为图像表示 + image_embeddings = outputs.pooler_output.cpu().numpy() + + # 归一化向量 + image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True) + return image_embeddings[0] if len(image) == 1 else image_embeddings + + def add_texts( + self, + texts: List[str], + metadatas: Optional[List[Dict[str, Any]]] = None + ) -> List[str]: + """ + 添加文本到检索系统 + + Args: + texts: 文本列表 + metadatas: 元数据列表,每个元素是一个字典 + + Returns: + 添加的文本ID列表 + """ + if not texts: + return [] + + if metadatas is None: + metadatas = [{} for _ in range(len(texts))] + + if len(texts) != len(metadatas): + raise ValueError("texts和metadatas长度必须相同") + + # 编码文本 + text_embeddings = self.encode_text(texts) + + # 准备元数据 + for i, text in enumerate(texts): + metadatas[i].update({ + "text": text, + "type": "text" + }) + + # 添加到向量存储 + vector_ids = self.vector_store.add_vectors(text_embeddings, metadatas) + + logger.info(f"成功添加{len(vector_ids)}条文本到检索系统") + return vector_ids + + def add_images( + self, + images: List[Image.Image], + metadatas: Optional[List[Dict[str, Any]]] = None + ) -> List[str]: + """ + 添加图像到检索系统 + + Args: + images: PIL图像列表 + metadatas: 元数据列表,每个元素是一个字典 + + Returns: + 添加的图像ID列表 + """ + if not images: + return [] + + if metadatas is None: + metadatas = [{} for _ in range(len(images))] + + if len(images) != len(metadatas): + raise ValueError("images和metadatas长度必须相同") + + # 编码图像 + image_embeddings = self.encode_image(images) + + # 准备元数据 + for i, image in enumerate(images): + metadatas[i].update({ + "type": "image", + "width": image.width, + "height": image.height + }) + + # 添加到向量存储 + vector_ids = self.vector_store.add_vectors(image_embeddings, metadatas) + + logger.info(f"成功添加{len(vector_ids)}张图像到检索系统") + return vector_ids + + def search_by_text( + self, + query: str, + k: int = 5, + filter_condition: Optional[Dict[str, Any]] = None + ) -> List[Dict[str, Any]]: + """ + 文本搜索 + + Args: + query: 查询文本 + k: 返回结果数量 + filter_condition: 过滤条件 + + Returns: + 搜索结果列表,每个元素包含相似项和分数 + """ + # 编码查询文本 + query_embedding = self.encode_text(query) + + # 执行搜索 + results, distances = self.vector_store.search(query_embedding, k) + + # 处理结果 + search_results = [] + for i, (result, distance) in enumerate(zip(results, distances)): + result["score"] = 1.0 / (1.0 + distance) # 将距离转换为相似度分数 + search_results.append(result) + + return search_results + + def search_by_image( + self, + image: Image.Image, + k: int = 5, + filter_condition: Optional[Dict[str, Any]] = None + ) -> List[Dict[str, Any]]: + """ + 图像搜索 + + Args: + image: 查询图像 + k: 返回结果数量 + filter_condition: 过滤条件 + + Returns: + 搜索结果列表,每个元素包含相似项和分数 + """ + # 编码查询图像 + query_embedding = self.encode_image(image) + + # 执行搜索 + results, distances = self.vector_store.search(query_embedding, k) + + # 处理结果 + search_results = [] + for i, (result, distance) in enumerate(zip(results, distances)): + result["score"] = 1.0 / (1.0 + distance) # 将距离转换为相似度分数 + search_results.append(result) + + return search_results + + def get_vector_count(self) -> int: + """获取向量数量""" + return self.vector_store.get_vector_count() + + def save_index(self): + """保存索引""" + self.vector_store.save_index() + logger.info("索引已保存") + + def __del__(self): + """析构函数,确保资源被正确释放""" + if hasattr(self, 'model'): + del self.model + self._clear_all_gpu_memory() + if hasattr(self, 'vector_store'): + self.save_index() + + +def test_faiss_system(): + """测试FAISS多模态检索系统""" + import time + from PIL import Image + import numpy as np + + # 初始化检索系统 + print("初始化多模态检索系统...") + retrieval = MultimodalRetrievalFAISS( + model_name="OpenSearch-AI/Ops-MM-embedding-v1-7B", + use_all_gpus=True, + index_path="faiss_index_test" + ) + + # 测试文本 + texts = [ + "一只可爱的橘色猫咪在沙发上睡觉", + "城市夜景中的高楼大厦和车流", + "阳光明媚的海滩上,人们在冲浪和晒太阳", + "美味的意大利面配红酒和沙拉", + "雪山上滑雪的运动员" + ] + + # 添加文本 + print("\n添加文本到检索系统...") + text_ids = retrieval.add_texts(texts) + print(f"添加了{len(text_ids)}条文本") + + # 测试文本搜索 + print("\n测试文本搜索...") + query_text = "一只猫在睡觉" + print(f"查询: {query_text}") + results = retrieval.search_by_text(query_text, k=2) + for i, result in enumerate(results): + print(f"结果 {i+1}: {result.get('text', 'N/A')} (分数: {result.get('score', 0):.4f})") + + # 测试保存和加载 + print("\n保存索引...") + retrieval.save_index() + + print("\n测试完成!") + + +if __name__ == "__main__": + test_faiss_system() diff --git a/multimodal_retrieval_local.py b/multimodal_retrieval_local.py new file mode 100644 index 0000000..8ff9208 --- /dev/null +++ b/multimodal_retrieval_local.py @@ -0,0 +1,607 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +使用本地模型的多模态检索系统 +支持文搜文、文搜图、图搜文、图搜图四种检索模式 +""" + +import torch +import numpy as np +from PIL import Image +from transformers import AutoModel, AutoProcessor, AutoTokenizer +from typing import List, Union, Tuple, Dict, Any, Optional +import os +import json +from pathlib import Path +import logging +import gc +import faiss +import time + +# 设置日志 +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# 设置离线模式 +os.environ['TRANSFORMERS_OFFLINE'] = '1' + +class MultimodalRetrievalLocal: + """使用本地模型的多模态检索系统""" + + def __init__(self, + model_path: str = "/root/models/Ops-MM-embedding-v1-7B", + use_all_gpus: bool = True, + gpu_ids: List[int] = None, + min_memory_gb: int = 12, + index_path: str = "local_faiss_index"): + """ + 初始化多模态检索系统 + + Args: + model_path: 本地模型路径 + use_all_gpus: 是否使用所有可用GPU + gpu_ids: 指定使用的GPU ID列表 + min_memory_gb: 最小可用内存(GB) + index_path: FAISS索引文件路径 + """ + self.model_path = model_path + self.index_path = index_path + + # 检查模型路径 + if not os.path.exists(model_path): + logger.error(f"模型路径不存在: {model_path}") + logger.info("请先下载模型到指定路径") + raise FileNotFoundError(f"模型路径不存在: {model_path}") + + # 设置GPU设备 + self._setup_devices(use_all_gpus, gpu_ids, min_memory_gb) + + # 清理GPU内存 + self._clear_all_gpu_memory() + + # 加载模型和处理器 + self._load_model_and_processor() + + # 初始化FAISS索引 + self._init_index() + + logger.info(f"多模态检索系统初始化完成,使用本地模型: {model_path}") + logger.info(f"向量存储路径: {index_path}") + + def _setup_devices(self, use_all_gpus: bool, gpu_ids: List[int], min_memory_gb: int): + """设置GPU设备""" + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.use_gpu = self.device.type == "cuda" + + if self.use_gpu: + self.available_gpus = self._get_available_gpus(min_memory_gb) + + if not self.available_gpus: + logger.warning(f"没有可用的GPU或GPU内存不足{min_memory_gb}GB,将使用CPU") + self.device = torch.device("cpu") + self.use_gpu = False + else: + if gpu_ids: + self.gpu_ids = [gid for gid in gpu_ids if gid in self.available_gpus] + if not self.gpu_ids: + logger.warning(f"指定的GPU {gpu_ids}不可用或内存不足,将使用可用的GPU: {self.available_gpus}") + self.gpu_ids = self.available_gpus + elif use_all_gpus: + self.gpu_ids = self.available_gpus + else: + self.gpu_ids = [self.available_gpus[0]] + + logger.info(f"使用GPU: {self.gpu_ids}") + self.device = torch.device(f"cuda:{self.gpu_ids[0]}") + else: + logger.warning("没有可用的GPU,将使用CPU") + self.gpu_ids = [] + + def _get_available_gpus(self, min_memory_gb: int) -> List[int]: + """获取可用的GPU列表""" + available_gpus = [] + for i in range(torch.cuda.device_count()): + total_mem = torch.cuda.get_device_properties(i).total_memory / (1024 ** 3) # GB + if total_mem >= min_memory_gb: + available_gpus.append(i) + return available_gpus + + def _clear_all_gpu_memory(self): + """清理GPU内存""" + if torch.cuda.is_available(): + torch.cuda.empty_cache() + gc.collect() + + def _load_model_and_processor(self): + """加载模型和处理器""" + logger.info(f"加载本地模型和处理器: {self.model_path}") + + try: + # 加载模型和处理器 + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + self.processor = AutoProcessor.from_pretrained(self.model_path) + + # 输出处理器信息 + logger.info(f"Processor类型: {type(self.processor)}") + logger.info(f"Processor方法: {dir(self.processor)}") + + # 检查是否有图像处理器 + if hasattr(self.processor, 'image_processor'): + logger.info(f"Image processor类型: {type(self.processor.image_processor)}") + logger.info(f"Image processor方法: {dir(self.processor.image_processor)}") + + # 加载模型 + self.model = AutoModel.from_pretrained( + self.model_path, + torch_dtype=torch.float16 if self.use_gpu else torch.float32, + device_map="auto" if len(self.gpu_ids) > 1 else None + ) + + if len(self.gpu_ids) == 1: + self.model.to(self.device) + + self.model.eval() + + # 获取向量维度 + self.vector_dim = self.model.config.hidden_size + logger.info(f"向量维度: {self.vector_dim}") + + logger.info("模型和处理器加载成功") + + except Exception as e: + logger.error(f"模型加载失败: {str(e)}") + raise RuntimeError(f"模型加载失败: {str(e)}") + + def _init_index(self): + """初始化FAISS索引""" + index_file = f"{self.index_path}.index" + if os.path.exists(index_file): + logger.info(f"加载现有索引: {index_file}") + try: + self.index = faiss.read_index(index_file) + logger.info(f"索引加载成功,包含{self.index.ntotal}个向量") + except Exception as e: + logger.error(f"索引加载失败: {str(e)}") + logger.info("创建新索引...") + self.index = faiss.IndexFlatL2(self.vector_dim) + else: + logger.info(f"创建新索引,维度: {self.vector_dim}") + self.index = faiss.IndexFlatL2(self.vector_dim) + + # 加载元数据 + self.metadata = {} + metadata_file = f"{self.index_path}_metadata.json" + if os.path.exists(metadata_file): + try: + with open(metadata_file, 'r', encoding='utf-8') as f: + self.metadata = json.load(f) + logger.info(f"元数据加载成功,包含{len(self.metadata)}条记录") + except Exception as e: + logger.error(f"元数据加载失败: {str(e)}") + + def encode_text(self, text: Union[str, List[str]]) -> np.ndarray: + """编码文本为向量""" + if isinstance(text, str): + text = [text] + + inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt") + inputs = {k: v.to(self.device) for k, v in inputs.items()} + + with torch.no_grad(): + outputs = self.model(**inputs) + # 获取[CLS]标记的隐藏状态作为句子表示 + text_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy() + + # 归一化向量 + text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, axis=1, keepdims=True) + return text_embeddings[0] if len(text) == 1 else text_embeddings + + def encode_image(self, image: Union[Image.Image, List[Image.Image]]) -> np.ndarray: + """编码图像为向量""" + try: + logger.info(f"encode_image: 开始编码图像,类型: {type(image)}") + + if isinstance(image, Image.Image): + logger.info(f"encode_image: 单个图像,大小: {image.size}") + image = [image] + else: + logger.info(f"encode_image: 图像列表,长度: {len(image)}") + + # 检查图像是否为空 + if not image or len(image) == 0: + logger.error("encode_image: 图像列表为空") + # 返回一个空的二维数组 + return np.zeros((0, self.vector_dim)) + + # 检查图像是否有效 + for i, img in enumerate(image): + if not isinstance(img, Image.Image): + logger.error(f"encode_image: 第{i}个元素不是有效的PIL图像,类型: {type(img)}") + # 返回一个空的二维数组 + return np.zeros((0, self.vector_dim)) + + logger.info("encode_image: 处理图像输入") + + # 检查图像格式 + for i, img in enumerate(image): + logger.info(f"encode_image: 图像 {i} 格式: {img.format}, 模式: {img.mode}, 大小: {img.size}") + # 转换为RGB模式,如果不是 + if img.mode != 'RGB': + logger.info(f"encode_image: 将图像 {i} 从 {img.mode} 转换为 RGB") + image[i] = img.convert('RGB') + + try: + # 直接使用image_processor处理图像 + if hasattr(self.processor, 'image_processor'): + logger.info("encode_image: 使用image_processor处理图像") + pixel_values = self.processor.image_processor(images=image, return_tensors="pt").pixel_values + inputs = {"pixel_values": pixel_values} + else: + logger.info("encode_image: 使用processor处理图像") + inputs = self.processor(images=image, return_tensors="pt") + + if not inputs or len(inputs) == 0: + logger.error("encode_image: processor返回了空的输入") + return np.zeros((0, self.vector_dim)) + + logger.info(f"encode_image: 处理后的输入键: {list(inputs.keys())}") + inputs = {k: v.to(self.device) for k, v in inputs.items()} + + logger.info("encode_image: 运行模型推理") + logger.info(f"Model类型: {type(self.model)}") + logger.info(f"Model属性: {dir(self.model)}") + + # 检查模型结构 + try: + logger.info(f"Model配置: {self.model.config}") + logger.info(f"Model配置属性: {dir(self.model.config)}") + else: + visual_outputs = self.model.visual(**inputs) + + if hasattr(visual_outputs, 'pooler_output'): + image_embeddings = visual_outputs.pooler_output.cpu().numpy() + elif hasattr(visual_outputs, 'last_hidden_state'): + image_embeddings = visual_outputs.last_hidden_state[:, 0, :].cpu().numpy() + else: + logger.error("encode_image: 无法从视觉模型输出中获取图像向量") + raise ValueError("无法从视觉模型输出中获取图像向量") + else: + # 尝试直接使用模型进行推理 + logger.info("encode_image: 尝试直接使用模型进行推理") + with torch.no_grad(): + # 使用空文本输入,只提供图像 + if 'pixel_values' in inputs: + outputs = self.model(pixel_values=inputs['pixel_values'], input_ids=None) + else: + outputs = self.model(**inputs, input_ids=None) + + # 尝试从输出中获取图像向量 + if hasattr(outputs, 'image_embeds'): + image_embeddings = outputs.image_embeds.cpu().numpy() + elif hasattr(outputs, 'vision_model_output') and hasattr(outputs.vision_model_output, 'pooler_output'): + image_embeddings = outputs.vision_model_output.pooler_output.cpu().numpy() + elif hasattr(outputs, 'pooler_output'): + image_embeddings = outputs.pooler_output.cpu().numpy() + elif hasattr(outputs, 'last_hidden_state'): + image_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy() + else: + logger.error("encode_image: 无法从模型输出中获取图像向量") + raise ValueError("无法从模型输出中获取图像向量") + except Exception as e: + logger.error(f"encode_image: 处理图像时出错: {str(e)}") + raise e + return np.zeros((0, self.vector_dim)) + + # 归一化向量 + image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True) + + # 始终返回二维数组,即使只有一个图像 + if len(image) == 1: + result = np.array([image_embeddings[0]]) + logger.info(f"encode_image: 返回单个图像向量,形状: {result.shape}") + return result + else: + logger.info(f"encode_image: 返回多个图像向量,形状: {image_embeddings.shape}") + return image_embeddings + + except Exception as e: + logger.error(f"encode_image: 异常: {str(e)}") + # 返回一个空的二维数组 + return np.zeros((0, self.vector_dim)) + + def add_texts( + self, + texts: List[str], + metadatas: Optional[List[Dict[str, Any]]] = None + ) -> List[str]: + """ + 添加文本到检索系统 + + Args: + texts: 文本列表 + metadatas: 元数据列表,每个元素是一个字典 + + Returns: + 添加的文本ID列表 + """ + if not texts: + return [] + + if metadatas is None: + metadatas = [{} for _ in range(len(texts))] + + if len(texts) != len(metadatas): + raise ValueError("texts和metadatas长度必须相同") + + # 编码文本 + text_embeddings = self.encode_text(texts) + + # 准备元数据 + start_id = self.index.ntotal + ids = list(range(start_id, start_id + len(texts))) + + # 添加到索引 + self.index.add(np.array(text_embeddings).astype('float32')) + + # 保存元数据 + for i, id in enumerate(ids): + self.metadata[str(id)] = { + "text": texts[i], + "type": "text", + **metadatas[i] + } + + logger.info(f"成功添加{len(ids)}条文本到检索系统") + return [str(id) for id in ids] + + def add_images( + self, + images: List[Image.Image], + metadatas: Optional[List[Dict[str, Any]]] = None, + image_paths: Optional[List[str]] = None + ) -> List[str]: + """ + 添加图像到检索系统 + + Args: + images: PIL图像列表 + metadatas: 元数据列表,每个元素是一个字典 + image_paths: 图像路径列表,用于保存到元数据 + + Returns: + 添加的图像ID列表 + """ + try: + logger.info(f"add_images: 开始添加图像,数量: {len(images) if images else 0}") + + # 检查图像列表 + if not images or len(images) == 0: + logger.warning("add_images: 图像列表为空") + return [] + + # 准备元数据 + if metadatas is None: + logger.info("add_images: 创建默认元数据") + metadatas = [{} for _ in range(len(images))] + + # 检查长度一致性 + if len(images) != len(metadatas): + logger.error(f"add_images: 长度不一致 - images: {len(images)}, metadatas: {len(metadatas)}") + raise ValueError("images和metadatas长度必须相同") + + # 编码图像 + logger.info("add_images: 编码图像") + image_embeddings = self.encode_image(images) + + # 检查编码结果 + if image_embeddings.shape[0] == 0: + logger.error("add_images: 图像编码失败,返回空数组") + return [] + + # 准备元数据 + start_id = self.index.ntotal + ids = list(range(start_id, start_id + len(images))) + logger.info(f"add_images: 生成索引ID: {start_id} - {start_id + len(images) - 1}") + + # 添加到索引 + logger.info(f"add_images: 添加向量到FAISS索引,形状: {image_embeddings.shape}") + self.index.add(np.array(image_embeddings).astype('float32')) + + # 保存元数据 + for i, id in enumerate(ids): + try: + metadata = { + "type": "image", + "width": images[i].width, + "height": images[i].height, + **metadatas[i] + } + + if image_paths and i < len(image_paths): + metadata["path"] = image_paths[i] + + self.metadata[str(id)] = metadata + logger.debug(f"add_images: 保存元数据成功 - ID: {id}") + except Exception as e: + logger.error(f"add_images: 保存元数据失败 - ID: {id}, 错误: {str(e)}") + + logger.info(f"add_images: 成功添加{len(ids)}张图像到检索系统") + return [str(id) for id in ids] + + except Exception as e: + logger.error(f"add_images: 添加图像异常: {str(e)}") + return [] + + def search_by_text( + self, + query: str, + k: int = 5, + filter_type: Optional[str] = None + ) -> List[Dict[str, Any]]: + """ + 文本搜索 + + Args: + query: 查询文本 + k: 返回结果数量 + filter_type: 过滤类型,可选值: "text", "image", None(不过滤) + + Returns: + 搜索结果列表,每个元素包含相似项和分数 + """ + # 编码查询文本 + query_embedding = self.encode_text(query) + + # 执行搜索 + return self._search(query_embedding, k, filter_type) + + def search_by_image( + self, + image: Image.Image, + k: int = 5, + filter_type: Optional[str] = None + ) -> List[Dict[str, Any]]: + """ + 图像搜索 + + Args: + image: 查询图像 + k: 返回结果数量 + filter_type: 过滤类型,可选值: "text", "image", None(不过滤) + + Returns: + 搜索结果列表,每个元素包含相似项和分数 + """ + # 编码查询图像 + query_embedding = self.encode_image(image) + + # 执行搜索 + return self._search(query_embedding, k, filter_type) + + def _search( + self, + query_embedding: np.ndarray, + k: int = 5, + filter_type: Optional[str] = None + ) -> List[Dict[str, Any]]: + """ + 执行搜索 + + Args: + query_embedding: 查询向量 + k: 返回结果数量 + filter_type: 过滤类型,可选值: "text", "image", None(不过滤) + + Returns: + 搜索结果列表 + """ + if self.index.ntotal == 0: + return [] + + # 确保查询向量是2D数组 + if len(query_embedding.shape) == 1: + query_embedding = query_embedding.reshape(1, -1) + + # 执行搜索,获取更多结果以便过滤 + actual_k = k * 3 if filter_type else k + actual_k = min(actual_k, self.index.ntotal) + distances, indices = self.index.search(query_embedding.astype('float32'), actual_k) + + # 处理结果 + results = [] + for i in range(len(indices[0])): + idx = indices[0][i] + if idx < 0: # FAISS可能返回-1表示无效索引 + continue + + vector_id = str(idx) + if vector_id in self.metadata: + item = self.metadata[vector_id] + + # 如果指定了过滤类型,则只返回该类型的结果 + if filter_type and item.get("type") != filter_type: + continue + + # 添加距离和分数 + result = item.copy() + result["distance"] = float(distances[0][i]) + result["score"] = float(1.0 / (1.0 + distances[0][i])) + results.append(result) + + # 如果已经收集了足够的结果,则停止 + if len(results) >= k: + break + + return results + + def save_index(self): + """保存索引和元数据""" + # 保存索引 + index_file = f"{self.index_path}.index" + try: + faiss.write_index(self.index, index_file) + logger.info(f"索引保存成功: {index_file}") + except Exception as e: + logger.error(f"索引保存失败: {str(e)}") + + # 保存元数据 + metadata_file = f"{self.index_path}_metadata.json" + try: + with open(metadata_file, 'w', encoding='utf-8') as f: + json.dump(self.metadata, f, ensure_ascii=False, indent=2) + logger.info(f"元数据保存成功: {metadata_file}") + except Exception as e: + logger.error(f"元数据保存失败: {str(e)}") + + def get_stats(self) -> Dict[str, Any]: + """获取检索系统统计信息""" + text_count = sum(1 for v in self.metadata.values() if v.get("type") == "text") + image_count = sum(1 for v in self.metadata.values() if v.get("type") == "image") + + return { + "total_vectors": self.index.ntotal, + "text_count": text_count, + "image_count": image_count, + "vector_dimension": self.vector_dim, + "index_path": self.index_path, + "model_path": self.model_path + } + + def clear_index(self): + """清空索引""" + logger.info(f"清空索引: {self.index_path}") + + # 重新创建索引 + self.index = faiss.IndexFlatL2(self.vector_dim) + + # 清空元数据 + self.metadata = {} + + # 保存空索引 + self.save_index() + + logger.info(f"索引已清空: {self.index_path}") + return True + + def list_items(self) -> List[Dict[str, Any]]: + """列出所有索引项""" + items = [] + + for item_id, metadata in self.metadata.items(): + item = metadata.copy() + item['id'] = item_id + items.append(item) + + return items + + def __del__(self): + """析构函数,确保资源被正确释放并自动保存索引""" + try: + if hasattr(self, 'model'): + del self.model + self._clear_all_gpu_memory() + if hasattr(self, 'index') and self.index is not None: + logger.info("系统关闭前自动保存索引") + self.save_index() + except Exception as e: + logger.error(f"析构时保存索引失败: {str(e)}") diff --git a/multimodal_retrieval_vdb.py b/multimodal_retrieval_vdb.py index 0bbbb2b..e3996a0 100644 --- a/multimodal_retrieval_vdb.py +++ b/multimodal_retrieval_vdb.py @@ -60,7 +60,14 @@ class MultimodalRetrievalVDB: "database_name": "multimodal_retrieval" } - self.vdb = BaiduVDBBackend(**vdb_config) + try: + self.vdb = BaiduVDBBackend(**vdb_config) + logger.info("✅ VDB后端初始化成功") + except Exception as e: + logger.error(f"❌ VDB后端初始化失败: {e}") + # 创建一个模拟的VDB后端,避免系统完全崩溃 + self.vdb = None + logger.warning("⚠️ 系统将在无VDB模式下运行,数据将不会持久化") logger.info("多模态检索系统初始化完成") @@ -102,42 +109,102 @@ class MultimodalRetrievalVDB: # 清理GPU内存 self._clear_gpu_memory() - # 加载模型 - if self.num_gpus > 1: - # 多GPU加载 - max_memory = {i: "18GiB" for i in self.device_ids} + # 设置离线模式环境变量 + os.environ['TRANSFORMERS_OFFLINE'] = '1' + os.environ['HF_HUB_OFFLINE'] = '1' + + # 尝试加载模型,如果网络失败则使用本地缓存 + try: + # 加载模型 + if self.num_gpus > 1: + # 多GPU加载 + max_memory = {i: "18GiB" for i in self.device_ids} + + self.model = AutoModel.from_pretrained( + self.model_name, + trust_remote_code=True, + torch_dtype=torch.float16, + device_map="auto", + max_memory=max_memory, + low_cpu_mem_usage=True, + local_files_only=False # 允许从网络下载 + ) + else: + # 单GPU加载 + self.model = AutoModel.from_pretrained( + self.model_name, + trust_remote_code=True, + torch_dtype=torch.float16, + device_map=self.primary_device, + local_files_only=False # 允许从网络下载 + ) - self.model = AutoModel.from_pretrained( - self.model_name, - trust_remote_code=True, - torch_dtype=torch.float16, - device_map="auto", - max_memory=max_memory, - low_cpu_mem_usage=True - ) - else: - # 单GPU加载 - self.model = AutoModel.from_pretrained( - self.model_name, - trust_remote_code=True, - torch_dtype=torch.float16, - device_map=self.primary_device - ) + logger.info("模型从网络加载成功") + + except Exception as network_error: + logger.warning(f"网络加载失败,尝试本地缓存: {network_error}") + + # 尝试从本地缓存加载 + try: + if self.num_gpus > 1: + max_memory = {i: "18GiB" for i in self.device_ids} + + self.model = AutoModel.from_pretrained( + self.model_name, + trust_remote_code=True, + torch_dtype=torch.float16, + device_map="auto", + max_memory=max_memory, + low_cpu_mem_usage=True, + local_files_only=True # 仅使用本地文件 + ) + else: + self.model = AutoModel.from_pretrained( + self.model_name, + trust_remote_code=True, + torch_dtype=torch.float16, + device_map=self.primary_device, + local_files_only=True # 仅使用本地文件 + ) + + logger.info("模型从本地缓存加载成功") + + except Exception as local_error: + logger.error(f"本地缓存加载也失败: {local_error}") + raise local_error # 加载分词器和处理器 - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_name, - trust_remote_code=True - ) + try: + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name, + trust_remote_code=True, + local_files_only=False + ) + except Exception as e: + logger.warning(f"Tokenizer网络加载失败,尝试本地: {e}") + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name, + trust_remote_code=True, + local_files_only=True + ) try: self.processor = AutoProcessor.from_pretrained( self.model_name, - trust_remote_code=True + trust_remote_code=True, + local_files_only=False ) except Exception as e: logger.warning(f"Processor加载失败,使用tokenizer: {e}") - self.processor = self.tokenizer + try: + self.processor = AutoProcessor.from_pretrained( + self.model_name, + trust_remote_code=True, + local_files_only=True + ) + except Exception as e2: + logger.warning(f"Processor本地加载也失败,使用tokenizer: {e2}") + self.processor = self.tokenizer logger.info("模型加载完成") return True @@ -274,6 +341,10 @@ class MultimodalRetrievalVDB: Returns: 存储的ID列表 """ + if self.vdb is None: + logger.warning("VDB不可用,文本数据将不会持久化存储") + return [] + logger.info(f"正在存储 {len(texts)} 条文本数据") # 分批处理 @@ -312,6 +383,10 @@ class MultimodalRetrievalVDB: Returns: 存储的ID列表 """ + if self.vdb is None: + logger.warning("VDB不可用,图像数据将不会持久化存储") + return [] + logger.info(f"正在存储 {len(image_paths)} 张图像数据") # 图像处理使用更小的批次 @@ -341,6 +416,10 @@ class MultimodalRetrievalVDB: def search_text_by_text(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]: """文搜文:使用文本查询搜索相似文本""" + if self.vdb is None: + logger.warning("VDB不可用,无法执行搜索") + return [] + logger.info(f"执行文搜文查询: {query}") # 编码查询文本 @@ -358,6 +437,10 @@ class MultimodalRetrievalVDB: def search_images_by_text(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]: """文搜图:使用文本查询搜索相似图像""" + if self.vdb is None: + logger.warning("VDB不可用,无法执行搜索") + return [] + logger.info(f"执行文搜图查询: {query}") # 编码查询文本 @@ -375,6 +458,10 @@ class MultimodalRetrievalVDB: def search_text_by_image(self, query_image: Union[str, Image.Image], top_k: int = 5) -> List[Tuple[str, float]]: """图搜文:使用图像查询搜索相似文本""" + if self.vdb is None: + logger.warning("VDB不可用,无法执行搜索") + return [] + logger.info(f"执行图搜文查询") # 编码查询图像 @@ -392,6 +479,10 @@ class MultimodalRetrievalVDB: def search_images_by_image(self, query_image: Union[str, Image.Image], top_k: int = 5) -> List[Tuple[str, float]]: """图搜图:使用图像查询搜索相似图像""" + if self.vdb is None: + logger.warning("VDB不可用,无法执行搜索") + return [] + logger.info(f"执行图搜图查询") # 编码查询图像 @@ -426,10 +517,15 @@ class MultimodalRetrievalVDB: def get_statistics(self) -> Dict[str, Any]: """获取系统统计信息""" + if self.vdb is None: + return {"error": "VDB不可用"} return self.vdb.get_statistics() def clear_all_data(self): """清空所有数据""" + if self.vdb is None: + logger.warning("VDB不可用,无法清空数据") + return self.vdb.clear_all_data() def close(self): diff --git a/nohup.out b/nohup.out new file mode 100644 index 0000000..b11bb79 --- /dev/null +++ b/nohup.out @@ -0,0 +1,49 @@ +INFO:baidu_bos_manager:✅ BOS连接测试成功 +INFO:baidu_bos_manager:✅ BOS客户端初始化成功: dmtyz-demo +INFO:mongodb_manager:✅ MongoDB连接成功: mmeb +INFO:mongodb_manager:✅ MongoDB索引创建完成 +INFO:__main__:初始化多模态检索系统... +INFO:multimodal_retrieval_local:使用GPU: [0, 1] +INFO:multimodal_retrieval_local:加载本地模型和处理器: /root/models/Ops-MM-embedding-v1-7B +The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release. +You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0. +INFO:multimodal_retrieval_local:Processor类型: +INFO:multimodal_retrieval_local:Processor方法: ['__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_auto_class', '_check_special_mm_tokens', '_create_repo', '_get_arguments_from_pretrained', '_get_files_timestamps', '_get_num_multimodal_tokens', '_merge_kwargs', '_upload_modified_files', 'apply_chat_template', 'attributes', 'audio_tokenizer', 'batch_decode', 'chat_template', 'check_argument_for_proper_class', 'decode', 'feature_extractor_class', 'from_args_and_dict', 'from_pretrained', 'get_possibly_dynamic_module', 'get_processor_dict', 'image_processor', 'image_processor_class', 'image_token', 'image_token_id', 'model_input_names', 'optional_attributes', 'optional_call_args', 'post_process_image_text_to_text', 'push_to_hub', 'register_for_auto_class', 'save_pretrained', 'to_dict', 'to_json_file', 'to_json_string', 'tokenizer', 'tokenizer_class', 'validate_init_kwargs', 'video_processor', 'video_processor_class', 'video_token', 'video_token_id'] +INFO:multimodal_retrieval_local:Image processor类型: +INFO:multimodal_retrieval_local:Image processor方法: ['__backends', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slotnames__', '__str__', '__subclasshook__', '__weakref__', '_auto_class', '_create_repo', '_further_process_kwargs', '_fuse_mean_std_and_rescale_factor', '_get_files_timestamps', '_prepare_image_like_inputs', '_prepare_images_structure', '_preprocess', '_preprocess_image_like_inputs', '_process_image', '_processor_class', '_set_processor_class', '_upload_modified_files', '_valid_kwargs_names', '_validate_preprocess_kwargs', 'center_crop', 'compile_friendly_resize', 'convert_to_rgb', 'crop_size', 'data_format', 'default_to_square', 'device', 'disable_grouping', 'do_center_crop', 'do_convert_rgb', 'do_normalize', 'do_rescale', 'do_resize', 'fetch_images', 'filter_out_unused_kwargs', 'from_dict', 'from_json_file', 'from_pretrained', 'get_image_processor_dict', 'get_number_of_image_patches', 'image_mean', 'image_processor_type', 'image_std', 'input_data_format', 'max_pixels', 'merge_size', 'min_pixels', 'model_input_names', 'normalize', 'patch_size', 'preprocess', 'push_to_hub', 'register_for_auto_class', 'resample', 'rescale', 'rescale_and_normalize', 'rescale_factor', 'resize', 'return_tensors', 'save_pretrained', 'size', 'temporal_patch_size', 'to_dict', 'to_json_file', 'to_json_string', 'unused_kwargs', 'valid_kwargs'] + Loading checkpoint shards: 0%| | 0/4 [00:00 +INFO:multimodal_retrieval_local:encode_image: 图像列表,长度: 1 +INFO:multimodal_retrieval_local:encode_image: 处理图像输入 +INFO:multimodal_retrieval_local:encode_image: 图像 0 格式: JPEG, 模式: RGB, 大小: (939, 940) +ERROR:multimodal_retrieval_local:encode_image: 处理图像时出错: argument of type 'NoneType' is not iterable +ERROR:multimodal_retrieval_local:add_images: 图像编码失败,返回空数组 +INFO:multimodal_retrieval_local:索引保存成功: /root/mmeb/local_faiss_index.index +INFO:multimodal_retrieval_local:元数据保存成功: /root/mmeb/local_faiss_index_metadata.json +INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 04:02:50] "POST /api/add_image HTTP/1.1" 200 - +INFO:multimodal_retrieval_local:索引保存成功: /root/mmeb/local_faiss_index.index +INFO:multimodal_retrieval_local:元数据保存成功: /root/mmeb/local_faiss_index_metadata.json +INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 04:02:50] "POST /api/save_index HTTP/1.1" 200 - +INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 04:02:51] "GET /api/system_info HTTP/1.1" 200 - diff --git a/optimized_file_handler.py b/optimized_file_handler.py index fed7384..1679669 100644 --- a/optimized_file_handler.py +++ b/optimized_file_handler.py @@ -30,19 +30,30 @@ class OptimizedFileHandler: # 支持的图像格式 SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'} - def __init__(self): + def __init__(self, local_storage_dir=None): self.bos_manager = get_bos_manager() self.mongodb_manager = get_mongodb_manager() self.temp_files = set() # 跟踪临时文件 + self.local_storage_dir = local_storage_dir or tempfile.gettempdir() + + # 确保本地存储目录存在 + if self.local_storage_dir: + os.makedirs(self.local_storage_dir, exist_ok=True) @contextmanager - def temp_file_context(self, suffix: str = None, delete_on_exit: bool = True): + def temp_file_context(self, content: bytes = None, suffix: str = None, delete_on_exit: bool = True): """临时文件上下文管理器,确保自动清理""" - temp_fd, temp_path = tempfile.mkstemp(suffix=suffix) + temp_fd, temp_path = tempfile.mkstemp(suffix=suffix, dir=self.local_storage_dir) self.temp_files.add(temp_path) - try: + # 如果提供了内容,写入文件 + if content is not None: + with os.fdopen(temp_fd, 'wb') as f: + f.write(content) + else: os.close(temp_fd) # 关闭文件描述符 + + try: yield temp_path finally: if delete_on_exit and os.path.exists(temp_path): @@ -96,17 +107,13 @@ class OptimizedFileHandler: logger.error(f"❌ 图像验证失败: {filename}, {e}") return None - # 生成唯一ID和BOS键 + # 生成唯一ID file_id = str(uuid.uuid4()) - bos_key = f"images/memory_{file_id}_{filename}" - # 直接上传到BOS(从内存) - bos_result = self._upload_to_bos_from_memory( - file_content, bos_key, filename - ) - - if not bos_result: - return None + # 保存到本地存储 + local_path = os.path.join(self.local_storage_dir, f"{file_id}_{filename}") + with open(local_path, 'wb') as f: + f.write(file_content) # 存储元数据到MongoDB metadata = { @@ -115,18 +122,25 @@ class OptimizedFileHandler: "file_type": "image", "file_size": len(file_content), "processing_method": "memory", - "bos_key": bos_key, - "bos_url": bos_result["url"] + "local_path": local_path } - self.mongodb_manager.store_file_metadata(metadata=metadata) + # 如果有BOS管理器,也上传到BOS + if hasattr(self, 'bos_manager') and self.bos_manager: + bos_key = f"images/memory_{file_id}_{filename}" + bos_result = self._upload_to_bos_from_memory(file_content, bos_key, filename) + if bos_result: + metadata["bos_key"] = bos_key + metadata["bos_url"] = bos_result["url"] + + if hasattr(self, 'mongodb_manager') and self.mongodb_manager: + self.mongodb_manager.store_file_metadata(metadata=metadata) logger.info(f"✅ 内存处理图像成功: {filename} ({len(file_content)} bytes)") return { "file_id": file_id, "filename": filename, - "bos_key": bos_key, - "bos_result": bos_result, + "local_path": local_path, "processing_method": "memory" } @@ -140,6 +154,12 @@ class OptimizedFileHandler: # 获取文件扩展名 ext = os.path.splitext(filename)[1].lower() + # 生成唯一ID + file_id = str(uuid.uuid4()) + + # 创建永久文件路径 + permanent_path = os.path.join(self.local_storage_dir, f"{file_id}_{filename}") + with self.temp_file_context(suffix=ext) as temp_path: # 保存到临时文件 file_obj.seek(0) @@ -154,35 +174,41 @@ class OptimizedFileHandler: logger.error(f"❌ 图像验证失败: {filename}, {e}") return None - # 生成唯一ID和BOS键 - file_id = str(uuid.uuid4()) - bos_key = f"images/temp_{file_id}_{filename}" + # 复制到永久存储位置 + with open(temp_path, 'rb') as src, open(permanent_path, 'wb') as dst: + dst.write(src.read()) - # 上传到BOS - bos_result = self.bos_manager.upload_file(temp_path, bos_key) + # 获取文件信息 + file_stat = os.stat(permanent_path) - # 存储元数据到MongoDB - file_stat = os.stat(temp_path) + # 存储元数据 metadata = { "_id": file_id, "filename": filename, "file_type": "image", "file_size": file_stat.st_size, "processing_method": "temp_file", - "bos_key": bos_key, - "bos_url": bos_result["url"] + "local_path": permanent_path } - self.mongodb_manager.store_file_metadata(metadata=metadata) + # 如果有BOS管理器,也上传到BOS + if hasattr(self, 'bos_manager') and self.bos_manager: + bos_key = f"images/temp_{file_id}_{filename}" + bos_result = self.bos_manager.upload_file(temp_path, bos_key) + if bos_result: + metadata["bos_key"] = bos_key + metadata["bos_url"] = bos_result["url"] + + # 存储元数据到MongoDB + if hasattr(self, 'mongodb_manager') and self.mongodb_manager: + self.mongodb_manager.store_file_metadata(metadata=metadata) logger.info(f"✅ 临时文件处理图像成功: {filename} ({file_stat.st_size} bytes)") return { "file_id": file_id, "filename": filename, - "bos_key": bos_key, - "bos_result": bos_result, - "processing_method": "temp_file", - "temp_path": temp_path # 返回临时路径供模型处理 + "local_path": permanent_path, + "processing_method": "temp_file" } except Exception as e: @@ -290,8 +316,11 @@ class OptimizedFileHandler: try: ext = os.path.splitext(filename)[1].lower() + # 生成唯一ID + file_id = str(uuid.uuid4()) + # 创建临时文件(不自动删除,供模型使用) - temp_fd, temp_path = tempfile.mkstemp(suffix=ext) + temp_fd, temp_path = tempfile.mkstemp(suffix=ext, dir=self.local_storage_dir) self.temp_files.add(temp_path) try: diff --git a/templates/local_index.html b/templates/local_index.html new file mode 100644 index 0000000..e279750 --- /dev/null +++ b/templates/local_index.html @@ -0,0 +1,995 @@ + + + + + + 本地多模态检索系统 - FAISS + + + + + + +
+
+ 未初始化 +
+
+ +
+
+ +
+

本地多模态检索系统

+

基于本地模型和FAISS向量数据库,支持文搜图、文搜文、图搜图、图搜文四种检索模式

+
+ +
+ +
+ +
+ + +
+
+
+ +
文搜文
+

文本查找相似文本

+
+
+
+
+ +
文搜图
+

文本查找相关图片

+
+
+
+
+ +
图搜文
+

图片查找相关文本

+
+
+
+
+ +
图搜图
+

图片查找相似图片

+
+
+
+ + +
+
+
+
+
数据管理
+ 上传和管理检索数据库 +
+
+
+ +
+
+
批量上传图片
+
+ +

拖拽多张图片到此处或点击选择

+ + +
+ +
+
+ + +
+
+
批量上传文本
+
+ +
+
+ + + +
+
+
+
+ + +
+
+
+ + + +
+
+
+
+ + 图片: 0 张 | + 文本: 0 条 + +
+
+
+
+
+
+
+ + + + + +
+
+ Loading... +
+

正在搜索中...

+
+ + +
+
+
+
+ + + + + diff --git a/test_faiss_local.log b/test_faiss_local.log new file mode 100644 index 0000000..e69de29 diff --git a/test_faiss_simple.py b/test_faiss_simple.py new file mode 100644 index 0000000..d48a949 --- /dev/null +++ b/test_faiss_simple.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +FAISS多模态检索系统简单测试 +""" + +import sys +import os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +from multimodal_retrieval_faiss import MultimodalRetrievalFAISS +from PIL import Image +import numpy as np + +def test_text_retrieval(): + print("=== 测试文本检索 ===") + + # 初始化检索系统 + print("初始化检索系统...") + retrieval = MultimodalRetrievalFAISS( + model_name="OpenSearch-AI/Ops-MM-embedding-v1-7B", + use_all_gpus=True, + index_path="faiss_index_test" + ) + + # 测试文本 + texts = [ + "一只可爱的橘色猫咪在沙发上睡觉", + "城市夜景中的高楼大厦和车流", + "阳光明媚的海滩上,人们在冲浪和晒太阳", + "美味的意大利面配红酒和沙拉", + "雪山上滑雪的运动员" + ] + + # 添加文本 + print("\n添加文本到检索系统...") + text_ids = retrieval.add_texts(texts) + print(f"添加了{len(text_ids)}条文本") + print(f"当前向量数量: {retrieval.get_vector_count()}") + + # 测试文本搜索 + print("\n测试文本搜索...") + queries = ["一只猫在睡觉", "都市风光", "海边的景色"] + + for query in queries: + print(f"\n查询: {query}") + results = retrieval.search_by_text(query, k=2) + for i, result in enumerate(results): + print(f" 结果 {i+1}: {result.get('text', 'N/A')} (分数: {result.get('score', 0):.4f})") + + # 保存索引 + print("\n保存索引...") + retrieval.save_index() + + print("\n测试完成!") + +if __name__ == "__main__": + test_text_retrieval() diff --git a/test_faiss_with_proxy.py b/test_faiss_with_proxy.py new file mode 100644 index 0000000..d7af08b --- /dev/null +++ b/test_faiss_with_proxy.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +FAISS多模态检索系统简单测试 - 带代理设置 +""" + +import sys +import os +import logging + +# 设置代理 +os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890' # 根据实际情况修改 +os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890' # 根据实际情况修改 + +# 设置日志 +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# 设置离线模式,避免下载模型 +os.environ['TRANSFORMERS_OFFLINE'] = '1' + +# 添加当前目录到路径 +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +# 使用简单的向量模型替代大型多模态模型 +from sentence_transformers import SentenceTransformer +import faiss +import numpy as np + +class SimpleFaissRetrieval: + """简化版FAISS检索系统,使用sentence-transformers""" + + def __init__(self, model_name="paraphrase-multilingual-MiniLM-L12-v2", index_path="simple_faiss_index"): + """ + 初始化简化版检索系统 + + Args: + model_name: 模型名称,使用轻量级模型 + index_path: 索引文件路径 + """ + self.model_name = model_name + self.index_path = index_path + + logger.info(f"加载模型: {model_name}") + try: + # 尝试加载模型 + self.model = SentenceTransformer(model_name) + self.dimension = self.model.get_sentence_embedding_dimension() + logger.info(f"模型加载成功,向量维度: {self.dimension}") + except Exception as e: + logger.error(f"模型加载失败: {str(e)}") + logger.info("使用随机向量模拟...") + self.model = None + self.dimension = 384 # 默认维度 + + # 初始化索引 + self.index = faiss.IndexFlatL2(self.dimension) + self.metadata = {} + + logger.info("检索系统初始化完成") + + def encode_text(self, text): + """编码文本为向量""" + if self.model is None: + # 如果模型加载失败,使用随机向量 + if isinstance(text, list): + vectors = np.random.rand(len(text), self.dimension).astype('float32') + return vectors + else: + return np.random.rand(self.dimension).astype('float32') + else: + # 使用模型编码 + return self.model.encode(text, convert_to_numpy=True) + + def add_texts(self, texts, metadatas=None): + """添加文本到索引""" + if not texts: + return [] + + if metadatas is None: + metadatas = [{} for _ in range(len(texts))] + + # 编码文本 + vectors = self.encode_text(texts) + + # 添加到索引 + start_id = len(self.metadata) + ids = list(range(start_id, start_id + len(texts))) + + self.index.add(np.array(vectors).astype('float32')) + + # 保存元数据 + for i, id in enumerate(ids): + self.metadata[str(id)] = { + "text": texts[i], + "type": "text", + **metadatas[i] + } + + logger.info(f"添加了{len(ids)}条文本,当前索引大小: {self.index.ntotal}") + return [str(id) for id in ids] + + def search(self, query, k=5): + """搜索相似文本""" + # 编码查询 + query_vector = self.encode_text(query) + if len(query_vector.shape) == 1: + query_vector = query_vector.reshape(1, -1) + + # 搜索 + distances, indices = self.index.search(query_vector.astype('float32'), k) + + # 处理结果 + results = [] + for i in range(len(indices[0])): + idx = indices[0][i] + if idx < 0: + continue + + vector_id = str(idx) + if vector_id in self.metadata: + result = self.metadata[vector_id].copy() + result['score'] = float(1.0 / (1.0 + distances[0][i])) + results.append(result) + + return results + +def test_simple_retrieval(): + """测试简化版检索系统""" + print("=== 测试简化版FAISS检索系统 ===") + + # 初始化检索系统 + print("初始化检索系统...") + retrieval = SimpleFaissRetrieval() + + # 测试文本 + texts = [ + "一只可爱的橘色猫咪在沙发上睡觉", + "城市夜景中的高楼大厦和车流", + "阳光明媚的海滩上,人们在冲浪和晒太阳", + "美味的意大利面配红酒和沙拉", + "雪山上滑雪的运动员" + ] + + # 添加文本 + print("\n添加文本到检索系统...") + text_ids = retrieval.add_texts(texts) + print(f"添加了{len(text_ids)}条文本") + + # 测试文本搜索 + print("\n测试文本搜索...") + queries = ["一只猫在睡觉", "都市风光", "海边的景色"] + + for query in queries: + print(f"\n查询: {query}") + results = retrieval.search(query, k=2) + for i, result in enumerate(results): + print(f" 结果 {i+1}: {result.get('text', 'N/A')} (分数: {result.get('score', 0):.4f})") + + print("\n测试完成!") + +if __name__ == "__main__": + test_simple_retrieval() diff --git a/test_fixes.py b/test_fixes.py new file mode 100644 index 0000000..f8e29e9 --- /dev/null +++ b/test_fixes.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +测试修复后的系统功能 +""" + +import requests +import time +import json + +def test_system(): + """测试系统功能""" + base_url = "http://localhost:5000" + + print("🧪 开始测试修复后的系统...") + print("=" * 50) + + # 测试1: 检查系统状态 + print("1. 测试系统状态...") + try: + response = requests.get(f"{base_url}/api/status", timeout=10) + if response.status_code == 200: + status = response.json() + print(f" ✅ 系统状态: {status}") + else: + print(f" ❌ 状态检查失败: {response.status_code}") + except Exception as e: + print(f" ❌ 状态检查异常: {e}") + + # 测试2: 检查数据统计 + print("\n2. 测试数据统计...") + try: + response = requests.get(f"{base_url}/api/data/stats", timeout=10) + if response.status_code == 200: + stats = response.json() + print(f" ✅ 数据统计: {stats}") + else: + print(f" ❌ 统计检查失败: {response.status_code}") + except Exception as e: + print(f" ❌ 统计检查异常: {e}") + + # 测试3: 检查数据列表 + print("\n3. 测试数据列表...") + try: + response = requests.get(f"{base_url}/api/data/list", timeout=10) + if response.status_code == 200: + data_list = response.json() + print(f" ✅ 数据列表: {data_list}") + else: + print(f" ❌ 列表检查失败: {response.status_code}") + except Exception as e: + print(f" ❌ 列表检查异常: {e}") + + # 测试4: 测试文本搜索(如果系统已初始化) + print("\n4. 测试文本搜索...") + try: + search_data = { + "query": "测试查询", + "top_k": 3 + } + response = requests.post(f"{base_url}/api/search/text_to_text", + json=search_data, timeout=10) + if response.status_code == 200: + result = response.json() + print(f" ✅ 文本搜索: {result}") + else: + print(f" ❌ 文本搜索失败: {response.status_code}") + except Exception as e: + print(f" ❌ 文本搜索异常: {e}") + + print("\n" + "=" * 50) + print("🎉 测试完成!") + +if __name__ == "__main__": + # 等待系统启动 + print("⏳ 等待系统启动...") + time.sleep(5) + + test_system() diff --git a/test_local_model.py b/test_local_model.py new file mode 100644 index 0000000..f3f370a --- /dev/null +++ b/test_local_model.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +使用本地模型的FAISS多模态检索系统测试 +""" + +import os +import sys +import logging +from pathlib import Path +import numpy as np +import faiss +from typing import List, Dict, Any, Optional, Union +import json + +# 设置日志 +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# 设置离线模式 +os.environ['TRANSFORMERS_OFFLINE'] = '1' + +def test_local_model(): + """测试本地模型加载""" + from transformers import AutoModel, AutoTokenizer, AutoProcessor + import torch + from PIL import Image + + # 这里替换为您实际下载的模型路径 + local_model_path = "/root/models/Ops-MM-embedding-v1-7B" + + if not os.path.exists(local_model_path): + logger.error(f"模型路径不存在: {local_model_path}") + logger.info("请先下载模型到指定路径") + return + + logger.info(f"加载本地模型: {local_model_path}") + + try: + # 加载tokenizer + logger.info("加载tokenizer...") + tokenizer = AutoTokenizer.from_pretrained(local_model_path) + + # 加载processor + logger.info("加载processor...") + processor = AutoProcessor.from_pretrained(local_model_path) + + # 加载模型 + logger.info("加载模型...") + model = AutoModel.from_pretrained( + local_model_path, + torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, + device_map="auto" if torch.cuda.device_count() > 0 else None + ) + + logger.info("模型加载成功!") + + # 测试文本编码 + logger.info("测试文本编码...") + text = "这是一个测试文本" + inputs = tokenizer(text, return_tensors="pt") + if torch.cuda.is_available(): + inputs = {k: v.to("cuda") for k, v in inputs.items()} + + with torch.no_grad(): + outputs = model(**inputs) + text_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy() + + logger.info(f"文本编码维度: {text_embedding.shape}") + + # 如果有图像处理功能,测试图像编码 + try: + logger.info("测试图像编码...") + # 创建一个简单的测试图像 + image = Image.new('RGB', (224, 224), color='red') + image_inputs = processor(images=image, return_tensors="pt") + + if torch.cuda.is_available(): + image_inputs = {k: v.to("cuda") for k, v in image_inputs.items()} + + with torch.no_grad(): + image_outputs = model.vision_model(**image_inputs) + image_embedding = image_outputs.pooler_output.cpu().numpy() + + logger.info(f"图像编码维度: {image_embedding.shape}") + + except Exception as e: + logger.error(f"图像编码测试失败: {str(e)}") + + logger.info("本地模型测试完成!") + + except Exception as e: + logger.error(f"模型加载失败: {str(e)}") + logger.error("请确保模型文件已正确下载") + +if __name__ == "__main__": + test_local_model() diff --git a/test_local_retrieval.py b/test_local_retrieval.py new file mode 100644 index 0000000..19ca50b --- /dev/null +++ b/test_local_retrieval.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +测试本地模型和FAISS向量数据库的多模态检索系统 +""" + +import os +import sys +import logging +from pathlib import Path +import time +from PIL import Image +import numpy as np +from multimodal_retrieval_local import MultimodalRetrievalLocal + +# 设置日志 +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# 设置离线模式 +os.environ['TRANSFORMERS_OFFLINE'] = '1' + +def test_text_retrieval(): + """测试文本检索功能""" + print("\n=== 测试文本检索 ===") + + # 初始化检索系统 + print("初始化检索系统...") + retrieval = MultimodalRetrievalLocal( + model_path="/root/models/Ops-MM-embedding-v1-7B", + use_all_gpus=True, + index_path="local_faiss_text_test" + ) + + # 测试文本 + texts = [ + "一只可爱的橘色猫咪在沙发上睡觉", + "城市夜景中的高楼大厦和车流", + "阳光明媚的海滩上,人们在冲浪和晒太阳", + "美味的意大利面配红酒和沙拉", + "雪山上滑雪的运动员" + ] + + # 添加文本 + print("\n添加文本到检索系统...") + text_ids = retrieval.add_texts(texts) + print(f"添加了{len(text_ids)}条文本") + + # 获取统计信息 + stats = retrieval.get_stats() + print(f"检索系统统计信息: {stats}") + + # 测试文本搜索 + print("\n测试文本搜索...") + queries = ["一只猫在睡觉", "都市风光", "海边的景色"] + + for query in queries: + print(f"\n查询: {query}") + results = retrieval.search_by_text(query, k=2) + for i, result in enumerate(results): + print(f" 结果 {i+1}: {result.get('text', 'N/A')} (分数: {result.get('score', 0):.4f})") + + # 保存索引 + print("\n保存索引...") + retrieval.save_index() + + print("\n文本检索测试完成!") + return retrieval + +def test_image_retrieval(): + """测试图像检索功能""" + print("\n=== 测试图像检索 ===") + + # 初始化检索系统 + print("初始化检索系统...") + retrieval = MultimodalRetrievalLocal( + model_path="/root/models/Ops-MM-embedding-v1-7B", + use_all_gpus=True, + index_path="local_faiss_image_test" + ) + + # 创建测试图像 + print("\n创建测试图像...") + images = [] + colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255)] + image_paths = [] + + for i, color in enumerate(colors): + img = Image.new('RGB', (224, 224), color=color) + images.append(img) + + # 保存图像 + img_path = f"/tmp/test_image_{i}.png" + img.save(img_path) + image_paths.append(img_path) + print(f"创建图像: {img_path}") + + # 添加图像 + print("\n添加图像到检索系统...") + metadatas = [{"description": f"测试图像 {i+1}"} for i in range(len(images))] + image_ids = retrieval.add_images(images, metadatas, image_paths) + print(f"添加了{len(image_ids)}张图像") + + # 获取统计信息 + stats = retrieval.get_stats() + print(f"检索系统统计信息: {stats}") + + # 测试图像搜索 + print("\n测试图像搜索...") + query_image = Image.new('RGB', (224, 224), color=(255, 0, 0)) # 红色图像 + + print("\n使用图像查询图像:") + results = retrieval.search_by_image(query_image, k=2, filter_type="image") + for i, result in enumerate(results): + print(f" 结果 {i+1}: {result.get('description', 'N/A')} (分数: {result.get('score', 0):.4f})") + + # 保存索引 + print("\n保存索引...") + retrieval.save_index() + + print("\n图像检索测试完成!") + return retrieval + +def test_cross_modal_retrieval(): + """测试跨模态检索功能""" + print("\n=== 测试跨模态检索 ===") + + # 初始化检索系统 + print("初始化检索系统...") + retrieval = MultimodalRetrievalLocal( + model_path="/root/models/Ops-MM-embedding-v1-7B", + use_all_gpus=True, + index_path="local_faiss_cross_modal_test" + ) + + # 添加文本 + texts = [ + "一只红色的苹果", + "绿色的草地", + "蓝色的大海", + "黄色的向日葵", + "青色的天空" + ] + print("\n添加文本到检索系统...") + text_ids = retrieval.add_texts(texts) + print(f"添加了{len(text_ids)}条文本") + + # 添加图像 + print("\n添加图像到检索系统...") + images = [] + colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255)] + descriptions = ["红色图像", "绿色图像", "蓝色图像", "黄色图像", "青色图像"] + + for i, color in enumerate(colors): + img = Image.new('RGB', (224, 224), color=color) + images.append(img) + + metadatas = [{"description": desc} for desc in descriptions] + image_ids = retrieval.add_images(images, metadatas) + print(f"添加了{len(image_ids)}张图像") + + # 获取统计信息 + stats = retrieval.get_stats() + print(f"检索系统统计信息: {stats}") + + # 测试文搜图 + print("\n测试文搜图...") + query_text = "红色" + print(f"查询文本: {query_text}") + results = retrieval.search_by_text(query_text, k=2, filter_type="image") + for i, result in enumerate(results): + print(f" 结果 {i+1}: {result.get('description', 'N/A')} (分数: {result.get('score', 0):.4f})") + + # 测试图搜文 + print("\n测试图搜文...") + query_image = Image.new('RGB', (224, 224), color=(0, 0, 255)) # 蓝色图像 + print("查询图像: 蓝色图像") + results = retrieval.search_by_image(query_image, k=2, filter_type="text") + for i, result in enumerate(results): + print(f" 结果 {i+1}: {result.get('text', 'N/A')} (分数: {result.get('score', 0):.4f})") + + # 保存索引 + print("\n保存索引...") + retrieval.save_index() + + print("\n跨模态检索测试完成!") + return retrieval + +def main(): + """主函数""" + print("=== 本地多模态检索系统测试 ===") + + # 检查模型路径 + model_path = "/root/models/Ops-MM-embedding-v1-7B" + if not os.path.exists(model_path): + print(f"错误: 模型路径不存在: {model_path}") + print("请先下载模型到指定路径") + return + + # 检查模型文件 + config_file = os.path.join(model_path, "config.json") + if not os.path.exists(config_file): + print(f"错误: 模型配置文件不存在: {config_file}") + print("请确保模型文件已正确下载") + return + + print(f"模型路径验证成功: {model_path}") + + # 运行测试 + try: + # 测试文本检索 + test_text_retrieval() + + # 测试图像检索 + test_image_retrieval() + + # 测试跨模态检索 + test_cross_modal_retrieval() + + print("\n所有测试完成!") + + except Exception as e: + print(f"测试过程中发生错误: {str(e)}") + import traceback + traceback.print_exc() + +if __name__ == "__main__": + main() diff --git a/web_app.log b/web_app.log new file mode 100644 index 0000000..0cc7af7 --- /dev/null +++ b/web_app.log @@ -0,0 +1,63 @@ +nohup: ignoring input +INFO:__main__:🚀 启动时自动初始化VDB多模态检索系统... +INFO:multimodal_retrieval_vdb:检测到 2 个GPU +INFO:multimodal_retrieval_vdb:使用GPU: [0, 1], 主设备: cuda:0 +INFO:multimodal_retrieval_vdb:GPU内存已清理 +INFO:multimodal_retrieval_vdb:正在加载模型到GPU: [0, 1] +INFO:multimodal_retrieval_vdb:GPU内存已清理 +🚀 启动VDB多模态检索Web应用 +============================================================ +访问地址: http://localhost:5000 +新功能: + 🗄️ 百度VDB - 向量数据库存储 + 📊 实时统计 - VDB数据统计信息 + 🔄 数据同步 - 本地文件到VDB存储 +支持功能: + 📝 文搜文 - 文本查找相似文本 + 🖼️ 文搜图 - 文本查找相关图片 + 📝 图搜文 - 图片查找相关文本 + 🖼️ 图搜图 - 图片查找相似图片 + 📤 批量上传 - 图片和文本数据管理 +GPU配置: + 🖥️ 检测到 2 个GPU + GPU 0: NVIDIA GeForce RTX 4090 (23.6GB) + GPU 1: NVIDIA GeForce RTX 4090 (23.6GB) +VDB配置: + 🌐 服务器: http://180.76.96.191:5287 + 👤 用户: root + 🗄️ 数据库: multimodal_retrieval +============================================================ + Loading checkpoint shards: 0%| | 0/4 [00:00: Failed to establish a new connection: [Errno 101] Network is unreachable'))"), '(Request ID: 103ac836-6599-4fe2-a569-aed9c945525c)') +The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release. +WARNING:multimodal_retrieval_vdb:Processor加载失败,使用tokenizer: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /api/models/OpenSearch-AI/Ops-MM-embedding-v1-7B/tree/main/additional_chat_templates?recursive=False&expand=False (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 101] Network is unreachable'))"), '(Request ID: 96f18121-7beb-4e1a-87cd-c50edf682933)') +You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0. +INFO:multimodal_retrieval_vdb:模型加载完成 +INFO:baidu_vdb_backend:✅ 成功连接到百度VDB: http://180.76.96.191:5287 +INFO:baidu_vdb_backend:使用现有数据库: multimodal_retrieval +INFO:baidu_vdb_backend:创建文本向量表: text_vectors +ERROR:baidu_vdb_backend:❌ 创建文本表失败: Database.create_table() missing 1 required positional argument: 'partition' +ERROR:baidu_vdb_backend:❌ 表操作失败: Database.create_table() missing 1 required positional argument: 'partition' +ERROR:multimodal_retrieval_vdb:❌ VDB后端初始化失败: Database.create_table() missing 1 required positional argument: 'partition' +WARNING:multimodal_retrieval_vdb:⚠️ 系统将在无VDB模式下运行,数据将不会持久化 +INFO:multimodal_retrieval_vdb:多模态检索系统初始化完成 +ERROR:__main__:❌ VDB系统自动初始化失败: VDB连接失败 +ERROR:__main__:Traceback (most recent call last): + File "/root/mmeb/web_app_vdb.py", line 667, in auto_initialize + raise Exception("VDB连接失败") +Exception: VDB连接失败 + + * Serving Flask app 'web_app_vdb' + * Debug mode: off +Address already in use +Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port. +败 +ERROR:__main__:Traceback (most recent call last): + File "/root/mmeb/web_app_vdb.py", line 664, in auto_initialize + raise Exception("模型加载失败") +Exception: 模型加载失败 + + * Serving Flask app 'web_app_vdb' + * Debug mode: off +Address already in use +Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port. diff --git a/web_app_local.py b/web_app_local.py new file mode 100644 index 0000000..d6bd994 --- /dev/null +++ b/web_app_local.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +本地多模态检索系统Web应用 +集成本地模型和FAISS向量数据库 +支持文搜文、文搜图、图搜文、图搜图四种检索模式 +""" + +import os +import sys +import logging +import time +import json +import base64 +from io import BytesIO +from pathlib import Path +import numpy as np +from PIL import Image +from flask import Flask, request, jsonify, render_template, send_from_directory +from werkzeug.utils import secure_filename +import torch + +# 设置离线模式 +os.environ['TRANSFORMERS_OFFLINE'] = '1' + +# 导入本地模块 +from multimodal_retrieval_local import MultimodalRetrievalLocal +from optimized_file_handler import OptimizedFileHandler + +# 设置日志 +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# 创建Flask应用 +app = Flask(__name__) + +# 配置 +app.config['UPLOAD_FOLDER'] = '/tmp/mmeb_uploads' +app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB +app.config['MODEL_PATH'] = '/root/models/Ops-MM-embedding-v1-7B' +app.config['INDEX_PATH'] = '/root/mmeb/local_faiss_index' +app.config['ALLOWED_EXTENSIONS'] = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'} + +# 确保上传目录存在 +os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True) + +# 创建临时文件夹 +if not os.path.exists(app.config['UPLOAD_FOLDER']): + os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True) + +# 创建文件处理器 +from optimized_file_handler import OptimizedFileHandler +file_handler = OptimizedFileHandler(local_storage_dir=app.config['UPLOAD_FOLDER']) + +# 全局变量 +retrieval_system = None + +def allowed_file(filename): + """检查文件扩展名是否允许""" + return '.' in filename and \ + filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS'] + +def init_retrieval_system(): + """初始化检索系统""" + global retrieval_system + + if retrieval_system is not None: + return retrieval_system + + logger.info("初始化多模态检索系统...") + + # 检查模型路径 + model_path = app.config['MODEL_PATH'] + if not os.path.exists(model_path): + logger.error(f"模型路径不存在: {model_path}") + raise FileNotFoundError(f"模型路径不存在: {model_path}") + + # 初始化检索系统 + retrieval_system = MultimodalRetrievalLocal( + model_path=model_path, + use_all_gpus=True, + index_path=app.config['INDEX_PATH'] + ) + + logger.info("多模态检索系统初始化完成") + return retrieval_system + +def get_image_base64(image_path): + """将图像转换为base64编码""" + with open(image_path, "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()).decode('utf-8') + return f"data:image/jpeg;base64,{encoded_string}" + +@app.route('/') +def index(): + """首页""" + return render_template('local_index.html') + +@app.route('/api/stats', methods=['GET']) +def get_stats(): + """获取系统统计信息""" + try: + retrieval = init_retrieval_system() + stats = retrieval.get_stats() + return jsonify({"success": True, "stats": stats}) + except Exception as e: + logger.error(f"获取统计信息失败: {str(e)}") + return jsonify({"success": False, "error": str(e)}), 500 + +@app.route('/api/add_text', methods=['POST']) +def add_text(): + """添加文本""" + try: + data = request.json + text = data.get('text') + + if not text: + return jsonify({"success": False, "error": "文本不能为空"}), 400 + + # 使用内存处理文本 + with file_handler.temp_file_context(text.encode('utf-8'), suffix='.txt') as temp_file: + logger.info(f"处理文本: {temp_file}") + + # 初始化检索系统 + retrieval = init_retrieval_system() + + # 添加文本 + metadata = { + "timestamp": time.time(), + "source": "web_upload" + } + + text_ids = retrieval.add_texts([text], [metadata]) + + # 保存索引 + retrieval.save_index() + + return jsonify({ + "success": True, + "message": "文本添加成功", + "text_id": text_ids[0] if text_ids else None + }) + + except Exception as e: + logger.error(f"添加文本失败: {str(e)}") + return jsonify({"success": False, "error": str(e)}), 500 + finally: + # 清理临时文件 + file_handler.cleanup_all_temp_files() + +@app.route('/api/add_image', methods=['POST']) +def add_image(): + """添加图像""" + try: + # 检查是否有文件 + if 'image' not in request.files: + return jsonify({"success": False, "error": "没有上传文件"}), 400 + + file = request.files['image'] + + # 检查文件名 + if file.filename == '': + return jsonify({"success": False, "error": "没有选择文件"}), 400 + + if file and allowed_file(file.filename): + # 读取图像数据 + image_data = file.read() + file_size = len(image_data) + + # 使用文件处理器处理图像 + logger.info(f"处理图像: {file.filename} ({file_size} 字节)") + + # 初始化检索系统 + retrieval = init_retrieval_system() + + # 创建临时文件 + file_obj = BytesIO(image_data) + filename = secure_filename(file.filename) + + # 保存到本地文件系统 + image_path = os.path.join(app.config['UPLOAD_FOLDER'], filename) + with open(image_path, 'wb') as f: + f.write(image_data) + + # 加载图像 + try: + image = Image.open(BytesIO(image_data)) + # 确保图像是RGB模式 + if image.mode != 'RGB': + logger.info(f"将图像从 {image.mode} 转换为 RGB") + image = image.convert('RGB') + + logger.info(f"成功加载图像: {filename}, 格式: {image.format}, 模式: {image.mode}, 大小: {image.size}") + except Exception as e: + logger.error(f"加载图像失败: {filename}, 错误: {str(e)}") + return jsonify({"success": False, "error": f"图像格式不支持: {str(e)}"}), 400 + + # 添加图像 + metadata = { + "filename": filename, + "timestamp": time.time(), + "source": "web_upload", + "size": file_size, + "local_path": image_path + } + + # 添加到检索系统 + image_ids = retrieval.add_images([image], [metadata], [image_path]) + + # 保存索引 + retrieval.save_index() + + return jsonify({ + "success": True, + "message": "图像添加成功", + "image_id": image_ids[0] if image_ids else None + }) + else: + return jsonify({"success": False, "error": "不支持的文件类型"}), 400 + + except Exception as e: + logger.error(f"添加图像失败: {str(e)}") + return jsonify({"success": False, "error": str(e)}), 500 + finally: + # 清理临时文件 + file_handler.cleanup_all_temp_files() + +@app.route('/api/search_by_text', methods=['POST']) +def search_by_text(): + """文本搜索""" + try: + data = request.json + query = data.get('query') + k = int(data.get('k', 5)) + filter_type = data.get('filter_type') # "text", "image" 或 null + + if not query: + return jsonify({"success": False, "error": "查询文本不能为空"}), 400 + + # 初始化检索系统 + retrieval = init_retrieval_system() + + # 执行搜索 + results = retrieval.search_by_text(query, k, filter_type) + + # 处理结果 + processed_results = [] + for result in results: + item = { + "score": result.get("score", 0), + "type": result.get("type") + } + + if result.get("type") == "text": + item["text"] = result.get("text", "") + elif result.get("type") == "image": + if "path" in result and os.path.exists(result["path"]): + item["image"] = get_image_base64(result["path"]) + item["filename"] = os.path.basename(result["path"]) + if "description" in result: + item["description"] = result["description"] + + processed_results.append(item) + + return jsonify({ + "success": True, + "results": processed_results, + "query": query, + "filter_type": filter_type + }) + + except Exception as e: + logger.error(f"文本搜索失败: {str(e)}") + return jsonify({"success": False, "error": str(e)}), 500 + +@app.route('/api/search_by_image', methods=['POST']) +def search_by_image(): + """图像搜索""" + try: + # 检查是否有文件 + if 'image' not in request.files: + return jsonify({"success": False, "error": "没有上传文件"}), 400 + + file = request.files['image'] + k = int(request.form.get('k', 5)) + filter_type = request.form.get('filter_type') # "text", "image" 或 null + + # 检查文件名 + if file.filename == '': + return jsonify({"success": False, "error": "没有选择文件"}), 400 + + if file and allowed_file(file.filename): + # 读取图像数据 + image_data = file.read() + file_size = len(image_data) + + # 根据文件大小选择处理方式 + if file_size <= 5 * 1024 * 1024: # 5MB + # 小文件使用内存处理 + logger.info(f"使用内存处理搜索图像: {file.filename} ({file_size} 字节)") + image = Image.open(BytesIO(image_data)) + + # 初始化检索系统 + retrieval = init_retrieval_system() + + # 执行搜索 + results = retrieval.search_by_image(image, k, filter_type) + else: + # 大文件使用临时文件处理 + with file_handler.temp_file_context(image_data, suffix=os.path.splitext(file.filename)[1]) as temp_file: + logger.info(f"使用临时文件处理搜索图像: {temp_file} ({file_size} 字节)") + + # 初始化检索系统 + retrieval = init_retrieval_system() + + # 加载图像 + image = Image.open(temp_file) + + # 执行搜索 + results = retrieval.search_by_image(image, k, filter_type) + + # 处理结果 + processed_results = [] + for result in results: + item = { + "score": result.get("score", 0), + "type": result.get("type") + } + + if result.get("type") == "text": + item["text"] = result.get("text", "") + elif result.get("type") == "image": + if "path" in result and os.path.exists(result["path"]): + item["image"] = get_image_base64(result["path"]) + item["filename"] = os.path.basename(result["path"]) + if "description" in result: + item["description"] = result["description"] + + processed_results.append(item) + + return jsonify({ + "success": True, + "results": processed_results, + "filter_type": filter_type + }) + else: + return jsonify({"success": False, "error": "不支持的文件类型"}), 400 + + except Exception as e: + logger.error(f"图像搜索失败: {str(e)}") + return jsonify({"success": False, "error": str(e)}), 500 + finally: + # 清理临时文件 + file_handler.cleanup_all_temp_files() + +@app.route('/uploads/') +def uploaded_file(filename): + """提供上传文件的访问""" + return send_from_directory(app.config['UPLOAD_FOLDER'], filename) + +@app.route('/temp/') +def temp_file(filename): + """提供临时文件的访问""" + return send_from_directory(app.config['UPLOAD_FOLDER'], filename) + +@app.route('/api/save_index', methods=['POST']) +def save_index(): + """保存索引""" + try: + # 初始化检索系统 + retrieval = init_retrieval_system() + + # 保存索引 + retrieval.save_index() + + return jsonify({ + "success": True, + "message": "索引保存成功" + }) + + except Exception as e: + logger.error(f"保存索引失败: {str(e)}") + return jsonify({"success": False, "error": str(e)}), 500 + +@app.route('/api/clear_index', methods=['POST']) +def clear_index(): + """清空索引""" + try: + # 初始化检索系统 + retrieval = init_retrieval_system() + + # 清空索引 + retrieval.clear_index() + + return jsonify({ + "success": True, + "message": "索引已清空" + }) + + except Exception as e: + logger.error(f"清空索引失败: {str(e)}") + return jsonify({"success": False, "error": str(e)}), 500 + +@app.route('/api/list_items', methods=['GET']) +def list_items(): + """列出所有索引项""" + try: + # 初始化检索系统 + retrieval = init_retrieval_system() + + # 获取所有项 + items = retrieval.list_items() + + return jsonify({ + "success": True, + "items": items + }) + + except Exception as e: + logger.error(f"列出索引项失败: {str(e)}") + return jsonify({"success": False, "error": str(e)}), 500 + +@app.route('/api/system_info', methods=['GET', 'POST']) +def system_info(): + """获取系统信息""" + try: + # GPU信息 + gpu_info = [] + if torch.cuda.is_available(): + for i in range(torch.cuda.device_count()): + gpu_info.append({ + "id": i, + "name": torch.cuda.get_device_name(i), + "memory_total": torch.cuda.get_device_properties(i).total_memory / (1024 ** 3), + "memory_allocated": torch.cuda.memory_allocated(i) / (1024 ** 3), + "memory_reserved": torch.cuda.memory_reserved(i) / (1024 ** 3) + }) + + # 检索系统信息 + retrieval_info = {} + if retrieval_system is not None: + retrieval_info = retrieval_system.get_stats() + + return jsonify({ + "success": True, + "gpu_info": gpu_info, + "retrieval_info": retrieval_info, + "model_path": app.config['MODEL_PATH'], + "index_path": app.config['INDEX_PATH'] + }) + + except Exception as e: + logger.error(f"获取系统信息失败: {str(e)}") + return jsonify({"success": False, "error": str(e)}), 500 + +if __name__ == '__main__': + try: + # 预初始化检索系统 + init_retrieval_system() + + # 启动Web应用 + app.run(host='0.0.0.0', port=5000, debug=False) + except Exception as e: + logger.error(f"启动Web应用失败: {str(e)}") + sys.exit(1) diff --git a/web_app_vdb.py b/web_app_vdb.py index a8e1ece..34bc6cd 100644 --- a/web_app_vdb.py +++ b/web_app_vdb.py @@ -514,6 +514,57 @@ def get_data_stats(): 'message': f'获取统计失败: {str(e)}' }), 500 +@app.route('/api/data/list', methods=['GET']) +def list_data(): + """获取数据列表""" + try: + # 获取图片文件列表 + image_files = [] + for ext in ALLOWED_EXTENSIONS: + pattern = os.path.join(SAMPLE_IMAGES_FOLDER, f"*.{ext}") + for file_path in glob.glob(pattern): + try: + # 转换为base64 + image_base64 = image_to_base64(file_path) + image_files.append({ + 'filename': os.path.basename(file_path), + 'filepath': file_path, + 'image_base64': image_base64, + 'size': os.path.getsize(file_path) + }) + except Exception as e: + logger.warning(f"处理图片文件失败 {file_path}: {e}") + + # 获取文本文件列表 + text_files = [] + text_file_paths = glob.glob(os.path.join(TEXT_DATA_FOLDER, "*.json")) + text_file_paths.extend(glob.glob(os.path.join(TEXT_DATA_FOLDER, "*.txt"))) + + for text_file in text_file_paths: + try: + text_files.append({ + 'filename': os.path.basename(text_file), + 'filepath': text_file, + 'size': os.path.getsize(text_file) + }) + except Exception as e: + logger.warning(f"处理文本文件失败 {text_file}: {e}") + + return jsonify({ + 'success': True, + 'image_files': image_files, + 'text_files': text_files, + 'image_count': len(image_files), + 'text_count': len(text_files) + }) + + except Exception as e: + logger.error(f"获取数据列表失败: {str(e)}") + return jsonify({ + 'success': False, + 'message': f'获取数据列表失败: {str(e)}' + }), 500 + @app.route('/api/data/clear', methods=['POST']) def clear_data(): """清空所有数据"""