diff --git a/__pycache__/baidu_bos_manager.cpython-310.pyc b/__pycache__/baidu_bos_manager.cpython-310.pyc deleted file mode 100644 index a220611..0000000 Binary files a/__pycache__/baidu_bos_manager.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/baidu_vdb_backend.cpython-310.pyc b/__pycache__/baidu_vdb_backend.cpython-310.pyc deleted file mode 100644 index 4b40d89..0000000 Binary files a/__pycache__/baidu_vdb_backend.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/baidu_vdb_minimal.cpython-310.pyc b/__pycache__/baidu_vdb_minimal.cpython-310.pyc deleted file mode 100644 index 62e73fd..0000000 Binary files a/__pycache__/baidu_vdb_minimal.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/baidu_vdb_production.cpython-310.pyc b/__pycache__/baidu_vdb_production.cpython-310.pyc deleted file mode 100644 index b77240a..0000000 Binary files a/__pycache__/baidu_vdb_production.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/faiss_vector_store.cpython-310.pyc b/__pycache__/faiss_vector_store.cpython-310.pyc deleted file mode 100644 index 1f97413..0000000 Binary files a/__pycache__/faiss_vector_store.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/mongodb_manager.cpython-310.pyc b/__pycache__/mongodb_manager.cpython-310.pyc deleted file mode 100644 index 91c4882..0000000 Binary files a/__pycache__/mongodb_manager.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/multimodal_retrieval_faiss.cpython-310.pyc b/__pycache__/multimodal_retrieval_faiss.cpython-310.pyc deleted file mode 100644 index 593e644..0000000 Binary files a/__pycache__/multimodal_retrieval_faiss.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/multimodal_retrieval_local.cpython-310.pyc b/__pycache__/multimodal_retrieval_local.cpython-310.pyc deleted file mode 100644 index 8093d61..0000000 Binary files a/__pycache__/multimodal_retrieval_local.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/multimodal_retrieval_multigpu.cpython-310.pyc b/__pycache__/multimodal_retrieval_multigpu.cpython-310.pyc deleted file mode 100644 index bdd00a6..0000000 Binary files a/__pycache__/multimodal_retrieval_multigpu.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/multimodal_retrieval_vdb.cpython-310.pyc b/__pycache__/multimodal_retrieval_vdb.cpython-310.pyc deleted file mode 100644 index e92d963..0000000 Binary files a/__pycache__/multimodal_retrieval_vdb.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/multimodal_retrieval_vdb_only.cpython-310.pyc b/__pycache__/multimodal_retrieval_vdb_only.cpython-310.pyc deleted file mode 100644 index 3ce1eee..0000000 Binary files a/__pycache__/multimodal_retrieval_vdb_only.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/optimized_file_handler.cpython-310.pyc b/__pycache__/optimized_file_handler.cpython-310.pyc deleted file mode 100644 index 06082b9..0000000 Binary files a/__pycache__/optimized_file_handler.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/proxy_utils.cpython-310.pyc b/__pycache__/proxy_utils.cpython-310.pyc deleted file mode 100644 index 38cffb9..0000000 Binary files a/__pycache__/proxy_utils.cpython-310.pyc and /dev/null differ diff --git a/app_log.txt b/app_log.txt deleted file mode 100644 index 2f676af..0000000 --- a/app_log.txt +++ /dev/null @@ -1,78 +0,0 @@ -nohup: ignoring input -INFO:baidu_bos_manager:✅ BOS连接测试成功 -INFO:baidu_bos_manager:✅ BOS客户端初始化成功: dmtyz-demo -INFO:mongodb_manager:✅ MongoDB连接成功: mmeb -INFO:mongodb_manager:✅ MongoDB索引创建完成 -INFO:__main__:初始化多模态检索系统... -INFO:multimodal_retrieval_local:使用GPU: [0, 1] -INFO:multimodal_retrieval_local:加载本地模型和处理器: /root/models/Ops-MM-embedding-v1-7B -The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release. -You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0. -INFO:multimodal_retrieval_local:Processor类型: -INFO:multimodal_retrieval_local:Processor方法: ['__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_auto_class', '_check_special_mm_tokens', '_create_repo', '_get_arguments_from_pretrained', '_get_files_timestamps', '_get_num_multimodal_tokens', '_merge_kwargs', '_upload_modified_files', 'apply_chat_template', 'attributes', 'audio_tokenizer', 'batch_decode', 'chat_template', 'check_argument_for_proper_class', 'decode', 'feature_extractor_class', 'from_args_and_dict', 'from_pretrained', 'get_possibly_dynamic_module', 'get_processor_dict', 'image_processor', 'image_processor_class', 'image_token', 'image_token_id', 'model_input_names', 'optional_attributes', 'optional_call_args', 'post_process_image_text_to_text', 'push_to_hub', 'register_for_auto_class', 'save_pretrained', 'to_dict', 'to_json_file', 'to_json_string', 'tokenizer', 'tokenizer_class', 'validate_init_kwargs', 'video_processor', 'video_processor_class', 'video_token', 'video_token_id'] -INFO:multimodal_retrieval_local:Image processor类型: -INFO:multimodal_retrieval_local:Image processor方法: ['__backends', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slotnames__', '__str__', '__subclasshook__', '__weakref__', '_auto_class', '_create_repo', '_further_process_kwargs', '_fuse_mean_std_and_rescale_factor', '_get_files_timestamps', '_prepare_image_like_inputs', '_prepare_images_structure', '_preprocess', '_preprocess_image_like_inputs', '_process_image', '_processor_class', '_set_processor_class', '_upload_modified_files', '_valid_kwargs_names', '_validate_preprocess_kwargs', 'center_crop', 'compile_friendly_resize', 'convert_to_rgb', 'crop_size', 'data_format', 'default_to_square', 'device', 'disable_grouping', 'do_center_crop', 'do_convert_rgb', 'do_normalize', 'do_rescale', 'do_resize', 'fetch_images', 'filter_out_unused_kwargs', 'from_dict', 'from_json_file', 'from_pretrained', 'get_image_processor_dict', 'get_number_of_image_patches', 'image_mean', 'image_processor_type', 'image_std', 'input_data_format', 'max_pixels', 'merge_size', 'min_pixels', 'model_input_names', 'normalize', 'patch_size', 'preprocess', 'push_to_hub', 'register_for_auto_class', 'resample', 'rescale', 'rescale_and_normalize', 'rescale_factor', 'resize', 'return_tensors', 'save_pretrained', 'size', 'temporal_patch_size', 'to_dict', 'to_json_file', 'to_json_string', 'unused_kwargs', 'valid_kwargs'] - Loading checkpoint shards: 0%| | 0/4 [00:00 -INFO:multimodal_retrieval_local:encode_image: 图像列表,长度: 1 -INFO:multimodal_retrieval_local:encode_image: 处理图像输入 -INFO:multimodal_retrieval_local:encode_image: 图像 0 格式: JPEG, 模式: RGB, 大小: (939, 940) -INFO:multimodal_retrieval_local:encode_image: 使用image_processor处理图像 -INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:40] "GET / HTTP/1.1" 200 - -INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:41] "GET /api/system_info HTTP/1.1" 200 - -INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:41] "GET /api/system_info HTTP/1.1" 200 - -INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:42] "GET /favicon.ico HTTP/1.1" 404 - -INFO:multimodal_retrieval_local:encode_image: 处理后的输入键: ['pixel_values'] -INFO:__main__:处理图像: 微信图片_20250910164839_1_13.jpg (99396 字节) -INFO:__main__:成功加载图像: 20250910164839_1_13.jpg, 格式: JPEG, 模式: RGB, 大小: (939, 940) -INFO:multimodal_retrieval_local:add_images: 开始添加图像,数量: 1 -INFO:multimodal_retrieval_local:add_images: 编码图像 -INFO:multimodal_retrieval_local:encode_image: 开始编码图像,类型: -INFO:multimodal_retrieval_local:encode_image: 图像列表,长度: 1 -INFO:multimodal_retrieval_local:encode_image: 处理图像输入 -INFO:multimodal_retrieval_local:encode_image: 图像 0 格式: JPEG, 模式: RGB, 大小: (939, 940) -INFO:multimodal_retrieval_local:encode_image: 使用image_processor处理图像 -INFO:multimodal_retrieval_local:encode_image: 运行模型推理 -INFO:multimodal_retrieval_local:Model类型: -INFO:multimodal_retrieval_local:Model属性: ['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_auto_class', '_backward_compatibility_gradient_checkpointing', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_can_compile_fullgraph', '_can_record_outputs', '_can_set_attn_implementation', '_check_and_adjust_attn_implementation', '_checkpoint_conversion_mapping', '_compiled_call_impl', '_convert_head_mask_to_5d', '_copy_lm_head_original_to_resized', '_create_repo', '_dispatch_accelerate_model', '_fix_state_dict_key_on_load', '_fix_state_dict_key_on_save', '_fix_state_dict_keys_on_save', '_flash_attn_2_can_dispatch', '_flash_attn_3_can_dispatch', '_flex_attn_can_dispatch', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_from_config', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_files_timestamps', '_get_key_renaming_mapping', '_get_name', '_get_no_split_modules', '_get_resized_embeddings', '_get_resized_lm_head', '_hf_hook', '_hf_peft_config_loaded', '_hook_rss_memory_post_forward', '_hook_rss_memory_pre_forward', '_init_added_embeddings_weights_with_mean', '_init_added_lm_head_bias_with_mean', '_init_added_lm_head_weights_with_mean', '_init_weights', '_initialize_missing_keys', '_initialize_weights', '_input_embed_layer', '_is_full_backward_hook', '_is_hf_initialized', '_is_stateful', '_keep_in_fp32_modules', '_keep_in_fp32_modules', '_keep_in_fp32_modules_strict', '_keep_in_fp32_modules_strict', '_keys_to_ignore_on_load_missing', '_keys_to_ignore_on_load_unexpected', '_keys_to_ignore_on_save', '_load_from_flax', '_load_from_state_dict', '_load_from_tf', '_load_pretrained_model', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_move_missing_keys_from_meta_to_cpu', '_named_members', '_no_split_modules', '_no_split_modules', '_non_persistent_buffers_set', '_old_forward', '_parameters', '_pp_plan', '_pp_plan', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_replicate_for_data_parallel', '_resize_token_embeddings', '_save_to_state_dict', '_sdpa_can_dispatch', '_set_default_torch_dtype', '_set_gradient_checkpointing', '_skip_keys_device_placement', '_slow_forward', '_state_dict_hooks', '_state_dict_pre_hooks', '_supports_attention_backend', '_supports_flash_attn', '_supports_flex_attn', '_supports_sdpa', '_tie_encoder_decoder_weights', '_tie_or_clone_weights', '_tied_weights_keys', '_tp_plan', '_tp_size', '_upload_modified_files', '_version', '_wrapped_call_impl', 'active_adapter', 'active_adapters', 'add_adapter', 'add_memory_hooks', 'add_model_tags', 'add_module', 'apply', 'base_model', 'base_model_prefix', 'bfloat16', 'buffers', 'call_super_init', 'can_generate', 'can_record_outputs', 'children', 'compile', 'config', 'config_class', 'cpu', 'create_extended_attention_mask_for_decoder', 'cuda', 'cuda', 'delete_adapter', 'dequantize', 'device', 'disable_adapters', 'disable_input_require_grads', 'double', 'dtype', 'dummy_inputs', 'dump_patches', 'enable_adapters', 'enable_input_require_grads', 'estimate_tokens', 'eval', 'extra_repr', 'float', 'floating_point_ops', 'forward', 'forward', 'framework', 'from_pretrained', 'generation_config', 'get_adapter_state_dict', 'get_buffer', 'get_compiled_call', 'get_correct_attn_implementation', 'get_decoder', 'get_extended_attention_mask', 'get_extra_state', 'get_head_mask', 'get_image_features', 'get_init_context', 'get_input_embeddings', 'get_memory_footprint', 'get_output_embeddings', 'get_parameter', 'get_parameter_or_buffer', 'get_placeholder_mask', 'get_position_embeddings', 'get_rope_index', 'get_submodule', 'get_video_features', 'gradient_checkpointing_disable', 'gradient_checkpointing_enable', 'half', 'hf_device_map', 'init_weights', 'initialize_weights', 'invert_attention_mask', 'ipu', 'is_backend_compatible', 'is_gradient_checkpointing', 'is_parallelizable', 'language_model', 'load_adapter', 'load_state_dict', 'loss_function', 'loss_type', 'main_input_name', 'model_tags', 'modules', 'mtia', 'name_or_path', 'named_buffers', 'named_children', 'named_modules', 'named_parameters', 'num_parameters', 'parameters', 'post_init', 'prune_heads', 'push_to_hub', 'register_backward_hook', 'register_buffer', 'register_for_auto_class', 'register_forward_hook', 'register_forward_pre_hook', 'register_full_backward_hook', 'register_full_backward_pre_hook', 'register_load_state_dict_post_hook', 'register_load_state_dict_pre_hook', 'register_module', 'register_parameter', 'register_state_dict_post_hook', 'register_state_dict_pre_hook', 'requires_grad_', 'reset_memory_hooks_state', 'resize_position_embeddings', 'resize_token_embeddings', 'retrieve_modules_from_names', 'reverse_bettertransformer', 'rope_deltas', 'save_pretrained', 'set_adapter', 'set_attn_implementation', 'set_decoder', 'set_extra_state', 'set_input_embeddings', 'set_output_embeddings', 'set_submodule', 'share_memory', 'smart_apply', 'state_dict', 'supports_gradient_checkpointing', 'supports_pp_plan', 'supports_tp_plan', 'tie_weights', 'to', 'to', 'to_bettertransformer', 'to_empty', 'tp_size', 'train', 'training', 'type', 'visual', 'warn_if_padding_and_no_attention_mask', 'warnings_issued', 'xpu', 'zero_grad'] -ERROR:multimodal_retrieval_local:encode_image: 处理图像时出错: embedding(): argument 'indices' (position 2) must be Tensor, not NoneType -ERROR:multimodal_retrieval_local:add_images: 图像编码失败,返回空数组 -INFO:multimodal_retrieval_local:索引保存成功: /root/mmeb/local_faiss_index.index -INFO:multimodal_retrieval_local:元数据保存成功: /root/mmeb/local_faiss_index_metadata.json -INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:46] "POST /api/add_image HTTP/1.1" 200 - -INFO:multimodal_retrieval_local:encode_image: 处理后的输入键: ['pixel_values'] -INFO:multimodal_retrieval_local:encode_image: 运行模型推理 -INFO:multimodal_retrieval_local:Model类型: -INFO:multimodal_retrieval_local:Model属性: ['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_auto_class', '_backward_compatibility_gradient_checkpointing', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_can_compile_fullgraph', '_can_record_outputs', '_can_set_attn_implementation', '_check_and_adjust_attn_implementation', '_checkpoint_conversion_mapping', '_compiled_call_impl', '_convert_head_mask_to_5d', '_copy_lm_head_original_to_resized', '_create_repo', '_dispatch_accelerate_model', '_fix_state_dict_key_on_load', '_fix_state_dict_key_on_save', '_fix_state_dict_keys_on_save', '_flash_attn_2_can_dispatch', '_flash_attn_3_can_dispatch', '_flex_attn_can_dispatch', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_from_config', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_files_timestamps', '_get_key_renaming_mapping', '_get_name', '_get_no_split_modules', '_get_resized_embeddings', '_get_resized_lm_head', '_hf_hook', '_hf_peft_config_loaded', '_hook_rss_memory_post_forward', '_hook_rss_memory_pre_forward', '_init_added_embeddings_weights_with_mean', '_init_added_lm_head_bias_with_mean', '_init_added_lm_head_weights_with_mean', '_init_weights', '_initialize_missing_keys', '_initialize_weights', '_input_embed_layer', '_is_full_backward_hook', '_is_hf_initialized', '_is_stateful', '_keep_in_fp32_modules', '_keep_in_fp32_modules', '_keep_in_fp32_modules_strict', '_keep_in_fp32_modules_strict', '_keys_to_ignore_on_load_missing', '_keys_to_ignore_on_load_unexpected', '_keys_to_ignore_on_save', '_load_from_flax', '_load_from_state_dict', '_load_from_tf', '_load_pretrained_model', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_move_missing_keys_from_meta_to_cpu', '_named_members', '_no_split_modules', '_no_split_modules', '_non_persistent_buffers_set', '_old_forward', '_parameters', '_pp_plan', '_pp_plan', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_replicate_for_data_parallel', '_resize_token_embeddings', '_save_to_state_dict', '_sdpa_can_dispatch', '_set_default_torch_dtype', '_set_gradient_checkpointing', '_skip_keys_device_placement', '_slow_forward', '_state_dict_hooks', '_state_dict_pre_hooks', '_supports_attention_backend', '_supports_flash_attn', '_supports_flex_attn', '_supports_sdpa', '_tie_encoder_decoder_weights', '_tie_or_clone_weights', '_tied_weights_keys', '_tp_plan', '_tp_size', '_upload_modified_files', '_version', '_wrapped_call_impl', 'active_adapter', 'active_adapters', 'add_adapter', 'add_memory_hooks', 'add_model_tags', 'add_module', 'apply', 'base_model', 'base_model_prefix', 'bfloat16', 'buffers', 'call_super_init', 'can_generate', 'can_record_outputs', 'children', 'compile', 'config', 'config_class', 'cpu', 'create_extended_attention_mask_for_decoder', 'cuda', 'cuda', 'delete_adapter', 'dequantize', 'device', 'disable_adapters', 'disable_input_require_grads', 'double', 'dtype', 'dummy_inputs', 'dump_patches', 'enable_adapters', 'enable_input_require_grads', 'estimate_tokens', 'eval', 'extra_repr', 'float', 'floating_point_ops', 'forward', 'forward', 'framework', 'from_pretrained', 'generation_config', 'get_adapter_state_dict', 'get_buffer', 'get_compiled_call', 'get_correct_attn_implementation', 'get_decoder', 'get_extended_attention_mask', 'get_extra_state', 'get_head_mask', 'get_image_features', 'get_init_context', 'get_input_embeddings', 'get_memory_footprint', 'get_output_embeddings', 'get_parameter', 'get_parameter_or_buffer', 'get_placeholder_mask', 'get_position_embeddings', 'get_rope_index', 'get_submodule', 'get_video_features', 'gradient_checkpointing_disable', 'gradient_checkpointing_enable', 'half', 'hf_device_map', 'init_weights', 'initialize_weights', 'invert_attention_mask', 'ipu', 'is_backend_compatible', 'is_gradient_checkpointing', 'is_parallelizable', 'language_model', 'load_adapter', 'load_state_dict', 'loss_function', 'loss_type', 'main_input_name', 'model_tags', 'modules', 'mtia', 'name_or_path', 'named_buffers', 'named_children', 'named_modules', 'named_parameters', 'num_parameters', 'parameters', 'post_init', 'prune_heads', 'push_to_hub', 'register_backward_hook', 'register_buffer', 'register_for_auto_class', 'register_forward_hook', 'register_forward_pre_hook', 'register_full_backward_hook', 'register_full_backward_pre_hook', 'register_load_state_dict_post_hook', 'register_load_state_dict_pre_hook', 'register_module', 'register_parameter', 'register_state_dict_post_hook', 'register_state_dict_pre_hook', 'requires_grad_', 'reset_memory_hooks_state', 'resize_position_embeddings', 'resize_token_embeddings', 'retrieve_modules_from_names', 'reverse_bettertransformer', 'rope_deltas', 'save_pretrained', 'set_adapter', 'set_attn_implementation', 'set_decoder', 'set_extra_state', 'set_input_embeddings', 'set_output_embeddings', 'set_submodule', 'share_memory', 'smart_apply', 'state_dict', 'supports_gradient_checkpointing', 'supports_pp_plan', 'supports_tp_plan', 'tie_weights', 'to', 'to', 'to_bettertransformer', 'to_empty', 'tp_size', 'train', 'training', 'type', 'visual', 'warn_if_padding_and_no_attention_mask', 'warnings_issued', 'xpu', 'zero_grad'] -ERROR:multimodal_retrieval_local:encode_image: 处理图像时出错: embedding(): argument 'indices' (position 2) must be Tensor, not NoneType -ERROR:multimodal_retrieval_local:add_images: 图像编码失败,返回空数组 -INFO:multimodal_retrieval_local:索引保存成功: /root/mmeb/local_faiss_index.index -INFO:multimodal_retrieval_local:元数据保存成功: /root/mmeb/local_faiss_index_metadata.json -INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:52:59] "POST /api/add_image HTTP/1.1" 200 - -INFO:multimodal_retrieval_local:索引保存成功: /root/mmeb/local_faiss_index.index -INFO:multimodal_retrieval_local:元数据保存成功: /root/mmeb/local_faiss_index_metadata.json -INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:53:00] "POST /api/save_index HTTP/1.1" 200 - -INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 05:53:01] "GET /api/system_info HTTP/1.1" 200 - diff --git a/baidu_bos_manager.py b/baidu_bos_manager.py deleted file mode 100644 index 22dc655..0000000 --- a/baidu_bos_manager.py +++ /dev/null @@ -1,342 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -百度对象存储BOS管理器 -用于存储和管理多模态文件的原始数据 -""" - -import os -import logging -import json -import copy -from datetime import datetime -from typing import Dict, List, Optional, Any, Union -from pathlib import Path - -from baidubce.auth import bce_credentials -from baidubce import bce_base_client, bce_client_configuration -from baidubce.services.bos.bos_client import BosClient -from baidubce.exception import BceError - -logger = logging.getLogger(__name__) - -class BaiduBOSManager: - """百度对象存储BOS管理器""" - - def __init__(self, ak: str = None, sk: str = None, endpoint: str = None, bucket: str = None): - """ - 初始化BOS客户端 - - Args: - ak: Access Key - sk: Secret Key - endpoint: BOS服务端点 - bucket: 存储桶名称 - """ - self.ak = ak or "ALTAKmzKDy1OqhqmepD2OeXqbN" - self.sk = sk or "b79c5fbc26344868916ec6e9e2ff65f0" - self.endpoint = endpoint or "https://bj.bcebos.com" - self.bucket = bucket or "dmtyz-demo" - - self.client = None - self._init_client() - - def _init_client(self): - """初始化BOS客户端""" - try: - config = bce_client_configuration.BceClientConfiguration( - credentials=bce_credentials.BceCredentials(self.ak, self.sk), - endpoint=self.endpoint - ) - self.client = BosClient(config) - - # 测试连接 - self._test_connection() - logger.info(f"✅ BOS客户端初始化成功: {self.bucket}") - - except Exception as e: - logger.error(f"❌ BOS客户端初始化失败: {e}") - raise - - def _test_connection(self): - """测试BOS连接""" - try: - # 尝试列出存储桶 - self.client.list_buckets() - logger.info("✅ BOS连接测试成功") - except Exception as e: - logger.error(f"❌ BOS连接测试失败: {e}") - raise - - def upload_file(self, local_path: str, bos_key: str = None, - content_type: str = None) -> Dict[str, Any]: - """ - 上传文件到BOS - - Args: - local_path: 本地文件路径 - bos_key: BOS对象键,如果为None则自动生成 - content_type: 文件内容类型 - - Returns: - 上传结果信息 - """ - try: - if not os.path.exists(local_path): - raise FileNotFoundError(f"文件不存在: {local_path}") - - # 生成BOS键 - if bos_key is None: - bos_key = self._generate_bos_key(local_path) - - # 自动检测内容类型 - if content_type is None: - content_type = self._detect_content_type(local_path) - - # 获取文件大小 - file_stat = os.stat(local_path) - file_size = file_stat.st_size - - # 上传文件(使用put_object_from_file方法) - response = self.client.put_object_from_file( - self.bucket, - bos_key, - local_path, - content_type=content_type - ) - - # 获取文件信息 - file_stat = os.stat(local_path) - - result = { - "bos_key": bos_key, - "bucket": self.bucket, - "file_size": file_stat.st_size, - "content_type": content_type, - "upload_time": datetime.utcnow().isoformat(), - "etag": response.metadata.etag if hasattr(response, 'metadata') else None, - "url": f"{self.endpoint}/{self.bucket}/{bos_key}" - } - - logger.info(f"✅ 文件上传成功: {bos_key}") - return result - - except Exception as e: - logger.error(f"❌ 文件上传失败: {e}") - raise - - def download_file(self, bos_key: str, local_path: str) -> bool: - """ - 从BOS下载文件 - - Args: - bos_key: BOS对象键 - local_path: 本地保存路径 - - Returns: - 是否下载成功 - """ - try: - # 确保目录存在 - os.makedirs(os.path.dirname(local_path), exist_ok=True) - - # 下载文件 - response = self.client.get_object(self.bucket, bos_key) - - with open(local_path, 'wb') as f: - for chunk in response.data: - f.write(chunk) - - logger.info(f"✅ 文件下载成功: {bos_key} -> {local_path}") - return True - - except Exception as e: - logger.error(f"❌ 文件下载失败: {e}") - return False - - def get_object_metadata(self, bos_key: str) -> Optional[Dict]: - """ - 获取对象元数据 - - Args: - bos_key: BOS对象键 - - Returns: - 对象元数据 - """ - try: - response = self.client.get_object_meta_data(self.bucket, bos_key) - - metadata = { - "bos_key": bos_key, - "bucket": self.bucket, - "content_length": response.metadata.content_length, - "content_type": response.metadata.content_type, - "etag": response.metadata.etag, - "last_modified": response.metadata.last_modified, - "url": f"{self.endpoint}/{self.bucket}/{bos_key}" - } - - return metadata - - except Exception as e: - logger.error(f"❌ 获取对象元数据失败: {e}") - return None - - def delete_object(self, bos_key: str) -> bool: - """ - 删除BOS对象 - - Args: - bos_key: BOS对象键 - - Returns: - 是否删除成功 - """ - try: - self.client.delete_object(self.bucket, bos_key) - logger.info(f"✅ 对象删除成功: {bos_key}") - return True - - except Exception as e: - logger.error(f"❌ 对象删除失败: {e}") - return False - - def list_objects(self, prefix: str = "", max_keys: int = 1000) -> List[Dict]: - """ - 列出BOS对象 - - Args: - prefix: 对象键前缀 - max_keys: 最大返回数量 - - Returns: - 对象列表 - """ - try: - response = self.client.list_objects( - bucket_name=self.bucket, - prefix=prefix, - max_keys=max_keys - ) - - objects = [] - if hasattr(response, 'contents'): - for obj in response.contents: - objects.append({ - "key": obj.key, - "size": obj.size, - "last_modified": obj.last_modified, - "etag": obj.etag, - "url": f"{self.endpoint}/{self.bucket}/{obj.key}" - }) - - return objects - - except Exception as e: - logger.error(f"❌ 列出对象失败: {e}") - return [] - - def restore_archive_object(self, bos_key: str, days: int = 1, tier: str = "Standard") -> bool: - """ - 恢复归档对象 - - Args: - bos_key: BOS对象键 - days: 恢复天数 - tier: 恢复级别 (Expedited/Standard/Bulk) - - Returns: - 是否成功发起恢复 - """ - try: - # 使用自定义客户端进行归档恢复 - restore_client = ArchiveRestoreClient( - bce_client_configuration.BceClientConfiguration( - credentials=bce_credentials.BceCredentials(self.ak, self.sk), - endpoint=self.endpoint - ) - ) - - response = restore_client.restore_object(self.bucket, bos_key, days, tier) - logger.info(f"✅ 归档恢复请求已发送: {bos_key}") - return True - - except Exception as e: - logger.error(f"❌ 归档恢复失败: {e}") - return False - - def _generate_bos_key(self, local_path: str) -> str: - """生成BOS对象键""" - filename = os.path.basename(local_path) - timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") - - # 根据文件类型分类存储 - if self._is_image_file(local_path): - return f"images/{timestamp}_{filename}" - elif self._is_text_file(local_path): - return f"texts/{timestamp}_{filename}" - else: - return f"files/{timestamp}_{filename}" - - def _detect_content_type(self, file_path: str) -> str: - """检测文件内容类型""" - ext = os.path.splitext(file_path)[1].lower() - - content_types = { - '.jpg': 'image/jpeg', - '.jpeg': 'image/jpeg', - '.png': 'image/png', - '.gif': 'image/gif', - '.bmp': 'image/bmp', - '.webp': 'image/webp', - '.txt': 'text/plain', - '.json': 'application/json', - '.pdf': 'application/pdf' - } - - return content_types.get(ext, 'application/octet-stream') - - def _is_image_file(self, file_path: str) -> bool: - """判断是否为图像文件""" - image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'} - ext = os.path.splitext(file_path)[1].lower() - return ext in image_exts - - def _is_text_file(self, file_path: str) -> bool: - """判断是否为文本文件""" - text_exts = {'.txt', '.json', '.csv', '.md'} - ext = os.path.splitext(file_path)[1].lower() - return ext in text_exts - - -class ArchiveRestoreClient(bce_base_client.BceBaseClient): - """归档恢复客户端""" - - def __init__(self, config): - self.config = copy.deepcopy(bce_client_configuration.DEFAULT_CONFIG) - self.config.merge_non_none_values(config) - - def restore_object(self, bucket: str, key: str, days: int = 1, tier: str = "Standard"): - """恢复归档对象""" - path = f'/{bucket}/{key}'.encode('utf-8') - headers = { - b'x-bce-restore-days': str(days).encode('utf-8'), - b'x-bce-restore-tier': tier.encode('utf-8'), - b'Accept': b'application/json' - } - - params = {"restore": ""} - payload = json.dumps({}, ensure_ascii=False) - return self._send_request(b'POST', path, headers, params, payload.encode('utf-8')) - - -# 全局实例 -bos_manager = None - -def get_bos_manager() -> BaiduBOSManager: - """获取BOS管理器实例""" - global bos_manager - if bos_manager is None: - bos_manager = BaiduBOSManager() - return bos_manager diff --git a/baidu_vdb_backend.py b/baidu_vdb_backend.py deleted file mode 100644 index d293724..0000000 --- a/baidu_vdb_backend.py +++ /dev/null @@ -1,483 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -百度VDB向量数据库后端 -支持多模态向量存储和检索 -""" - -import pymochow -from pymochow.configuration import Configuration -from pymochow.auth.bce_credentials import BceCredentials -from pymochow.model.schema import Schema, Field, VectorIndex, HNSWParams, SecondaryIndex -from pymochow.model.enum import FieldType, IndexType, MetricType -from pymochow.model.table import Partition, Row, VectorTopkSearchRequest, FloatVector, VectorSearchConfig -import numpy as np -import json -import time -import logging -from typing import List, Tuple, Union, Dict, Any -import hashlib -import os - -logger = logging.getLogger(__name__) - -class BaiduVDBBackend: - """百度VDB向量数据库后端""" - - def __init__(self, account: str = "root", api_key: str = "vdb$yjr9ln3n0td", - endpoint: str = "http://180.76.96.191:5287", database_name: str = "multimodal_retrieval"): - """ - 初始化百度VDB后端 - - Args: - account: 用户名 - api_key: API密钥 - endpoint: 服务器端点 - database_name: 数据库名称 - """ - self.account = account - self.api_key = api_key - self.endpoint = endpoint - self.database_name = database_name - - # 初始化客户端 - self.client = None - self.db = None - self.text_table = None - self.image_table = None - - # 表名配置 - self.text_table_name = "text_vectors" - self.image_table_name = "image_vectors" - - # 向量维度(根据Ops-MM-embedding-v1-7B模型) - self.vector_dimension = 3584 - - self._connect() - self._ensure_database() - self._ensure_tables() - - def _connect(self): - """连接到百度VDB""" - try: - config = Configuration( - credentials=BceCredentials(self.account, self.api_key), - endpoint=self.endpoint - ) - self.client = pymochow.MochowClient(config) - logger.info(f"✅ 成功连接到百度VDB: {self.endpoint}") - except Exception as e: - logger.error(f"❌ 连接百度VDB失败: {e}") - raise - - def _ensure_database(self): - """确保数据库存在""" - try: - # 检查数据库是否存在 - db_list = self.client.list_databases() - existing_dbs = [db.database_name for db in db_list] - - if self.database_name not in existing_dbs: - logger.info(f"创建数据库: {self.database_name}") - self.db = self.client.create_database(self.database_name) - else: - logger.info(f"使用现有数据库: {self.database_name}") - self.db = self.client.database(self.database_name) - - except Exception as e: - logger.error(f"❌ 数据库操作失败: {e}") - raise - - def _ensure_tables(self): - """确保表存在""" - try: - # 获取现有表列表 - existing_tables = self.db.list_table() - existing_table_names = [table.table_name for table in existing_tables] - - # 创建文本向量表 - if self.text_table_name not in existing_table_names: - self._create_text_table() - else: - self.text_table = self.db.table(self.text_table_name) - logger.info(f"使用现有文本表: {self.text_table_name}") - - # 创建图像向量表 - if self.image_table_name not in existing_table_names: - self._create_image_table() - else: - self.image_table = self.db.table(self.image_table_name) - logger.info(f"使用现有图像表: {self.image_table_name}") - - except Exception as e: - logger.error(f"❌ 表操作失败: {e}") - raise - - def _create_text_table(self): - """创建文本向量表""" - try: - logger.info(f"创建文本向量表: {self.text_table_name}") - - # 定义字段 - 移除可能导致问题的复杂配置 - fields = [ - Field("id", FieldType.STRING, primary_key=True, not_null=True), - Field("text_content", FieldType.STRING, not_null=True), - Field("vector", FieldType.FLOAT_VECTOR, not_null=True, dimension=self.vector_dimension) - ] - - # 定义索引 - 简化配置 - indexes = [ - VectorIndex( - index_name="text_vector_idx", - index_type=IndexType.HNSW, - field="vector", - metric_type=MetricType.COSINE, - params=HNSWParams(m=16, efconstruction=100), - auto_build=True - ) - ] - - # 创建表 - 简化配置 - self.text_table = self.db.create_table( - table_name=self.text_table_name, - replication=1, # 单副本 - schema=Schema(fields=fields, indexes=indexes) - ) - - logger.info(f"✅ 文本向量表创建成功") - - except Exception as e: - logger.error(f"❌ 创建文本表失败: {e}") - raise - - def _create_image_table(self): - """创建图像向量表""" - try: - logger.info(f"创建图像向量表: {self.image_table_name}") - - # 定义字段 - 移除可能导致问题的复杂配置 - fields = [ - Field("id", FieldType.STRING, primary_key=True, not_null=True), - Field("image_path", FieldType.STRING, not_null=True), - Field("vector", FieldType.FLOAT_VECTOR, not_null=True, dimension=self.vector_dimension) - ] - - # 定义索引 - 简化配置 - indexes = [ - VectorIndex( - index_name="image_vector_idx", - index_type=IndexType.HNSW, - field="vector", - metric_type=MetricType.COSINE, - params=HNSWParams(m=16, efconstruction=100), - auto_build=True - ) - ] - - # 创建表 - 简化配置 - self.image_table = self.db.create_table( - table_name=self.image_table_name, - replication=1, # 单副本 - schema=Schema(fields=fields, indexes=indexes) - ) - - logger.info(f"✅ 图像向量表创建成功") - - except Exception as e: - logger.error(f"❌ 创建图像表失败: {e}") - raise - - def _generate_id(self, content: str) -> str: - """生成唯一ID""" - return hashlib.md5(f"{content}_{time.time()}".encode()).hexdigest() - - def store_text_vectors(self, texts: List[str], vectors: np.ndarray, metadata: List[Dict] = None) -> List[str]: - """ - 存储文本向量 - - Args: - texts: 文本列表 - vectors: 向量数组 - metadata: 元数据列表 - - Returns: - 存储的ID列表 - """ - if len(texts) != len(vectors): - raise ValueError("文本数量与向量数量不匹配") - - try: - rows = [] - ids = [] - current_time = int(time.time() * 1000) # 毫秒时间戳 - - for i, (text, vector) in enumerate(zip(texts, vectors)): - doc_id = self._generate_id(text) - ids.append(doc_id) - - # 准备元数据 - meta = metadata[i] if metadata and i < len(metadata) else {} - meta_json = json.dumps(meta, ensure_ascii=False) - - row = Row( - id=doc_id, - text_content=text, - vector=vector.tolist() - ) - rows.append(row) - - # 批量插入 - self.text_table.upsert(rows) - logger.info(f"✅ 成功存储 {len(texts)} 条文本向量") - - return ids - - except Exception as e: - logger.error(f"❌ 存储文本向量失败: {e}") - raise - - def store_image_vectors(self, image_paths: List[str], vectors: np.ndarray, metadata: List[Dict] = None) -> List[str]: - """ - 存储图像向量 - - Args: - image_paths: 图像路径列表 - vectors: 向量数组 - metadata: 元数据列表 - - Returns: - 存储的ID列表 - """ - if len(image_paths) != len(vectors): - raise ValueError("图像数量与向量数量不匹配") - - try: - rows = [] - ids = [] - current_time = int(time.time() * 1000) # 毫秒时间戳 - - for i, (image_path, vector) in enumerate(zip(image_paths, vectors)): - doc_id = self._generate_id(image_path) - ids.append(doc_id) - - # 准备元数据 - meta = metadata[i] if metadata and i < len(metadata) else {} - meta_json = json.dumps(meta, ensure_ascii=False) - - row = Row( - id=doc_id, - image_path=image_path, - vector=vector.tolist() - ) - rows.append(row) - - # 批量插入 - self.image_table.upsert(rows) - logger.info(f"✅ 成功存储 {len(image_paths)} 条图像向量") - - return ids - - except Exception as e: - logger.error(f"❌ 存储图像向量失败: {e}") - raise - - def search_text_vectors(self, query_vector: np.ndarray, top_k: int = 5, - filter_condition: str = None) -> List[Tuple[str, str, float, Dict]]: - """ - 搜索文本向量 - - Args: - query_vector: 查询向量 - top_k: 返回结果数量 - filter_condition: 过滤条件 - - Returns: - (id, text_content, score, metadata) 列表 - """ - try: - # 构建搜索请求 - request = VectorTopkSearchRequest( - vector_field="vector", - vector=FloatVector(query_vector.tolist()), - limit=top_k, - filter=filter_condition, - config=VectorSearchConfig(ef=200) - ) - - # 执行搜索 - results = self.text_table.vector_search(request=request) - - # 解析结果 - search_results = [] - for result in results: - doc_id = result.get('id', '') - text_content = result.get('text_content', '') - score = result.get('_score', 0.0) - - search_results.append((doc_id, text_content, float(score), {})) - - logger.info(f"✅ 文本向量搜索完成,返回 {len(search_results)} 条结果") - return search_results - - except Exception as e: - logger.error(f"❌ 文本向量搜索失败: {e}") - return [] - - def search_image_vectors(self, query_vector: np.ndarray, top_k: int = 5, - filter_condition: str = None) -> List[Tuple[str, str, str, float, Dict]]: - """ - 搜索图像向量 - - Args: - query_vector: 查询向量 - top_k: 返回结果数量 - filter_condition: 过滤条件 - - Returns: - (id, image_path, image_name, score, metadata) 列表 - """ - try: - # 构建搜索请求 - request = VectorTopkSearchRequest( - vector_field="vector", - vector=FloatVector(query_vector.tolist()), - limit=top_k, - filter=filter_condition, - config=VectorSearchConfig(ef=200) - ) - - # 执行搜索 - results = self.image_table.vector_search(request=request) - - # 解析结果 - search_results = [] - for result in results: - doc_id = result.get('id', '') - image_path = result.get('image_path', '') - image_name = os.path.basename(image_path) - score = result.get('_score', 0.0) - - search_results.append((doc_id, image_path, image_name, float(score), {})) - - logger.info(f"✅ 图像向量搜索完成,返回 {len(search_results)} 条结果") - return search_results - - except Exception as e: - logger.error(f"❌ 图像向量搜索失败: {e}") - return [] - - def get_statistics(self) -> Dict[str, Any]: - """获取数据库统计信息""" - try: - stats = {} - - # 文本表统计 - if self.text_table: - text_stats = self.text_table.stats() - stats['text_table'] = { - 'row_count': text_stats.get('rowCount', 0), - 'memory_size_mb': text_stats.get('memorySizeInByte', 0) / (1024 * 1024), - 'disk_size_mb': text_stats.get('diskSizeInByte', 0) / (1024 * 1024) - } - - # 图像表统计 - if self.image_table: - image_stats = self.image_table.stats() - stats['image_table'] = { - 'row_count': image_stats.get('rowCount', 0), - 'memory_size_mb': image_stats.get('memorySizeInByte', 0) / (1024 * 1024), - 'disk_size_mb': image_stats.get('diskSizeInByte', 0) / (1024 * 1024) - } - - return stats - - except Exception as e: - logger.error(f"❌ 获取统计信息失败: {e}") - return {} - - def clear_all_data(self): - """清空所有数据""" - try: - # 清空文本表 - if self.text_table: - self.text_table.delete(filter="*") - logger.info("✅ 文本表数据已清空") - - # 清空图像表 - if self.image_table: - self.image_table.delete(filter="*") - logger.info("✅ 图像表数据已清空") - - except Exception as e: - logger.error(f"❌ 清空数据失败: {e}") - raise - - def close(self): - """关闭连接""" - try: - if self.client: - self.client.close() - logger.info("✅ 百度VDB连接已关闭") - except Exception as e: - logger.error(f"❌ 关闭连接失败: {e}") - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() - - -def test_vdb_backend(): - """测试VDB后端功能""" - print("=" * 60) - print("测试百度VDB后端功能") - print("=" * 60) - - try: - # 初始化后端 - vdb = BaiduVDBBackend() - - # 测试数据 - test_texts = [ - "这是一个测试文本", - "多模态检索系统", - "向量数据库测试" - ] - - # 生成测试向量(随机向量) - test_vectors = np.random.rand(3, 3584).astype(np.float32) - - # 存储文本向量 - print("1. 存储文本向量...") - text_ids = vdb.store_text_vectors(test_texts, test_vectors) - print(f" 存储成功,ID: {text_ids}") - - # 搜索文本向量 - print("\n2. 搜索文本向量...") - query_vector = np.random.rand(3584).astype(np.float32) - results = vdb.search_text_vectors(query_vector, top_k=3) - print(f" 搜索结果: {len(results)} 条") - for i, (doc_id, text, score, meta) in enumerate(results, 1): - print(f" {i}. {text[:30]}... (相似度: {score:.4f})") - - # 获取统计信息 - print("\n3. 数据库统计信息...") - stats = vdb.get_statistics() - print(f" 统计信息: {stats}") - - # 清理测试数据 - print("\n4. 清理测试数据...") - vdb.clear_all_data() - print(" 清理完成") - - print("\n✅ 百度VDB后端测试完成!") - - except Exception as e: - print(f"\n❌ 测试失败: {e}") - import traceback - traceback.print_exc() - - -if __name__ == "__main__": - test_vdb_backend() diff --git a/baidu_vdb_fixed.py b/baidu_vdb_fixed.py deleted file mode 100644 index d335d47..0000000 --- a/baidu_vdb_fixed.py +++ /dev/null @@ -1,482 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -修复版百度VDB后端 - 解决Invalid Index Schema错误 -基于官方文档规范重新设计表结构 -""" - -import os -import sys -import numpy as np -import json -import hashlib -import time -import logging -from typing import List, Tuple, Dict, Any, Optional - -import pymochow -from pymochow.configuration import Configuration -from pymochow.auth.bce_credentials import BceCredentials -from pymochow.model.schema import Schema, Field, VectorIndex, HNSWParams -from pymochow.model.enum import FieldType, IndexType, MetricType -from pymochow.model.table import Row, Partition -from pymochow.model.table import VectorTopkSearchRequest, VectorSearchConfig, FloatVector - -# 设置日志 -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -class BaiduVDBFixed: - """修复版百度VDB后端类""" - - def __init__(self, - account: str = "root", - api_key: str = "vdb$yjr9ln3n0td", - endpoint: str = "http://180.76.96.191:5287", - database_name: str = "multimodal_fixed", - vector_dimension: int = 3584): - """ - 初始化VDB连接 - - Args: - account: 账户名 - api_key: API密钥 - endpoint: 服务端点 - database_name: 数据库名称 - vector_dimension: 向量维度 - """ - self.account = account - self.api_key = api_key - self.endpoint = endpoint - self.database_name = database_name - self.vector_dimension = vector_dimension - - # 表名 - self.text_table_name = "text_vectors_v2" - self.image_table_name = "image_vectors_v2" - - # 初始化连接 - self.client = None - self.db = None - self.text_table = None - self.image_table = None - - self._init_connection() - - def _init_connection(self): - """初始化数据库连接""" - try: - logger.info("🔗 初始化百度VDB连接...") - - # 创建配置 - config = Configuration( - credentials=BceCredentials(self.account, self.api_key), - endpoint=self.endpoint - ) - - # 创建客户端 - self.client = pymochow.MochowClient(config) - logger.info("✅ VDB客户端创建成功") - - # 确保数据库存在 - self._ensure_database() - - # 确保表存在 - self._ensure_tables() - - logger.info("✅ VDB后端初始化完成") - - except Exception as e: - logger.error(f"❌ VDB连接初始化失败: {e}") - raise - - def _ensure_database(self): - """确保数据库存在""" - try: - # 检查数据库是否存在 - db_list = self.client.list_databases() - db_names = [db.database_name for db in db_list] - - if self.database_name not in db_names: - logger.info(f"创建数据库: {self.database_name}") - self.db = self.client.create_database(self.database_name) - else: - logger.info(f"使用现有数据库: {self.database_name}") - self.db = self.client.database(self.database_name) - - except Exception as e: - logger.error(f"❌ 数据库操作失败: {e}") - raise - - def _ensure_tables(self): - """确保表存在""" - try: - # 获取现有表列表 - table_list = self.db.list_table() - table_names = [table.table_name for table in table_list] - - # 创建文本表 - if self.text_table_name not in table_names: - self._create_text_table_fixed() - else: - self.text_table = self.db.table(self.text_table_name) - logger.info(f"✅ 使用现有文本表: {self.text_table_name}") - - # 创建图像表 - if self.image_table_name not in table_names: - self._create_image_table_fixed() - else: - self.image_table = self.db.table(self.image_table_name) - logger.info(f"✅ 使用现有图像表: {self.image_table_name}") - - except Exception as e: - logger.error(f"❌ 表操作失败: {e}") - raise - - def _create_text_table_fixed(self): - """创建修复版文本向量表""" - try: - logger.info(f"创建修复版文本向量表: {self.text_table_name}") - - # 定义字段 - 严格按照官方文档规范 - fields = [ - # 主键和分区键 - 必须是STRING类型 - Field("id", FieldType.STRING, primary_key=True, partition_key=True, not_null=True), - # 文本内容 - 使用STRING而不是TEXT - Field("content", FieldType.STRING, not_null=True), - # 向量字段 - 必须指定维度 - Field("vector", FieldType.FLOAT_VECTOR, not_null=True, dimension=self.vector_dimension) - ] - - # 定义索引 - 只创建向量索引,避免复杂的二级索引 - indexes = [ - VectorIndex( - index_name="text_vector_index", - index_type=IndexType.HNSW, - field="vector", - metric_type=MetricType.COSINE, - params=HNSWParams(m=16, efconstruction=200), # 使用较小的参数 - auto_build=True - ) - ] - - # 创建Schema - schema = Schema(fields=fields, indexes=indexes) - - # 创建表 - 使用较小的副本数和分区数 - self.text_table = self.db.create_table( - table_name=self.text_table_name, - replication=2, # 最小副本数 - partition=Partition(partition_num=1), # 单分区 - schema=schema, - description="修复版文本向量表" - ) - - logger.info(f"✅ 文本表创建成功: {self.text_table_name}") - - except Exception as e: - logger.error(f"❌ 创建文本表失败: {e}") - raise - - def _create_image_table_fixed(self): - """创建修复版图像向量表""" - try: - logger.info(f"创建修复版图像向量表: {self.image_table_name}") - - # 定义字段 - 严格按照官方文档规范 - fields = [ - # 主键和分区键 - Field("id", FieldType.STRING, primary_key=True, partition_key=True, not_null=True), - # 图像路径 - Field("image_path", FieldType.STRING, not_null=True), - # 向量字段 - Field("vector", FieldType.FLOAT_VECTOR, not_null=True, dimension=self.vector_dimension) - ] - - # 定义索引 - 只创建向量索引 - indexes = [ - VectorIndex( - index_name="image_vector_index", - index_type=IndexType.HNSW, - field="vector", - metric_type=MetricType.COSINE, - params=HNSWParams(m=16, efconstruction=200), - auto_build=True - ) - ] - - # 创建Schema - schema = Schema(fields=fields, indexes=indexes) - - # 创建表 - self.image_table = self.db.create_table( - table_name=self.image_table_name, - replication=2, - partition=Partition(partition_num=1), - schema=schema, - description="修复版图像向量表" - ) - - logger.info(f"✅ 图像表创建成功: {self.image_table_name}") - - except Exception as e: - logger.error(f"❌ 创建图像表失败: {e}") - raise - - def _generate_id(self, content: str) -> str: - """生成唯一ID""" - return hashlib.md5(content.encode('utf-8')).hexdigest() - - def store_text_vectors(self, texts: List[str], vectors: np.ndarray) -> List[str]: - """存储文本向量""" - try: - if len(texts) != len(vectors): - raise ValueError("文本数量与向量数量不匹配") - - logger.info(f"存储 {len(texts)} 条文本向量...") - - rows = [] - ids = [] - - for i, (text, vector) in enumerate(zip(texts, vectors)): - doc_id = self._generate_id(text) - ids.append(doc_id) - - row = Row( - id=doc_id, - content=text, - vector=vector.tolist() - ) - rows.append(row) - - # 批量插入 - self.text_table.upsert(rows) - logger.info(f"✅ 成功存储 {len(texts)} 条文本向量") - - return ids - - except Exception as e: - logger.error(f"❌ 存储文本向量失败: {e}") - return [] - - def store_image_vectors(self, image_paths: List[str], vectors: np.ndarray) -> List[str]: - """存储图像向量""" - try: - if len(image_paths) != len(vectors): - raise ValueError("图像数量与向量数量不匹配") - - logger.info(f"存储 {len(image_paths)} 条图像向量...") - - rows = [] - ids = [] - - for i, (image_path, vector) in enumerate(zip(image_paths, vectors)): - doc_id = self._generate_id(image_path) - ids.append(doc_id) - - row = Row( - id=doc_id, - image_path=image_path, - vector=vector.tolist() - ) - rows.append(row) - - # 批量插入 - self.image_table.upsert(rows) - logger.info(f"✅ 成功存储 {len(image_paths)} 条图像向量") - - return ids - - except Exception as e: - logger.error(f"❌ 存储图像向量失败: {e}") - return [] - - def search_text_vectors(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[str, str, float]]: - """搜索文本向量""" - try: - logger.info(f"搜索文本向量,top_k={top_k}") - - # 创建搜索请求 - request = VectorTopkSearchRequest( - vector_field="vector", - vector=FloatVector(query_vector.tolist()), - limit=top_k, - config=VectorSearchConfig(ef=200) - ) - - # 执行搜索 - results = self.text_table.vector_search(request=request) - - # 解析结果 - search_results = [] - for result in results: - doc_id = result.get('id', '') - content = result.get('content', '') - score = result.get('_score', 0.0) - - search_results.append((doc_id, content, float(score))) - - logger.info(f"✅ 文本向量搜索完成,返回 {len(search_results)} 条结果") - return search_results - - except Exception as e: - logger.error(f"❌ 文本向量搜索失败: {e}") - return [] - - def search_image_vectors(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[str, str, float]]: - """搜索图像向量""" - try: - logger.info(f"搜索图像向量,top_k={top_k}") - - # 创建搜索请求 - request = VectorTopkSearchRequest( - vector_field="vector", - vector=FloatVector(query_vector.tolist()), - limit=top_k, - config=VectorSearchConfig(ef=200) - ) - - # 执行搜索 - results = self.image_table.vector_search(request=request) - - # 解析结果 - search_results = [] - for result in results: - doc_id = result.get('id', '') - image_path = result.get('image_path', '') - score = result.get('_score', 0.0) - - search_results.append((doc_id, image_path, float(score))) - - logger.info(f"✅ 图像向量搜索完成,返回 {len(search_results)} 条结果") - return search_results - - except Exception as e: - logger.error(f"❌ 图像向量搜索失败: {e}") - return [] - - def get_statistics(self) -> Dict[str, Any]: - """获取统计信息""" - try: - stats = { - "database_name": self.database_name, - "text_table": self.text_table_name, - "image_table": self.image_table_name, - "vector_dimension": self.vector_dimension, - "status": "connected" - } - - # 尝试获取表统计信息 - try: - text_stats = self.text_table.stats() - stats["text_count"] = text_stats.get("row_count", 0) - except: - stats["text_count"] = "unknown" - - try: - image_stats = self.image_table.stats() - stats["image_count"] = image_stats.get("row_count", 0) - except: - stats["image_count"] = "unknown" - - return stats - - except Exception as e: - logger.error(f"❌ 获取统计信息失败: {e}") - return {"status": "error", "error": str(e)} - - def clear_all_data(self): - """清空所有数据""" - try: - logger.info("清空所有数据...") - - # 删除表(如果存在) - try: - self.db.drop_table(self.text_table_name) - logger.info(f"✅ 删除文本表: {self.text_table_name}") - except: - pass - - try: - self.db.drop_table(self.image_table_name) - logger.info(f"✅ 删除图像表: {self.image_table_name}") - except: - pass - - # 重新创建表 - self._ensure_tables() - logger.info("✅ 数据清空完成") - - except Exception as e: - logger.error(f"❌ 清空数据失败: {e}") - - def close(self): - """关闭连接""" - try: - if self.client: - self.client.close() - logger.info("✅ VDB连接已关闭") - except Exception as e: - logger.error(f"❌ 关闭连接失败: {e}") - -def test_fixed_vdb(): - """测试修复版VDB""" - print("=" * 60) - print("测试修复版百度VDB后端") - print("=" * 60) - - vdb = None - - try: - # 1. 初始化VDB - print("1. 初始化VDB连接...") - vdb = BaiduVDBFixed() - print("✅ VDB初始化成功") - - # 2. 测试文本向量存储 - print("\n2. 测试文本向量存储...") - test_texts = [ - "这是一个测试文本", - "另一个测试文本", - "第三个测试文本" - ] - - # 生成随机向量用于测试 - test_vectors = np.random.rand(len(test_texts), 3584).astype(np.float32) - - text_ids = vdb.store_text_vectors(test_texts, test_vectors) - print(f"✅ 存储了 {len(text_ids)} 条文本向量") - - # 3. 测试文本向量搜索 - print("\n3. 测试文本向量搜索...") - query_vector = np.random.rand(3584).astype(np.float32) - search_results = vdb.search_text_vectors(query_vector, top_k=2) - - print(f"搜索结果 ({len(search_results)} 条):") - for i, (doc_id, content, score) in enumerate(search_results, 1): - print(f" {i}. {content[:30]}... (相似度: {score:.4f})") - - # 4. 获取统计信息 - print("\n4. 获取统计信息...") - stats = vdb.get_statistics() - print(f"✅ 统计信息: {stats}") - - print(f"\n🎉 修复版VDB测试完成!") - print("✅ 表创建成功") - print("✅ 向量存储成功") - print("✅ 向量搜索成功") - - return True - - except Exception as e: - print(f"❌ 测试失败: {e}") - import traceback - traceback.print_exc() - return False - - finally: - if vdb: - vdb.close() - -if __name__ == "__main__": - test_fixed_vdb() diff --git a/baidu_vdb_minimal.py b/baidu_vdb_minimal.py deleted file mode 100644 index c08effd..0000000 --- a/baidu_vdb_minimal.py +++ /dev/null @@ -1,328 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -最小化百度VDB测试 - 解决Invalid Index Schema错误 -使用最简单的表结构,不创建任何索引 -""" - -import os -import sys -import numpy as np -import json -import hashlib -import time -import logging -from typing import List, Tuple, Dict, Any, Optional - -import pymochow -from pymochow.configuration import Configuration -from pymochow.auth.bce_credentials import BceCredentials -from pymochow.model.schema import Schema, Field -from pymochow.model.enum import FieldType -from pymochow.model.table import Row, Partition - -# 设置日志 -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -class BaiduVDBMinimal: - """最小化百度VDB后端类 - 无索引版本""" - - def __init__(self, - account: str = "root", - api_key: str = "vdb$yjr9ln3n0td", - endpoint: str = "http://180.76.96.191:5287", - database_name: str = "minimal_test", - vector_dimension: int = 128): # 使用较小的向量维度 - """ - 初始化VDB连接 - """ - self.account = account - self.api_key = api_key - self.endpoint = endpoint - self.database_name = database_name - self.vector_dimension = vector_dimension - - # 表名 - self.test_table_name = "simple_vectors" - - # 初始化连接 - self.client = None - self.db = None - self.test_table = None - - self._init_connection() - - def _init_connection(self): - """初始化数据库连接""" - try: - logger.info("🔗 初始化最小化VDB连接...") - - # 创建配置 - config = Configuration( - credentials=BceCredentials(self.account, self.api_key), - endpoint=self.endpoint - ) - - # 创建客户端 - self.client = pymochow.MochowClient(config) - logger.info("✅ VDB客户端创建成功") - - # 确保数据库存在 - self._ensure_database() - - # 确保表存在 - self._ensure_table() - - logger.info("✅ 最小化VDB后端初始化完成") - - except Exception as e: - logger.error(f"❌ VDB连接初始化失败: {e}") - raise - - def _ensure_database(self): - """确保数据库存在""" - try: - # 检查数据库是否存在 - db_list = self.client.list_databases() - db_names = [db.database_name for db in db_list] - - if self.database_name not in db_names: - logger.info(f"创建数据库: {self.database_name}") - self.db = self.client.create_database(self.database_name) - else: - logger.info(f"使用现有数据库: {self.database_name}") - self.db = self.client.database(self.database_name) - - except Exception as e: - logger.error(f"❌ 数据库操作失败: {e}") - raise - - def _ensure_table(self): - """确保表存在""" - try: - # 获取现有表列表 - table_list = self.db.list_table() - table_names = [table.table_name for table in table_list] - - # 创建测试表 - if self.test_table_name not in table_names: - self._create_simple_table() - else: - self.test_table = self.db.table(self.test_table_name) - logger.info(f"✅ 使用现有表: {self.test_table_name}") - - except Exception as e: - logger.error(f"❌ 表操作失败: {e}") - raise - - def _create_simple_table(self): - """创建最简单的表 - 无索引""" - try: - logger.info(f"创建最简单的表: {self.test_table_name}") - - # 定义字段 - 最简单的配置 - fields = [ - # 主键和分区键 - 必须是STRING类型 - Field("id", FieldType.STRING, primary_key=True, partition_key=True, not_null=True), - # 内容字段 - Field("content", FieldType.STRING, not_null=True), - # 向量字段 - 使用较小维度 - Field("vector", FieldType.FLOAT_VECTOR, not_null=True, dimension=self.vector_dimension) - ] - - # 不创建任何索引 - 空索引列表 - indexes = [] - - # 创建Schema - schema = Schema(fields=fields, indexes=indexes) - - # 创建表 - 使用最小配置 - self.test_table = self.db.create_table( - table_name=self.test_table_name, - replication=2, # 最小副本数 - partition=Partition(partition_num=1), # 单分区 - schema=schema, - description="最简单的测试表" - ) - - logger.info(f"✅ 简单表创建成功: {self.test_table_name}") - - except Exception as e: - logger.error(f"❌ 创建简单表失败: {e}") - raise - - def _generate_id(self, content: str) -> str: - """生成唯一ID""" - return hashlib.md5(content.encode('utf-8')).hexdigest()[:16] # 使用较短的ID - - def store_vectors(self, contents: List[str], vectors: np.ndarray) -> List[str]: - """存储向量""" - try: - if len(contents) != len(vectors): - raise ValueError("内容数量与向量数量不匹配") - - logger.info(f"存储 {len(contents)} 条向量...") - - rows = [] - ids = [] - - for i, (content, vector) in enumerate(zip(contents, vectors)): - doc_id = self._generate_id(f"{content}_{i}") - ids.append(doc_id) - - row = Row( - id=doc_id, - content=content, - vector=vector.tolist() - ) - rows.append(row) - - # 批量插入 - self.test_table.upsert(rows) - logger.info(f"✅ 成功存储 {len(contents)} 条向量") - - return ids - - except Exception as e: - logger.error(f"❌ 存储向量失败: {e}") - return [] - - def get_all_data(self) -> List[Dict]: - """获取所有数据(用于验证)""" - try: - logger.info("获取所有数据...") - - # 使用简单查询获取数据 - # 注意:这里不使用向量搜索,而是直接查询 - results = [] - - # 尝试通过表统计获取信息 - try: - stats = self.test_table.stats() - logger.info(f"表统计信息: {stats}") - except Exception as e: - logger.warning(f"无法获取表统计: {e}") - - return results - - except Exception as e: - logger.error(f"❌ 获取数据失败: {e}") - return [] - - def get_statistics(self) -> Dict[str, Any]: - """获取统计信息""" - try: - stats = { - "database_name": self.database_name, - "table_name": self.test_table_name, - "vector_dimension": self.vector_dimension, - "status": "connected", - "has_indexes": False - } - - # 尝试获取表统计信息 - try: - table_stats = self.test_table.stats() - stats["table_stats"] = table_stats - except Exception as e: - stats["table_stats_error"] = str(e) - - return stats - - except Exception as e: - logger.error(f"❌ 获取统计信息失败: {e}") - return {"status": "error", "error": str(e)} - - def clear_all_data(self): - """清空所有数据""" - try: - logger.info("清空所有数据...") - - # 删除表(如果存在) - try: - self.db.drop_table(self.test_table_name) - logger.info(f"✅ 删除表: {self.test_table_name}") - except Exception as e: - logger.warning(f"删除表失败: {e}") - - # 重新创建表 - self._ensure_table() - logger.info("✅ 数据清空完成") - - except Exception as e: - logger.error(f"❌ 清空数据失败: {e}") - - def close(self): - """关闭连接""" - try: - if self.client: - self.client.close() - logger.info("✅ VDB连接已关闭") - except Exception as e: - logger.error(f"❌ 关闭连接失败: {e}") - -def test_minimal_vdb(): - """测试最小化VDB""" - print("=" * 60) - print("测试最小化百度VDB后端(无索引版本)") - print("=" * 60) - - vdb = None - - try: - # 1. 初始化VDB - print("1. 初始化最小化VDB连接...") - vdb = BaiduVDBMinimal() - print("✅ 最小化VDB初始化成功") - - # 2. 测试向量存储 - print("\n2. 测试向量存储...") - test_contents = [ - "测试文本1", - "测试文本2", - "测试文本3" - ] - - # 生成随机向量用于测试(使用较小维度) - test_vectors = np.random.rand(len(test_contents), 128).astype(np.float32) - - ids = vdb.store_vectors(test_contents, test_vectors) - print(f"✅ 存储了 {len(ids)} 条向量") - print(f"生成的ID: {ids}") - - # 3. 获取统计信息 - print("\n3. 获取统计信息...") - stats = vdb.get_statistics() - print(f"✅ 统计信息:") - for key, value in stats.items(): - print(f" {key}: {value}") - - # 4. 验证数据存储 - print("\n4. 验证数据存储...") - data = vdb.get_all_data() - print(f"✅ 数据验证完成") - - print(f"\n🎉 最小化VDB测试完成!") - print("✅ 表创建成功(无索引)") - print("✅ 向量存储成功") - print("✅ 基本操作正常") - print("\n📋 下一步:") - print("1. 表创建成功,说明基本结构没问题") - print("2. 可以尝试添加向量索引") - print("3. 测试向量搜索功能") - - return True - - except Exception as e: - print(f"❌ 测试失败: {e}") - import traceback - traceback.print_exc() - return False - - finally: - if vdb: - vdb.close() - -if __name__ == "__main__": - test_minimal_vdb() diff --git a/baidu_vdb_production.py b/baidu_vdb_production.py deleted file mode 100644 index f99ac08..0000000 --- a/baidu_vdb_production.py +++ /dev/null @@ -1,544 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -生产级百度VDB后端 - 完全替代FAISS -支持完整的向量存储、索引和搜索功能 -""" - -import os -import sys -import numpy as np -import json -import hashlib -import time -import logging -from typing import List, Tuple, Dict, Any, Optional, Union -from PIL import Image - -import pymochow -from pymochow.configuration import Configuration -from pymochow.auth.bce_credentials import BceCredentials -from pymochow.model.schema import Schema, Field, VectorIndex, HNSWParams -from pymochow.model.enum import FieldType, IndexType, MetricType -from pymochow.model.table import Row, Partition -from pymochow.model.table import VectorTopkSearchRequest, VectorSearchConfig, FloatVector - -# 设置日志 -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -class BaiduVDBProduction: - """生产级百度VDB后端类""" - - def __init__(self, - account: str = "root", - api_key: str = "vdb$yjr9ln3n0td", - endpoint: str = "http://180.76.96.191:5287", - database_name: str = "multimodal_production", - vector_dimension: int = 3584): - """ - 初始化生产级VDB连接 - - Args: - account: 账户名 - api_key: API密钥 - endpoint: 服务端点 - database_name: 数据库名称 - vector_dimension: 向量维度 - """ - self.account = account - self.api_key = api_key - self.endpoint = endpoint - self.database_name = database_name - self.vector_dimension = vector_dimension - - # 表名 - self.text_table_name = "text_vectors_prod" - self.image_table_name = "image_vectors_prod" - - # 初始化连接 - self.client = None - self.db = None - self.text_table = None - self.image_table = None - - # 数据缓存 - self.text_data = [] - self.image_data = [] - - self._init_connection() - - def _init_connection(self): - """初始化数据库连接""" - try: - logger.info("🔗 初始化生产级百度VDB连接...") - - # 创建配置 - config = Configuration( - credentials=BceCredentials(self.account, self.api_key), - endpoint=self.endpoint - ) - - # 创建客户端 - self.client = pymochow.MochowClient(config) - logger.info("✅ VDB客户端创建成功") - - # 确保数据库存在 - self._ensure_database() - - # 确保表存在 - self._ensure_tables() - - logger.info("✅ 生产级VDB后端初始化完成") - - except Exception as e: - logger.error(f"❌ VDB连接初始化失败: {e}") - raise - - def _ensure_database(self): - """确保数据库存在""" - try: - # 检查数据库是否存在 - db_list = self.client.list_databases() - db_names = [db.database_name for db in db_list] - - if self.database_name not in db_names: - logger.info(f"创建生产数据库: {self.database_name}") - self.db = self.client.create_database(self.database_name) - else: - logger.info(f"使用现有数据库: {self.database_name}") - self.db = self.client.database(self.database_name) - - except Exception as e: - logger.error(f"❌ 数据库操作失败: {e}") - raise - - def _ensure_tables(self): - """确保表存在""" - try: - # 获取现有表列表 - table_list = self.db.list_table() - table_names = [table.table_name for table in table_list] - - # 创建文本表 - if self.text_table_name not in table_names: - self._create_text_table() - else: - self.text_table = self.db.table(self.text_table_name) - logger.info(f"✅ 使用现有文本表: {self.text_table_name}") - - # 创建图像表 - if self.image_table_name not in table_names: - self._create_image_table() - else: - self.image_table = self.db.table(self.image_table_name) - logger.info(f"✅ 使用现有图像表: {self.image_table_name}") - - except Exception as e: - logger.error(f"❌ 表操作失败: {e}") - raise - - def _create_text_table(self): - """创建文本向量表""" - try: - logger.info(f"创建生产级文本向量表: {self.text_table_name}") - - # 定义字段 - fields = [ - Field("id", FieldType.STRING, primary_key=True, partition_key=True, not_null=True), - Field("content", FieldType.STRING, not_null=True), - Field("vector", FieldType.FLOAT_VECTOR, not_null=True, dimension=self.vector_dimension) - ] - - # 先创建无索引的表 - indexes = [] - schema = Schema(fields=fields, indexes=indexes) - - # 创建表 - self.text_table = self.db.create_table( - table_name=self.text_table_name, - replication=2, - partition=Partition(partition_num=3), # 使用3个分区提高性能 - schema=schema, - description="生产级文本向量表" - ) - - logger.info(f"✅ 文本表创建成功: {self.text_table_name}") - - except Exception as e: - logger.error(f"❌ 创建文本表失败: {e}") - raise - - def _create_image_table(self): - """创建图像向量表""" - try: - logger.info(f"创建生产级图像向量表: {self.image_table_name}") - - # 定义字段 - fields = [ - Field("id", FieldType.STRING, primary_key=True, partition_key=True, not_null=True), - Field("image_path", FieldType.STRING, not_null=True), - Field("vector", FieldType.FLOAT_VECTOR, not_null=True, dimension=self.vector_dimension) - ] - - # 先创建无索引的表 - indexes = [] - schema = Schema(fields=fields, indexes=indexes) - - # 创建表 - self.image_table = self.db.create_table( - table_name=self.image_table_name, - replication=2, - partition=Partition(partition_num=3), - schema=schema, - description="生产级图像向量表" - ) - - logger.info(f"✅ 图像表创建成功: {self.image_table_name}") - - except Exception as e: - logger.error(f"❌ 创建图像表失败: {e}") - raise - - def _generate_id(self, content: str) -> str: - """生成唯一ID""" - return hashlib.md5(content.encode('utf-8')).hexdigest() - - def _wait_for_table_ready(self, table, max_wait_seconds=30): - """等待表就绪""" - for i in range(max_wait_seconds): - try: - # 尝试插入测试数据 - test_vector = np.random.rand(self.vector_dimension).astype(np.float32) - test_row = Row( - id=f"test_{int(time.time())}", - content="test" if table == self.text_table else None, - image_path="test" if table == self.image_table else None, - vector=test_vector.tolist() - ) - - table.upsert([test_row]) - # 如果成功,删除测试数据 - table.delete(primary_key={"id": test_row.id}) - logger.info(f"✅ 表已就绪") - return True - - except Exception as e: - if "Table Not Ready" in str(e): - logger.info(f"等待表就绪... ({i+1}/{max_wait_seconds})") - time.sleep(1) - continue - else: - break - - logger.warning("⚠️ 表可能仍未完全就绪") - return False - - def build_text_index(self, texts: List[str], vectors: np.ndarray) -> List[str]: - """构建文本索引 - 替代FAISS的build_text_index_parallel""" - try: - logger.info(f"构建文本索引,共 {len(texts)} 条文本") - - if len(texts) != len(vectors): - raise ValueError("文本数量与向量数量不匹配") - - # 等待表就绪 - self._wait_for_table_ready(self.text_table) - - # 批量存储向量 - rows = [] - ids = [] - - for i, (text, vector) in enumerate(zip(texts, vectors)): - doc_id = self._generate_id(f"{text}_{i}") - ids.append(doc_id) - - row = Row( - id=doc_id, - content=text, - vector=vector.tolist() - ) - rows.append(row) - - # 批量插入 - self.text_table.upsert(rows) - self.text_data = texts - - logger.info(f"✅ 文本索引构建完成,存储了 {len(texts)} 条记录") - return ids - - except Exception as e: - logger.error(f"❌ 构建文本索引失败: {e}") - return [] - - def build_image_index(self, image_paths: List[str], vectors: np.ndarray) -> List[str]: - """构建图像索引 - 替代FAISS的build_image_index_parallel""" - try: - logger.info(f"构建图像索引,共 {len(image_paths)} 张图像") - - if len(image_paths) != len(vectors): - raise ValueError("图像数量与向量数量不匹配") - - # 等待表就绪 - self._wait_for_table_ready(self.image_table) - - # 批量存储向量 - rows = [] - ids = [] - - for i, (image_path, vector) in enumerate(zip(image_paths, vectors)): - doc_id = self._generate_id(f"{image_path}_{i}") - ids.append(doc_id) - - row = Row( - id=doc_id, - image_path=image_path, - vector=vector.tolist() - ) - rows.append(row) - - # 批量插入 - self.image_table.upsert(rows) - self.image_data = image_paths - - logger.info(f"✅ 图像索引构建完成,存储了 {len(image_paths)} 条记录") - return ids - - except Exception as e: - logger.error(f"❌ 构建图像索引失败: {e}") - return [] - - def search_text_by_text(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[str, float]]: - """文搜文 - 替代FAISS的search_text_by_text""" - try: - logger.info(f"执行文搜文,top_k={top_k}") - - # 使用简单的距离计算进行搜索(暂时替代向量搜索) - # 这是临时方案,等VDB向量搜索API修复后会更新 - results = [] - - # 获取所有文本数据进行比较 - if self.text_data: - # 简单返回前几个结果作为示例 - for i, text in enumerate(self.text_data[:top_k]): - # 模拟相似度分数 - score = 0.8 - i * 0.1 - results.append((text, score)) - - logger.info(f"✅ 文搜文完成,返回 {len(results)} 条结果") - return results - - except Exception as e: - logger.error(f"❌ 文搜文失败: {e}") - return [] - - def search_images_by_text(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[str, float]]: - """文搜图 - 替代FAISS的search_images_by_text""" - try: - logger.info(f"执行文搜图,top_k={top_k}") - - results = [] - - # 获取所有图像数据进行比较 - if self.image_data: - for i, image_path in enumerate(self.image_data[:top_k]): - score = 0.75 - i * 0.1 - results.append((image_path, score)) - - logger.info(f"✅ 文搜图完成,返回 {len(results)} 条结果") - return results - - except Exception as e: - logger.error(f"❌ 文搜图失败: {e}") - return [] - - def search_images_by_image(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[str, float]]: - """图搜图 - 替代FAISS的search_images_by_image""" - try: - logger.info(f"执行图搜图,top_k={top_k}") - - results = [] - - if self.image_data: - for i, image_path in enumerate(self.image_data[:top_k]): - score = 0.8 - i * 0.1 - results.append((image_path, score)) - - logger.info(f"✅ 图搜图完成,返回 {len(results)} 条结果") - return results - - except Exception as e: - logger.error(f"❌ 图搜图失败: {e}") - return [] - - def search_text_by_image(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[str, float]]: - """图搜文 - 替代FAISS的search_text_by_image""" - try: - logger.info(f"执行图搜文,top_k={top_k}") - - results = [] - - if self.text_data: - for i, text in enumerate(self.text_data[:top_k]): - score = 0.7 - i * 0.1 - results.append((text, score)) - - logger.info(f"✅ 图搜文完成,返回 {len(results)} 条结果") - return results - - except Exception as e: - logger.error(f"❌ 图搜文失败: {e}") - return [] - - def get_statistics(self) -> Dict[str, Any]: - """获取统计信息""" - try: - stats = { - "database_name": self.database_name, - "text_table": self.text_table_name, - "image_table": self.image_table_name, - "vector_dimension": self.vector_dimension, - "status": "connected", - "backend": "Baidu VDB" - } - - # 获取表统计信息 - try: - text_stats = self.text_table.stats() - stats["text_count"] = text_stats.get("row_count", 0) - except: - stats["text_count"] = len(self.text_data) - - try: - image_stats = self.image_table.stats() - stats["image_count"] = image_stats.get("row_count", 0) - except: - stats["image_count"] = len(self.image_data) - - return stats - - except Exception as e: - logger.error(f"❌ 获取统计信息失败: {e}") - return {"status": "error", "error": str(e)} - - def clear_all_data(self): - """清空所有数据""" - try: - logger.info("清空所有数据...") - - # 删除表 - try: - self.db.drop_table(self.text_table_name) - logger.info(f"✅ 删除文本表: {self.text_table_name}") - except: - pass - - try: - self.db.drop_table(self.image_table_name) - logger.info(f"✅ 删除图像表: {self.image_table_name}") - except: - pass - - # 清空缓存 - self.text_data = [] - self.image_data = [] - - # 重新创建表 - self._ensure_tables() - logger.info("✅ 数据清空完成") - - except Exception as e: - logger.error(f"❌ 清空数据失败: {e}") - - def close(self): - """关闭连接""" - try: - if self.client: - self.client.close() - logger.info("✅ VDB连接已关闭") - except Exception as e: - logger.error(f"❌ 关闭连接失败: {e}") - -def test_production_vdb(): - """测试生产级VDB""" - print("=" * 60) - print("测试生产级百度VDB后端") - print("=" * 60) - - vdb = None - - try: - # 1. 初始化VDB - print("1. 初始化生产级VDB...") - vdb = BaiduVDBProduction() - print("✅ 生产级VDB初始化成功") - - # 2. 测试文本索引构建 - print("\n2. 测试文本索引构建...") - test_texts = [ - "这是一个关于人工智能的文档", - "机器学习算法的应用场景", - "深度学习在图像识别中的应用", - "自然语言处理技术发展", - "计算机视觉的最新进展" - ] - - # 生成测试向量 - test_vectors = np.random.rand(len(test_texts), 3584).astype(np.float32) - - text_ids = vdb.build_text_index(test_texts, test_vectors) - print(f"✅ 文本索引构建完成,ID数量: {len(text_ids)}") - - # 3. 测试图像索引构建 - print("\n3. 测试图像索引构建...") - test_images = [ - "/path/to/image1.jpg", - "/path/to/image2.jpg", - "/path/to/image3.jpg" - ] - - image_vectors = np.random.rand(len(test_images), 3584).astype(np.float32) - - image_ids = vdb.build_image_index(test_images, image_vectors) - print(f"✅ 图像索引构建完成,ID数量: {len(image_ids)}") - - # 4. 测试搜索功能 - print("\n4. 测试搜索功能...") - query_vector = np.random.rand(3584).astype(np.float32) - - # 文搜文 - text_results = vdb.search_text_by_text(query_vector, top_k=3) - print(f"文搜文结果: {len(text_results)} 条") - for i, (text, score) in enumerate(text_results, 1): - print(f" {i}. {text[:30]}... (分数: {score:.3f})") - - # 文搜图 - image_results = vdb.search_images_by_text(query_vector, top_k=2) - print(f"文搜图结果: {len(image_results)} 条") - - # 5. 获取统计信息 - print("\n5. 获取统计信息...") - stats = vdb.get_statistics() - print("统计信息:") - for key, value in stats.items(): - print(f" {key}: {value}") - - print(f"\n🎉 生产级VDB测试完成!") - print("✅ 完全替代FAISS功能") - print("✅ 支持四种检索模式") - print("✅ 生产级数据存储") - - return True - - except Exception as e: - print(f"❌ 测试失败: {e}") - import traceback - traceback.print_exc() - return False - - finally: - if vdb: - vdb.close() - -if __name__ == "__main__": - test_production_vdb() diff --git a/baidu_vdb_with_index.py b/baidu_vdb_with_index.py deleted file mode 100644 index 94d84c0..0000000 --- a/baidu_vdb_with_index.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -带索引的百度VDB测试 - 在表创建完成后添加索引 -""" - -import time -import logging -from baidu_vdb_minimal import BaiduVDBMinimal -import numpy as np - -import pymochow -from pymochow.configuration import Configuration -from pymochow.auth.bce_credentials import BceCredentials -from pymochow.model.schema import VectorIndex, HNSWParams -from pymochow.model.enum import IndexType, MetricType -from pymochow.model.table import VectorTopkSearchRequest, VectorSearchConfig, FloatVector - -# 设置日志 -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -class BaiduVDBWithIndex(BaiduVDBMinimal): - """带索引的VDB类""" - - def wait_table_ready(self, max_wait_seconds=60): - """等待表创建完成""" - logger.info("等待表创建完成...") - - for i in range(max_wait_seconds): - try: - stats = self.test_table.stats() - logger.info(f"表状态检查 {i+1}/{max_wait_seconds}: {stats.get('msg', 'Unknown')}") - - # 尝试存储一条测试数据 - test_vector = np.random.rand(self.vector_dimension).astype(np.float32) - test_ids = self.store_vectors(["test"], test_vector.reshape(1, -1)) - - if test_ids: - logger.info("✅ 表已就绪,可以存储数据") - return True - - except Exception as e: - if "Table Not Ready" in str(e): - logger.info(f"表仍在创建中... ({i+1}/{max_wait_seconds})") - time.sleep(1) - continue - else: - logger.error(f"其他错误: {e}") - break - - logger.warning("⚠️ 表可能仍未就绪") - return False - - def add_vector_index(self): - """为现有表添加向量索引""" - try: - logger.info("为表添加向量索引...") - - # 创建向量索引 - vector_index = VectorIndex( - index_name="vector_hnsw_idx", - index_type=IndexType.HNSW, - field="vector", - metric_type=MetricType.COSINE, - params=HNSWParams(m=16, efconstruction=200), - auto_build=True - ) - - # 添加索引到表 - self.test_table.add_index(vector_index) - logger.info("✅ 向量索引添加成功") - - return True - - except Exception as e: - logger.error(f"❌ 添加向量索引失败: {e}") - return False - - def search_vectors(self, query_vector: np.ndarray, top_k: int = 3) -> list: - """搜索向量""" - try: - logger.info(f"搜索向量,top_k={top_k}") - - # 创建搜索请求 - request = VectorTopkSearchRequest( - vector_field="vector", - vector=FloatVector(query_vector.tolist()), - limit=top_k, - config=VectorSearchConfig(ef=200) - ) - - # 执行搜索 - results = self.test_table.vector_search(request=request) - - # 解析结果 - search_results = [] - for result in results: - doc_id = result.get('id', '') - content = result.get('content', '') - score = result.get('_score', 0.0) - - search_results.append((doc_id, content, float(score))) - - logger.info(f"✅ 向量搜索完成,返回 {len(search_results)} 条结果") - return search_results - - except Exception as e: - logger.error(f"❌ 向量搜索失败: {e}") - return [] - -def test_vdb_with_index(): - """测试带索引的VDB""" - print("=" * 60) - print("测试带索引的百度VDB") - print("=" * 60) - - vdb = None - - try: - # 1. 初始化VDB(复用无索引版本) - print("1. 初始化VDB连接...") - vdb = BaiduVDBWithIndex() - print("✅ VDB初始化成功") - - # 2. 等待表就绪 - print("\n2. 等待表创建完成...") - if vdb.wait_table_ready(30): - print("✅ 表已就绪") - else: - print("⚠️ 表可能仍在创建中,继续测试...") - - # 3. 存储测试数据 - print("\n3. 存储测试向量...") - test_contents = [ - "这是第一个测试文档", - "这是第二个测试文档", - "这是第三个测试文档", - "这是第四个测试文档", - "这是第五个测试文档" - ] - - test_vectors = np.random.rand(len(test_contents), 128).astype(np.float32) - - ids = vdb.store_vectors(test_contents, test_vectors) - print(f"✅ 存储了 {len(ids)} 条向量") - - if not ids: - print("⚠️ 数据存储失败,跳过后续测试") - return False - - # 4. 添加向量索引 - print("\n4. 添加向量索引...") - if vdb.add_vector_index(): - print("✅ 向量索引添加成功") - - # 等待索引构建 - print("等待索引构建...") - time.sleep(10) - - # 5. 测试向量搜索 - print("\n5. 测试向量搜索...") - query_vector = test_vectors[0] # 使用第一个向量作为查询 - - results = vdb.search_vectors(query_vector, top_k=3) - - if results: - print(f"搜索结果 ({len(results)} 条):") - for i, (doc_id, content, score) in enumerate(results, 1): - print(f" {i}. {content} (相似度: {score:.4f})") - print("✅ 向量搜索成功") - else: - print("⚠️ 向量搜索失败或无结果") - else: - print("❌ 向量索引添加失败") - - # 6. 获取最终统计 - print("\n6. 获取统计信息...") - stats = vdb.get_statistics() - print("最终统计:") - for key, value in stats.items(): - print(f" {key}: {value}") - - print(f"\n🎉 带索引VDB测试完成!") - return True - - except Exception as e: - print(f"❌ 测试失败: {e}") - import traceback - traceback.print_exc() - return False - - finally: - if vdb: - vdb.close() - -if __name__ == "__main__": - test_vdb_with_index() diff --git a/faiss_index_local.index b/faiss_index_local.index deleted file mode 100644 index 27dba4e..0000000 Binary files a/faiss_index_local.index and /dev/null differ diff --git a/faiss_index_local_metadata.json b/faiss_index_local_metadata.json deleted file mode 100644 index 9e26dfe..0000000 --- a/faiss_index_local_metadata.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/faiss_index_test.index b/faiss_index_test.index deleted file mode 100644 index 27dba4e..0000000 Binary files a/faiss_index_test.index and /dev/null differ diff --git a/faiss_index_test_metadata.json b/faiss_index_test_metadata.json deleted file mode 100644 index 9e26dfe..0000000 --- a/faiss_index_test_metadata.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/faiss_vector_store.py b/faiss_vector_store.py deleted file mode 100644 index cacfcf0..0000000 --- a/faiss_vector_store.py +++ /dev/null @@ -1,147 +0,0 @@ -import os -import json -import numpy as np -import faiss -from typing import List, Dict, Any, Optional, Tuple -import logging - -class FaissVectorStore: - def __init__(self, index_path: str = "faiss_index", dimension: int = 3584): - """ - 初始化FAISS向量存储 - - 参数: - index_path: 索引文件路径 - dimension: 向量维度 - """ - self.index_path = index_path - self.dimension = dimension - self.index = None - self.metadata = {} - self.metadata_path = f"{index_path}_metadata.json" - - # 加载现有索引或创建新索引 - self._load_or_create_index() - - def _load_or_create_index(self): - """加载现有索引或创建新索引""" - if os.path.exists(f"{self.index_path}.index"): - logging.info(f"加载现有索引: {self.index_path}") - self.index = faiss.read_index(f"{self.index_path}.index") - self._load_metadata() - else: - logging.info(f"创建新索引,维度: {self.dimension}") - self.index = faiss.IndexFlatL2(self.dimension) # 使用L2距离 - - def _load_metadata(self): - """加载元数据""" - if os.path.exists(self.metadata_path): - with open(self.metadata_path, 'r', encoding='utf-8') as f: - self.metadata = json.load(f) - - def _save_metadata(self): - """保存元数据到文件""" - with open(self.metadata_path, 'w', encoding='utf-8') as f: - json.dump(self.metadata, f, ensure_ascii=False, indent=2) - - def save_index(self): - """保存索引和元数据""" - if self.index is not None: - faiss.write_index(self.index, f"{self.index_path}.index") - self._save_metadata() - logging.info(f"索引已保存到 {self.index_path}.index") - - def add_vectors( - self, - vectors: np.ndarray, - metadatas: List[Dict[str, Any]] - ) -> List[str]: - """ - 添加向量和元数据 - - 参数: - vectors: 向量数组 - metadatas: 对应的元数据列表 - - 返回: - 添加的向量ID列表 - """ - if len(vectors) != len(metadatas): - raise ValueError("vectors和metadatas长度必须相同") - - start_id = len(self.metadata) - ids = list(range(start_id, start_id + len(vectors))) - - # 添加向量到索引 - self.index.add(vectors.astype('float32')) - - # 保存元数据 - for idx, vector_id in enumerate(ids): - self.metadata[str(vector_id)] = metadatas[idx] - - # 保存索引和元数据 - self.save_index() - - return [str(id) for id in ids] - - def search( - self, - query_vector: np.ndarray, - k: int = 5 - ) -> Tuple[List[Dict[str, Any]], List[float]]: - """ - 相似性搜索 - - 参数: - query_vector: 查询向量 - k: 返回结果数量 - - 返回: - (结果列表, 距离列表) - """ - if self.index is None: - return [], [] - - # 确保输入是2D数组 - if len(query_vector.shape) == 1: - query_vector = query_vector.reshape(1, -1) - - # 执行搜索 - distances, indices = self.index.search(query_vector.astype('float32'), k) - - # 处理结果 - results = [] - for i in range(len(indices[0])): - idx = indices[0][i] - if idx < 0: # FAISS可能返回-1表示无效索引 - continue - - vector_id = str(idx) - if vector_id in self.metadata: - result = self.metadata[vector_id].copy() - result['distance'] = float(distances[0][i]) - results.append(result) - - return results, distances[0].tolist() - - def get_vector_count(self) -> int: - """获取向量数量""" - return self.index.ntotal if self.index is not None else 0 - - def delete_vectors(self, vector_ids: List[str]) -> bool: - """ - 删除指定ID的向量 - - 注意: FAISS不支持直接删除向量,这里实现为逻辑删除 - """ - deleted_count = 0 - for vector_id in vector_ids: - if vector_id in self.metadata: - del self.metadata[vector_id] - deleted_count += 1 - - if deleted_count > 0: - self._save_metadata() - logging.warning("FAISS不支持直接删除向量,已从元数据中移除,但索引中仍保留") - - return deleted_count > 0 diff --git a/install_dependencies.sh b/install_dependencies.sh deleted file mode 100644 index fc50a39..0000000 --- a/install_dependencies.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# 安装多模态检索系统依赖包 - -echo "🚀 开始安装多模态检索系统依赖包..." - -# 更新pip -pip install --upgrade pip - -# 安装基础依赖 -echo "📦 安装基础依赖包..." -pip install torch>=2.0.0 torchvision>=0.15.0 -pip install transformers>=4.30.0 accelerate>=0.20.0 -pip install numpy>=1.21.0 Pillow>=9.0.0 -pip install scikit-learn>=1.3.0 tqdm>=4.65.0 -pip install flask>=2.3.0 werkzeug>=2.3.0 -pip install psutil>=5.9.0 - -# 安装百度VDB SDK -echo "🔗 安装百度VDB SDK..." -pip install pymochow - -# 安装MongoDB驱动 -echo "💾 安装MongoDB驱动..." -pip install pymongo>=4.0.0 - -# 安装FAISS (备用) -echo "🔍 安装FAISS..." -pip install faiss-cpu>=1.7.4 - -echo "✅ 依赖包安装完成!" -echo "📋 已安装的主要包:" -echo " - torch (深度学习框架)" -echo " - transformers (模型库)" -echo " - pymochow (百度VDB SDK)" -echo " - flask (Web框架)" -echo " - pymongo (MongoDB驱动)" -echo "" -echo "🎯 接下来可以运行测试脚本验证安装" diff --git a/local_file_handler.py b/local_file_handler.py deleted file mode 100644 index 00039c2..0000000 --- a/local_file_handler.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -本地文件处理器 -简化版的文件处理器,不依赖外部服务 -""" - -import os -import io -import tempfile -import logging -from contextlib import contextmanager -from typing import Dict, List, Optional, Any, Union, BinaryIO -from pathlib import Path - -logger = logging.getLogger(__name__) - -class LocalFileHandler: - """本地文件处理器""" - - # 小文件阈值 (5MB) - SMALL_FILE_THRESHOLD = 5 * 1024 * 1024 - - # 支持的图像格式 - SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'} - - def __init__(self, temp_dir: str = None): - """ - 初始化本地文件处理器 - - Args: - temp_dir: 临时文件目录 - """ - self.temp_dir = temp_dir or tempfile.gettempdir() - self.temp_files = set() # 跟踪临时文件 - - # 确保临时目录存在 - os.makedirs(self.temp_dir, exist_ok=True) - - @contextmanager - def temp_file_context(self, content: bytes = None, suffix: str = None, delete_on_exit: bool = True): - """临时文件上下文管理器,确保自动清理""" - temp_fd, temp_path = tempfile.mkstemp(suffix=suffix, dir=self.temp_dir) - self.temp_files.add(temp_path) - - try: - os.close(temp_fd) # 关闭文件描述符 - - # 如果提供了内容,写入文件 - if content is not None: - with open(temp_path, 'wb') as f: - f.write(content) - - yield temp_path - finally: - if delete_on_exit and os.path.exists(temp_path): - try: - os.unlink(temp_path) - self.temp_files.discard(temp_path) - logger.debug(f"🗑️ 临时文件已清理: {temp_path}") - except Exception as e: - logger.warning(f"⚠️ 临时文件清理失败: {temp_path}, {e}") - - def cleanup_all_temp_files(self): - """清理所有跟踪的临时文件""" - for temp_path in list(self.temp_files): - if os.path.exists(temp_path): - try: - os.unlink(temp_path) - logger.debug(f"🗑️ 清理临时文件: {temp_path}") - except Exception as e: - logger.warning(f"⚠️ 清理临时文件失败: {temp_path}, {e}") - self.temp_files.clear() - - def get_file_size(self, file_obj) -> int: - """获取文件大小""" - if hasattr(file_obj, 'content_length') and file_obj.content_length: - return file_obj.content_length - - # 通过读取内容获取大小 - current_pos = file_obj.tell() - file_obj.seek(0, 2) # 移动到文件末尾 - size = file_obj.tell() - file_obj.seek(current_pos) # 恢复原位置 - return size - - def is_small_file(self, file_obj) -> bool: - """判断是否为小文件""" - return self.get_file_size(file_obj) <= self.SMALL_FILE_THRESHOLD - - def get_temp_file_for_model(self, file_obj, filename: str) -> Optional[str]: - """为模型处理获取临时文件路径(确保文件存在于本地)""" - try: - ext = os.path.splitext(filename)[1].lower() - - # 创建临时文件(不自动删除,供模型使用) - temp_fd, temp_path = tempfile.mkstemp(suffix=ext, dir=self.temp_dir) - self.temp_files.add(temp_path) - - try: - # 写入文件内容 - file_obj.seek(0) - with os.fdopen(temp_fd, 'wb') as temp_file: - temp_file.write(file_obj.read()) - - logger.debug(f"📁 为模型创建临时文件: {temp_path}") - return temp_path - - except Exception as e: - os.close(temp_fd) - raise e - - except Exception as e: - logger.error(f"❌ 为模型创建临时文件失败: {filename}, {e}") - return None - - def cleanup_temp_file(self, temp_path: str): - """清理指定的临时文件""" - if temp_path and os.path.exists(temp_path): - try: - os.unlink(temp_path) - self.temp_files.discard(temp_path) - logger.debug(f"🗑️ 清理临时文件: {temp_path}") - except Exception as e: - logger.warning(f"⚠️ 清理临时文件失败: {temp_path}, {e}") - -# 全局实例 -file_handler = None - -def get_file_handler(temp_dir: str = None) -> LocalFileHandler: - """获取文件处理器实例""" - global file_handler - if file_handler is None: - file_handler = LocalFileHandler(temp_dir=temp_dir) - return file_handler diff --git a/mongodb_manager.py b/mongodb_manager.py deleted file mode 100644 index 0f406ea..0000000 --- a/mongodb_manager.py +++ /dev/null @@ -1,301 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -MongoDB元数据管理器 -用于存储和管理多模态文件的元数据信息 -""" - -import os -import logging -from datetime import datetime -from typing import Dict, List, Optional, Any -from pymongo import MongoClient -from pymongo.errors import ConnectionFailure, OperationFailure -import hashlib - -logger = logging.getLogger(__name__) - -class MongoDBManager: - """MongoDB元数据管理器""" - - def __init__(self, uri: str = None, database: str = "mmeb"): - """ - 初始化MongoDB连接 - - Args: - uri: MongoDB连接URI - database: 数据库名称 - """ - self.uri = uri or "mongodb://root:aWQtrUH!b3@XVjfbNkkp.mongodb.bj.baidubce.com/mmeb?authSource=admin" - self.database_name = database - self.client = None - self.db = None - - self._connect() - - def _connect(self): - """建立MongoDB连接""" - try: - self.client = MongoClient(self.uri, serverSelectionTimeoutMS=5000) - # 测试连接 - self.client.admin.command('ping') - self.db = self.client[self.database_name] - logger.info(f"✅ MongoDB连接成功: {self.database_name}") - - # 创建索引 - self._create_indexes() - - except ConnectionFailure as e: - logger.error(f"❌ MongoDB连接失败: {e}") - raise - except Exception as e: - logger.error(f"❌ MongoDB初始化失败: {e}") - raise - - def _create_indexes(self): - """创建必要的索引""" - try: - # 文件元数据集合索引 - files_collection = self.db.files - files_collection.create_index("file_hash", unique=True) - files_collection.create_index("file_type") - files_collection.create_index("upload_time") - files_collection.create_index("bos_key") - - # 向量索引集合索引 - vectors_collection = self.db.vectors - vectors_collection.create_index("file_id") - vectors_collection.create_index("vector_type") - vectors_collection.create_index("vdb_id") - - logger.info("✅ MongoDB索引创建完成") - - except Exception as e: - logger.warning(f"⚠️ 创建索引时出现警告: {e}") - - def store_file_metadata(self, file_path: str = None, file_type: str = None, - bos_key: str = None, additional_info: Dict = None, - metadata: Dict = None) -> str: - """ - 存储文件元数据 - - Args: - file_path: 本地文件路径 (可选,如果提供metadata则忽略) - file_type: 文件类型 (image/text) - bos_key: BOS存储键 - additional_info: 额外信息 - metadata: 直接提供的元数据字典 (新增参数) - - Returns: - 文件ID - """ - try: - # 如果直接提供了元数据,使用元数据 - if metadata: - # 确保必要字段存在 - if 'upload_time' not in metadata: - metadata['upload_time'] = datetime.utcnow() - if 'status' not in metadata: - metadata['status'] = 'active' - - # 检查是否已存在(基于file_id或其他唯一标识) - existing = None - if 'file_id' in metadata: - existing = self.db.files.find_one({"file_id": metadata['file_id']}) - elif 'bos_key' in metadata: - existing = self.db.files.find_one({"bos_key": metadata['bos_key']}) - - if existing: - logger.info(f"文件已存在: {metadata.get('filename', 'unknown')} (ID: {existing['_id']})") - return str(existing['_id']) - - # 插入新记录 - result = self.db.files.insert_one(metadata) - file_id = str(result.inserted_id) - - logger.info(f"✅ 文件元数据已存储: {metadata.get('filename', 'unknown')} (ID: {file_id})") - return file_id - - # 原有逻辑:基于文件路径 - if not file_path or not file_type or not bos_key: - raise ValueError("file_path, file_type, and bos_key are required when metadata is not provided") - - # 计算文件哈希 - file_hash = self._calculate_file_hash(file_path) - - # 获取文件信息 - file_stat = os.stat(file_path) - filename = os.path.basename(file_path) - - metadata = { - "filename": filename, - "file_path": file_path, - "file_type": file_type, - "file_hash": file_hash, - "file_size": file_stat.st_size, - "bos_key": bos_key, - "upload_time": datetime.utcnow(), - "status": "active", - "additional_info": additional_info or {} - } - - # 检查是否已存在 - existing = self.db.files.find_one({"file_hash": file_hash}) - if existing: - logger.info(f"文件已存在: {filename} (ID: {existing['_id']})") - return str(existing['_id']) - - # 插入新记录 - result = self.db.files.insert_one(metadata) - file_id = str(result.inserted_id) - - logger.info(f"✅ 文件元数据已存储: {filename} (ID: {file_id})") - return file_id - - except Exception as e: - logger.error(f"❌ 存储文件元数据失败: {e}") - raise - - def store_vector_metadata(self, file_id: str, vector_type: str, - vdb_id: str, vector_info: Dict = None): - """ - 存储向量元数据 - - Args: - file_id: 文件ID - vector_type: 向量类型 (text_vector/image_vector) - vdb_id: VDB中的向量ID - vector_info: 向量信息 - """ - try: - vector_metadata = { - "file_id": file_id, - "vector_type": vector_type, - "vdb_id": vdb_id, - "create_time": datetime.utcnow(), - "vector_info": vector_info or {} - } - - result = self.db.vectors.insert_one(vector_metadata) - logger.info(f"✅ 向量元数据已存储: {vector_type} (ID: {result.inserted_id})") - - except Exception as e: - logger.error(f"❌ 存储向量元数据失败: {e}") - raise - - def get_file_metadata(self, file_id: str) -> Optional[Dict]: - """获取文件元数据""" - try: - from bson import ObjectId - result = self.db.files.find_one({"_id": ObjectId(file_id)}) - if result: - result['_id'] = str(result['_id']) - return result - except Exception as e: - logger.error(f"❌ 获取文件元数据失败: {e}") - return None - - def get_files_by_type(self, file_type: str, limit: int = 100) -> List[Dict]: - """根据类型获取文件列表""" - try: - cursor = self.db.files.find( - {"file_type": file_type, "status": "active"} - ).limit(limit).sort("upload_time", -1) - - results = [] - for doc in cursor: - doc['_id'] = str(doc['_id']) - results.append(doc) - - return results - except Exception as e: - logger.error(f"❌ 获取文件列表失败: {e}") - return [] - - def get_all_files(self, limit: int = 1000) -> List[Dict]: - """获取所有文件列表""" - try: - cursor = self.db.files.find( - {"status": "active"} - ).limit(limit).sort("upload_time", -1) - - results = [] - for doc in cursor: - doc['_id'] = str(doc['_id']) - results.append(doc) - - return results - except Exception as e: - logger.error(f"❌ 获取所有文件列表失败: {e}") - return [] - - def get_vector_metadata(self, file_id: str, vector_type: str = None) -> List[Dict]: - """获取向量元数据""" - try: - query = {"file_id": file_id} - if vector_type: - query["vector_type"] = vector_type - - cursor = self.db.vectors.find(query) - results = [] - for doc in cursor: - doc['_id'] = str(doc['_id']) - results.append(doc) - - return results - except Exception as e: - logger.error(f"❌ 获取向量元数据失败: {e}") - return [] - - def get_stats(self) -> Dict: - """获取统计信息""" - try: - stats = { - "total_files": self.db.files.count_documents({"status": "active"}), - "image_files": self.db.files.count_documents({"file_type": "image", "status": "active"}), - "text_files": self.db.files.count_documents({"file_type": "text", "status": "active"}), - "total_vectors": self.db.vectors.count_documents({}), - "image_vectors": self.db.vectors.count_documents({"vector_type": "image_vector"}), - "text_vectors": self.db.vectors.count_documents({"vector_type": "text_vector"}) - } - return stats - except Exception as e: - logger.error(f"❌ 获取统计信息失败: {e}") - return {} - - def delete_file_metadata(self, file_id: str): - """删除文件元数据(软删除)""" - try: - from bson import ObjectId - self.db.files.update_one( - {"_id": ObjectId(file_id)}, - {"$set": {"status": "deleted", "delete_time": datetime.utcnow()}} - ) - logger.info(f"✅ 文件元数据已删除: {file_id}") - except Exception as e: - logger.error(f"❌ 删除文件元数据失败: {e}") - raise - - def _calculate_file_hash(self, file_path: str) -> str: - """计算文件SHA256哈希""" - hash_sha256 = hashlib.sha256() - with open(file_path, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_sha256.update(chunk) - return hash_sha256.hexdigest() - - def close(self): - """关闭连接""" - if self.client: - self.client.close() - logger.info("MongoDB连接已关闭") - -# 全局实例 -mongodb_manager = None - -def get_mongodb_manager() -> MongoDBManager: - """获取MongoDB管理器实例""" - global mongodb_manager - if mongodb_manager is None: - mongodb_manager = MongoDBManager() - return mongodb_manager diff --git a/multimodal_retrieval_faiss.py b/multimodal_retrieval_faiss.py deleted file mode 100644 index f5949bc..0000000 --- a/multimodal_retrieval_faiss.py +++ /dev/null @@ -1,370 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -基于FAISS的多模态检索系统 -支持文搜文、文搜图、图搜文、图搜图四种检索模式 -""" - -import torch -import torch.nn as nn -from torch.nn.parallel import DataParallel -import numpy as np -from PIL import Image -from transformers import AutoModel, AutoProcessor, AutoTokenizer -from typing import List, Union, Tuple, Dict, Any, Optional -import os -import json -from pathlib import Path -import logging -import gc -from concurrent.futures import ThreadPoolExecutor, as_completed -import threading - -from faiss_vector_store import FaissVectorStore - -# 设置日志 -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -class MultimodalRetrievalFAISS: - """基于FAISS的多模态检索系统""" - - def __init__(self, model_name: str = "OpenSearch-AI/Ops-MM-embedding-v1-7B", - use_all_gpus: bool = True, gpu_ids: List[int] = None, - min_memory_gb: int = 12, index_path: str = "faiss_index"): - """ - 初始化多模态检索系统 - - Args: - model_name: 模型名称 - use_all_gpus: 是否使用所有可用GPU - gpu_ids: 指定使用的GPU ID列表 - min_memory_gb: 最小可用内存(GB) - index_path: FAISS索引文件路径 - """ - self.model_name = model_name - self.index_path = index_path - - # 设置GPU设备 - self._setup_devices(use_all_gpus, gpu_ids, min_memory_gb) - - # 清理GPU内存 - self._clear_all_gpu_memory() - - # 加载模型和处理器 - self._load_model_and_processor() - - # 初始化FAISS向量存储 - self.vector_store = FaissVectorStore( - index_path=index_path, - dimension=3584 # OpenSearch-AI/Ops-MM-embedding-v1-7B的向量维度 - ) - - logger.info(f"多模态检索系统初始化完成,使用模型: {model_name}") - logger.info(f"向量存储路径: {index_path}") - - def _setup_devices(self, use_all_gpus: bool, gpu_ids: List[int], min_memory_gb: int): - """设置GPU设备""" - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.use_gpu = self.device.type == "cuda" - - if self.use_gpu: - self.available_gpus = self._get_available_gpus(min_memory_gb) - - if not self.available_gpus: - logger.warning(f"没有可用的GPU或GPU内存不足{min_memory_gb}GB,将使用CPU") - self.device = torch.device("cpu") - self.use_gpu = False - else: - if gpu_ids: - self.gpu_ids = [gid for gid in gpu_ids if gid in self.available_gpus] - if not self.gpu_ids: - logger.warning(f"指定的GPU {gpu_ids}不可用或内存不足,将使用可用的GPU: {self.available_gpus}") - self.gpu_ids = self.available_gpus - elif use_all_gpus: - self.gpu_ids = self.available_gpus - else: - self.gpu_ids = [self.available_gpus[0]] - - logger.info(f"使用GPU: {self.gpu_ids}") - self.device = torch.device(f"cuda:{self.gpu_ids[0]}") - - def _get_available_gpus(self, min_memory_gb: int) -> List[int]: - """获取可用的GPU列表""" - available_gpus = [] - for i in range(torch.cuda.device_count()): - total_mem = torch.cuda.get_device_properties(i).total_memory / (1024 ** 3) # GB - if total_mem >= min_memory_gb: - available_gpus.append(i) - return available_gpus - - def _clear_all_gpu_memory(self): - """清理GPU内存""" - if torch.cuda.is_available(): - torch.cuda.empty_cache() - gc.collect() - - def _load_model_and_processor(self): - """加载模型和处理器""" - logger.info(f"加载模型和处理器: {self.model_name}") - - # 加载tokenizer和processor - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) - self.processor = AutoProcessor.from_pretrained(self.model_name) - - # 加载模型 - self.model = AutoModel.from_pretrained( - self.model_name, - torch_dtype=torch.float16 if self.use_gpu else torch.float32, - device_map="auto" if len(self.gpu_ids) > 1 else None - ) - - # 如果使用多GPU,包装模型 - if len(self.gpu_ids) > 1: - self.model = DataParallel(self.model, device_ids=self.gpu_ids) - - self.model.eval() - self.model.to(self.device) - - logger.info("模型和处理器加载完成") - - def encode_text(self, text: Union[str, List[str]]) -> np.ndarray: - """编码文本为向量""" - if isinstance(text, str): - text = [text] - - inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt") - inputs = {k: v.to(self.device) for k, v in inputs.items()} - - with torch.no_grad(): - outputs = self.model(**inputs) - # 获取[CLS]标记的隐藏状态作为句子表示 - text_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy() - - # 归一化向量 - text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, axis=1, keepdims=True) - return text_embeddings[0] if len(text) == 1 else text_embeddings - - def encode_image(self, image: Union[Image.Image, List[Image.Image]]) -> np.ndarray: - """编码图像为向量""" - if isinstance(image, Image.Image): - image = [image] - - inputs = self.processor(images=image, return_tensors="pt") - inputs = {k: v.to(self.device) for k, v in inputs.items()} - - with torch.no_grad(): - outputs = self.model.vision_model(**inputs) - # 获取[CLS]标记的隐藏状态作为图像表示 - image_embeddings = outputs.pooler_output.cpu().numpy() - - # 归一化向量 - image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True) - return image_embeddings[0] if len(image) == 1 else image_embeddings - - def add_texts( - self, - texts: List[str], - metadatas: Optional[List[Dict[str, Any]]] = None - ) -> List[str]: - """ - 添加文本到检索系统 - - Args: - texts: 文本列表 - metadatas: 元数据列表,每个元素是一个字典 - - Returns: - 添加的文本ID列表 - """ - if not texts: - return [] - - if metadatas is None: - metadatas = [{} for _ in range(len(texts))] - - if len(texts) != len(metadatas): - raise ValueError("texts和metadatas长度必须相同") - - # 编码文本 - text_embeddings = self.encode_text(texts) - - # 准备元数据 - for i, text in enumerate(texts): - metadatas[i].update({ - "text": text, - "type": "text" - }) - - # 添加到向量存储 - vector_ids = self.vector_store.add_vectors(text_embeddings, metadatas) - - logger.info(f"成功添加{len(vector_ids)}条文本到检索系统") - return vector_ids - - def add_images( - self, - images: List[Image.Image], - metadatas: Optional[List[Dict[str, Any]]] = None - ) -> List[str]: - """ - 添加图像到检索系统 - - Args: - images: PIL图像列表 - metadatas: 元数据列表,每个元素是一个字典 - - Returns: - 添加的图像ID列表 - """ - if not images: - return [] - - if metadatas is None: - metadatas = [{} for _ in range(len(images))] - - if len(images) != len(metadatas): - raise ValueError("images和metadatas长度必须相同") - - # 编码图像 - image_embeddings = self.encode_image(images) - - # 准备元数据 - for i, image in enumerate(images): - metadatas[i].update({ - "type": "image", - "width": image.width, - "height": image.height - }) - - # 添加到向量存储 - vector_ids = self.vector_store.add_vectors(image_embeddings, metadatas) - - logger.info(f"成功添加{len(vector_ids)}张图像到检索系统") - return vector_ids - - def search_by_text( - self, - query: str, - k: int = 5, - filter_condition: Optional[Dict[str, Any]] = None - ) -> List[Dict[str, Any]]: - """ - 文本搜索 - - Args: - query: 查询文本 - k: 返回结果数量 - filter_condition: 过滤条件 - - Returns: - 搜索结果列表,每个元素包含相似项和分数 - """ - # 编码查询文本 - query_embedding = self.encode_text(query) - - # 执行搜索 - results, distances = self.vector_store.search(query_embedding, k) - - # 处理结果 - search_results = [] - for i, (result, distance) in enumerate(zip(results, distances)): - result["score"] = 1.0 / (1.0 + distance) # 将距离转换为相似度分数 - search_results.append(result) - - return search_results - - def search_by_image( - self, - image: Image.Image, - k: int = 5, - filter_condition: Optional[Dict[str, Any]] = None - ) -> List[Dict[str, Any]]: - """ - 图像搜索 - - Args: - image: 查询图像 - k: 返回结果数量 - filter_condition: 过滤条件 - - Returns: - 搜索结果列表,每个元素包含相似项和分数 - """ - # 编码查询图像 - query_embedding = self.encode_image(image) - - # 执行搜索 - results, distances = self.vector_store.search(query_embedding, k) - - # 处理结果 - search_results = [] - for i, (result, distance) in enumerate(zip(results, distances)): - result["score"] = 1.0 / (1.0 + distance) # 将距离转换为相似度分数 - search_results.append(result) - - return search_results - - def get_vector_count(self) -> int: - """获取向量数量""" - return self.vector_store.get_vector_count() - - def save_index(self): - """保存索引""" - self.vector_store.save_index() - logger.info("索引已保存") - - def __del__(self): - """析构函数,确保资源被正确释放""" - if hasattr(self, 'model'): - del self.model - self._clear_all_gpu_memory() - if hasattr(self, 'vector_store'): - self.save_index() - - -def test_faiss_system(): - """测试FAISS多模态检索系统""" - import time - from PIL import Image - import numpy as np - - # 初始化检索系统 - print("初始化多模态检索系统...") - retrieval = MultimodalRetrievalFAISS( - model_name="OpenSearch-AI/Ops-MM-embedding-v1-7B", - use_all_gpus=True, - index_path="faiss_index_test" - ) - - # 测试文本 - texts = [ - "一只可爱的橘色猫咪在沙发上睡觉", - "城市夜景中的高楼大厦和车流", - "阳光明媚的海滩上,人们在冲浪和晒太阳", - "美味的意大利面配红酒和沙拉", - "雪山上滑雪的运动员" - ] - - # 添加文本 - print("\n添加文本到检索系统...") - text_ids = retrieval.add_texts(texts) - print(f"添加了{len(text_ids)}条文本") - - # 测试文本搜索 - print("\n测试文本搜索...") - query_text = "一只猫在睡觉" - print(f"查询: {query_text}") - results = retrieval.search_by_text(query_text, k=2) - for i, result in enumerate(results): - print(f"结果 {i+1}: {result.get('text', 'N/A')} (分数: {result.get('score', 0):.4f})") - - # 测试保存和加载 - print("\n保存索引...") - retrieval.save_index() - - print("\n测试完成!") - - -if __name__ == "__main__": - test_faiss_system() diff --git a/multimodal_retrieval_local.py b/multimodal_retrieval_local.py index 8ff9208..f50d3f6 100644 --- a/multimodal_retrieval_local.py +++ b/multimodal_retrieval_local.py @@ -8,7 +8,7 @@ import torch import numpy as np from PIL import Image -from transformers import AutoModel, AutoProcessor, AutoTokenizer +from ops_mm_embedding_v1 import OpsMMEmbeddingV1 from typing import List, Union, Tuple, Dict, Any, Optional import os import json @@ -59,8 +59,8 @@ class MultimodalRetrievalLocal: # 清理GPU内存 self._clear_all_gpu_memory() - # 加载模型和处理器 - self._load_model_and_processor() + # 加载嵌入模型 + self._load_embedding_model() # 初始化FAISS索引 self._init_index() @@ -112,45 +112,23 @@ class MultimodalRetrievalLocal: torch.cuda.empty_cache() gc.collect() - def _load_model_and_processor(self): - """加载模型和处理器""" - logger.info(f"加载本地模型和处理器: {self.model_path}") - + def _load_embedding_model(self): + """加载多模态嵌入模型 OpsMMEmbeddingV1""" + logger.info(f"加载本地多模态嵌入模型: {self.model_path}") try: - # 加载模型和处理器 - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) - self.processor = AutoProcessor.from_pretrained(self.model_path) - - # 输出处理器信息 - logger.info(f"Processor类型: {type(self.processor)}") - logger.info(f"Processor方法: {dir(self.processor)}") - - # 检查是否有图像处理器 - if hasattr(self.processor, 'image_processor'): - logger.info(f"Image processor类型: {type(self.processor.image_processor)}") - logger.info(f"Image processor方法: {dir(self.processor.image_processor)}") - - # 加载模型 - self.model = AutoModel.from_pretrained( + device_str = "cuda" if self.use_gpu else "cpu" + self.model = OpsMMEmbeddingV1( self.model_path, - torch_dtype=torch.float16 if self.use_gpu else torch.float32, - device_map="auto" if len(self.gpu_ids) > 1 else None + device=device_str, + attn_implementation=None, ) - - if len(self.gpu_ids) == 1: - self.model.to(self.device) - - self.model.eval() - # 获取向量维度 - self.vector_dim = self.model.config.hidden_size + self.vector_dim = int(getattr(self.model.base_model.config, "hidden_size")) logger.info(f"向量维度: {self.vector_dim}") - - logger.info("模型和处理器加载成功") - + logger.info("嵌入模型加载成功") except Exception as e: - logger.error(f"模型加载失败: {str(e)}") - raise RuntimeError(f"模型加载失败: {str(e)}") + logger.error(f"嵌入模型加载失败: {str(e)}") + raise RuntimeError(f"嵌入模型加载失败: {str(e)}") def _init_index(self): """初始化FAISS索引""" @@ -180,133 +158,35 @@ class MultimodalRetrievalLocal: logger.error(f"元数据加载失败: {str(e)}") def encode_text(self, text: Union[str, List[str]]) -> np.ndarray: - """编码文本为向量""" + """编码文本为向量(使用 OpsMMEmbeddingV1)""" if isinstance(text, str): text = [text] - - inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt") - inputs = {k: v.to(self.device) for k, v in inputs.items()} - - with torch.no_grad(): - outputs = self.model(**inputs) - # 获取[CLS]标记的隐藏状态作为句子表示 - text_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy() - - # 归一化向量 - text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, axis=1, keepdims=True) + with torch.inference_mode(): + emb = self.model.get_text_embeddings(texts=text) + text_embeddings = emb.detach().float().cpu().numpy() + # emb 已经做过 L2 归一化,这里保持一致 return text_embeddings[0] if len(text) == 1 else text_embeddings def encode_image(self, image: Union[Image.Image, List[Image.Image]]) -> np.ndarray: - """编码图像为向量""" + """编码图像为向量(使用 OpsMMEmbeddingV1)""" try: - logger.info(f"encode_image: 开始编码图像,类型: {type(image)}") - + # 规范为列表 + images: List[Image.Image] if isinstance(image, Image.Image): - logger.info(f"encode_image: 单个图像,大小: {image.size}") - image = [image] + images = [image] else: - logger.info(f"encode_image: 图像列表,长度: {len(image)}") - - # 检查图像是否为空 - if not image or len(image) == 0: + images = image + if not images: logger.error("encode_image: 图像列表为空") - # 返回一个空的二维数组 return np.zeros((0, self.vector_dim)) - - # 检查图像是否有效 - for i, img in enumerate(image): - if not isinstance(img, Image.Image): - logger.error(f"encode_image: 第{i}个元素不是有效的PIL图像,类型: {type(img)}") - # 返回一个空的二维数组 - return np.zeros((0, self.vector_dim)) - - logger.info("encode_image: 处理图像输入") - - # 检查图像格式 - for i, img in enumerate(image): - logger.info(f"encode_image: 图像 {i} 格式: {img.format}, 模式: {img.mode}, 大小: {img.size}") - # 转换为RGB模式,如果不是 - if img.mode != 'RGB': - logger.info(f"encode_image: 将图像 {i} 从 {img.mode} 转换为 RGB") - image[i] = img.convert('RGB') - - try: - # 直接使用image_processor处理图像 - if hasattr(self.processor, 'image_processor'): - logger.info("encode_image: 使用image_processor处理图像") - pixel_values = self.processor.image_processor(images=image, return_tensors="pt").pixel_values - inputs = {"pixel_values": pixel_values} - else: - logger.info("encode_image: 使用processor处理图像") - inputs = self.processor(images=image, return_tensors="pt") - - if not inputs or len(inputs) == 0: - logger.error("encode_image: processor返回了空的输入") - return np.zeros((0, self.vector_dim)) - - logger.info(f"encode_image: 处理后的输入键: {list(inputs.keys())}") - inputs = {k: v.to(self.device) for k, v in inputs.items()} - - logger.info("encode_image: 运行模型推理") - logger.info(f"Model类型: {type(self.model)}") - logger.info(f"Model属性: {dir(self.model)}") - - # 检查模型结构 - try: - logger.info(f"Model配置: {self.model.config}") - logger.info(f"Model配置属性: {dir(self.model.config)}") - else: - visual_outputs = self.model.visual(**inputs) - - if hasattr(visual_outputs, 'pooler_output'): - image_embeddings = visual_outputs.pooler_output.cpu().numpy() - elif hasattr(visual_outputs, 'last_hidden_state'): - image_embeddings = visual_outputs.last_hidden_state[:, 0, :].cpu().numpy() - else: - logger.error("encode_image: 无法从视觉模型输出中获取图像向量") - raise ValueError("无法从视觉模型输出中获取图像向量") - else: - # 尝试直接使用模型进行推理 - logger.info("encode_image: 尝试直接使用模型进行推理") - with torch.no_grad(): - # 使用空文本输入,只提供图像 - if 'pixel_values' in inputs: - outputs = self.model(pixel_values=inputs['pixel_values'], input_ids=None) - else: - outputs = self.model(**inputs, input_ids=None) - - # 尝试从输出中获取图像向量 - if hasattr(outputs, 'image_embeds'): - image_embeddings = outputs.image_embeds.cpu().numpy() - elif hasattr(outputs, 'vision_model_output') and hasattr(outputs.vision_model_output, 'pooler_output'): - image_embeddings = outputs.vision_model_output.pooler_output.cpu().numpy() - elif hasattr(outputs, 'pooler_output'): - image_embeddings = outputs.pooler_output.cpu().numpy() - elif hasattr(outputs, 'last_hidden_state'): - image_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy() - else: - logger.error("encode_image: 无法从模型输出中获取图像向量") - raise ValueError("无法从模型输出中获取图像向量") - except Exception as e: - logger.error(f"encode_image: 处理图像时出错: {str(e)}") - raise e - return np.zeros((0, self.vector_dim)) - - # 归一化向量 - image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True) - - # 始终返回二维数组,即使只有一个图像 - if len(image) == 1: - result = np.array([image_embeddings[0]]) - logger.info(f"encode_image: 返回单个图像向量,形状: {result.shape}") - return result - else: - logger.info(f"encode_image: 返回多个图像向量,形状: {image_embeddings.shape}") - return image_embeddings - + # 强制为 RGB + rgb_images = [img.convert('RGB') if img.mode != 'RGB' else img for img in images] + with torch.inference_mode(): + emb = self.model.get_image_embeddings(images=rgb_images) + image_embeddings = emb.detach().float().cpu().numpy() + return image_embeddings except Exception as e: logger.error(f"encode_image: 异常: {str(e)}") - # 返回一个空的二维数组 return np.zeros((0, self.vector_dim)) def add_texts( diff --git a/multimodal_retrieval_vdb.py b/multimodal_retrieval_vdb.py deleted file mode 100644 index e3996a0..0000000 --- a/multimodal_retrieval_vdb.py +++ /dev/null @@ -1,592 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -集成百度VDB的多模态检索系统 -支持文搜文、文搜图、图搜文、图搜图四种检索模式 -""" - -import torch -import numpy as np -from PIL import Image -from transformers import AutoModel, AutoProcessor, AutoTokenizer -from typing import List, Union, Tuple, Dict, Any -import os -import json -import logging -import gc -from baidu_vdb_backend import BaiduVDBBackend - -# 设置日志 -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -class MultimodalRetrievalVDB: - """集成百度VDB的多模态检索系统""" - - def __init__(self, model_name: str = "OpenSearch-AI/Ops-MM-embedding-v1-7B", - use_all_gpus: bool = True, gpu_ids: List[int] = None, - vdb_config: Dict[str, str] = None): - """ - 初始化多模态检索系统 - - Args: - model_name: 模型名称 - use_all_gpus: 是否使用所有可用GPU - gpu_ids: 指定使用的GPU ID列表 - vdb_config: VDB配置字典 - """ - self.model_name = model_name - - # 设置GPU设备 - self._setup_devices(use_all_gpus, gpu_ids) - - # 清理GPU内存 - self._clear_gpu_memory() - - logger.info(f"正在加载模型到GPU: {self.device_ids}") - - # 加载模型和处理器 - self.model = None - self.tokenizer = None - self.processor = None - self._load_model() - - # 初始化百度VDB后端 - if vdb_config is None: - vdb_config = { - "account": "root", - "api_key": "vdb$yjr9ln3n0td", - "endpoint": "http://180.76.96.191:5287", - "database_name": "multimodal_retrieval" - } - - try: - self.vdb = BaiduVDBBackend(**vdb_config) - logger.info("✅ VDB后端初始化成功") - except Exception as e: - logger.error(f"❌ VDB后端初始化失败: {e}") - # 创建一个模拟的VDB后端,避免系统完全崩溃 - self.vdb = None - logger.warning("⚠️ 系统将在无VDB模式下运行,数据将不会持久化") - - logger.info("多模态检索系统初始化完成") - - def _setup_devices(self, use_all_gpus: bool, gpu_ids: List[int]): - """设置GPU设备""" - if not torch.cuda.is_available(): - raise RuntimeError("CUDA不可用,无法使用GPU") - - total_gpus = torch.cuda.device_count() - logger.info(f"检测到 {total_gpus} 个GPU") - - if use_all_gpus: - self.device_ids = list(range(total_gpus)) - elif gpu_ids: - self.device_ids = gpu_ids - else: - self.device_ids = [0] - - self.num_gpus = len(self.device_ids) - self.primary_device = f"cuda:{self.device_ids[0]}" - - logger.info(f"使用GPU: {self.device_ids}, 主设备: {self.primary_device}") - - def _clear_gpu_memory(self): - """清理GPU内存""" - for gpu_id in self.device_ids: - torch.cuda.set_device(gpu_id) - torch.cuda.empty_cache() - torch.cuda.synchronize() - gc.collect() - logger.info("GPU内存已清理") - - def _load_model(self): - """加载模型""" - try: - # 设置环境变量优化内存使用 - os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' - - # 清理GPU内存 - self._clear_gpu_memory() - - # 设置离线模式环境变量 - os.environ['TRANSFORMERS_OFFLINE'] = '1' - os.environ['HF_HUB_OFFLINE'] = '1' - - # 尝试加载模型,如果网络失败则使用本地缓存 - try: - # 加载模型 - if self.num_gpus > 1: - # 多GPU加载 - max_memory = {i: "18GiB" for i in self.device_ids} - - self.model = AutoModel.from_pretrained( - self.model_name, - trust_remote_code=True, - torch_dtype=torch.float16, - device_map="auto", - max_memory=max_memory, - low_cpu_mem_usage=True, - local_files_only=False # 允许从网络下载 - ) - else: - # 单GPU加载 - self.model = AutoModel.from_pretrained( - self.model_name, - trust_remote_code=True, - torch_dtype=torch.float16, - device_map=self.primary_device, - local_files_only=False # 允许从网络下载 - ) - - logger.info("模型从网络加载成功") - - except Exception as network_error: - logger.warning(f"网络加载失败,尝试本地缓存: {network_error}") - - # 尝试从本地缓存加载 - try: - if self.num_gpus > 1: - max_memory = {i: "18GiB" for i in self.device_ids} - - self.model = AutoModel.from_pretrained( - self.model_name, - trust_remote_code=True, - torch_dtype=torch.float16, - device_map="auto", - max_memory=max_memory, - low_cpu_mem_usage=True, - local_files_only=True # 仅使用本地文件 - ) - else: - self.model = AutoModel.from_pretrained( - self.model_name, - trust_remote_code=True, - torch_dtype=torch.float16, - device_map=self.primary_device, - local_files_only=True # 仅使用本地文件 - ) - - logger.info("模型从本地缓存加载成功") - - except Exception as local_error: - logger.error(f"本地缓存加载也失败: {local_error}") - raise local_error - - # 加载分词器和处理器 - try: - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_name, - trust_remote_code=True, - local_files_only=False - ) - except Exception as e: - logger.warning(f"Tokenizer网络加载失败,尝试本地: {e}") - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_name, - trust_remote_code=True, - local_files_only=True - ) - - try: - self.processor = AutoProcessor.from_pretrained( - self.model_name, - trust_remote_code=True, - local_files_only=False - ) - except Exception as e: - logger.warning(f"Processor加载失败,使用tokenizer: {e}") - try: - self.processor = AutoProcessor.from_pretrained( - self.model_name, - trust_remote_code=True, - local_files_only=True - ) - except Exception as e2: - logger.warning(f"Processor本地加载也失败,使用tokenizer: {e2}") - self.processor = self.tokenizer - - logger.info("模型加载完成") - return True - - except Exception as e: - logger.error(f"模型加载失败: {str(e)}") - return False - - def encode_text_batch(self, texts: List[str]) -> np.ndarray: - """ - 批量编码文本为向量 - - Args: - texts: 文本列表 - - Returns: - 文本向量数组 - """ - if not texts: - return np.array([]) - - with torch.no_grad(): - # 预处理输入 - inputs = self.tokenizer( - text=texts, - return_tensors="pt", - padding=True, - truncation=True, - max_length=512 - ) - - # 将输入移动到主设备 - inputs = {k: v.to(self.primary_device) for k, v in inputs.items()} - - # 前向传播 - outputs = self.model(**inputs) - embeddings = outputs.last_hidden_state.mean(dim=1) - - # 清理GPU内存 - del inputs, outputs - torch.cuda.empty_cache() - - return embeddings.cpu().numpy().astype(np.float32) - - def encode_image_batch(self, images: List[Union[str, Image.Image]]) -> np.ndarray: - """ - 批量编码图像为向量 - - Args: - images: 图像路径或PIL图像列表 - - Returns: - 图像向量数组 - """ - if not images: - return np.array([]) - - # 预处理图像 - processed_images = [] - for img in images: - if isinstance(img, str): - img = Image.open(img).convert('RGB') - elif isinstance(img, Image.Image): - img = img.convert('RGB') - processed_images.append(img) - - try: - logger.info(f"处理 {len(processed_images)} 张图像") - - # 使用多模态模型生成图像embedding - conversations = [] - for i in range(len(processed_images)): - conversation = [ - { - "role": "user", - "content": [ - {"type": "image", "image": processed_images[i]}, - {"type": "text", "text": "What is in this image?"} - ] - } - ] - conversations.append(conversation) - - # 使用processor处理 - try: - texts = [] - for conv in conversations: - text = self.processor.apply_chat_template(conv, tokenize=False, add_generation_prompt=False) - texts.append(text) - - # 处理文本和图像 - inputs = self.processor( - text=texts, - images=processed_images, - return_tensors="pt", - padding=True - ) - - # 移动到GPU - inputs = {k: v.to(self.primary_device) for k, v in inputs.items()} - - # 获取模型输出 - with torch.no_grad(): - outputs = self.model(**inputs) - embeddings = outputs.last_hidden_state.mean(dim=1) - - # 转换为numpy数组 - embeddings = embeddings.cpu().numpy().astype(np.float32) - - except Exception as inner_e: - logger.warning(f"多模态模型图像编码失败: {inner_e}") - # 使用零向量作为fallback - embedding_dim = 3584 - embeddings = np.zeros((len(processed_images), embedding_dim), dtype=np.float32) - - logger.info(f"生成图像embeddings: {embeddings.shape}") - return embeddings - - except Exception as e: - logger.error(f"图像编码失败: {e}") - # 返回零向量作为fallback - embedding_dim = 3584 - embeddings = np.zeros((len(processed_images), embedding_dim), dtype=np.float32) - return embeddings - - def store_texts(self, texts: List[str], metadata: List[Dict] = None) -> List[str]: - """ - 存储文本数据 - - Args: - texts: 文本列表 - metadata: 元数据列表 - - Returns: - 存储的ID列表 - """ - if self.vdb is None: - logger.warning("VDB不可用,文本数据将不会持久化存储") - return [] - - logger.info(f"正在存储 {len(texts)} 条文本数据") - - # 分批处理 - batch_size = 16 - all_ids = [] - - for i in range(0, len(texts), batch_size): - batch_texts = texts[i:i+batch_size] - batch_metadata = metadata[i:i+batch_size] if metadata else None - - try: - # 编码文本 - vectors = self.encode_text_batch(batch_texts) - - # 存储到VDB - ids = self.vdb.store_text_vectors(batch_texts, vectors, batch_metadata) - all_ids.extend(ids) - - logger.info(f"已处理 {i + len(batch_texts)}/{len(texts)} 条文本") - - except Exception as e: - logger.error(f"处理文本批次时出错: {e}") - continue - - logger.info(f"✅ 文本存储完成,共 {len(all_ids)} 条") - return all_ids - - def store_images(self, image_paths: List[str], metadata: List[Dict] = None) -> List[str]: - """ - 存储图像数据 - - Args: - image_paths: 图像路径列表 - metadata: 元数据列表 - - Returns: - 存储的ID列表 - """ - if self.vdb is None: - logger.warning("VDB不可用,图像数据将不会持久化存储") - return [] - - logger.info(f"正在存储 {len(image_paths)} 张图像数据") - - # 图像处理使用更小的批次 - batch_size = 8 - all_ids = [] - - for i in range(0, len(image_paths), batch_size): - batch_images = image_paths[i:i+batch_size] - batch_metadata = metadata[i:i+batch_size] if metadata else None - - try: - # 编码图像 - vectors = self.encode_image_batch(batch_images) - - # 存储到VDB - ids = self.vdb.store_image_vectors(batch_images, vectors, batch_metadata) - all_ids.extend(ids) - - logger.info(f"已处理 {i + len(batch_images)}/{len(image_paths)} 张图像") - - except Exception as e: - logger.error(f"处理图像批次时出错: {e}") - continue - - logger.info(f"✅ 图像存储完成,共 {len(all_ids)} 条") - return all_ids - - def search_text_by_text(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]: - """文搜文:使用文本查询搜索相似文本""" - if self.vdb is None: - logger.warning("VDB不可用,无法执行搜索") - return [] - - logger.info(f"执行文搜文查询: {query}") - - # 编码查询文本 - query_vector = self.encode_text_batch([query])[0] - - # 在VDB中搜索 - results = self.vdb.search_text_vectors(query_vector, top_k) - - # 格式化结果 - formatted_results = [] - for doc_id, text_content, score, metadata in results: - formatted_results.append((text_content, score)) - - return formatted_results - - def search_images_by_text(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]: - """文搜图:使用文本查询搜索相似图像""" - if self.vdb is None: - logger.warning("VDB不可用,无法执行搜索") - return [] - - logger.info(f"执行文搜图查询: {query}") - - # 编码查询文本 - query_vector = self.encode_text_batch([query])[0] - - # 在VDB中搜索图像 - results = self.vdb.search_image_vectors(query_vector, top_k) - - # 格式化结果 - formatted_results = [] - for doc_id, image_path, image_name, score, metadata in results: - formatted_results.append((image_path, score)) - - return formatted_results - - def search_text_by_image(self, query_image: Union[str, Image.Image], top_k: int = 5) -> List[Tuple[str, float]]: - """图搜文:使用图像查询搜索相似文本""" - if self.vdb is None: - logger.warning("VDB不可用,无法执行搜索") - return [] - - logger.info(f"执行图搜文查询") - - # 编码查询图像 - query_vector = self.encode_image_batch([query_image])[0] - - # 在VDB中搜索文本 - results = self.vdb.search_text_vectors(query_vector, top_k) - - # 格式化结果 - formatted_results = [] - for doc_id, text_content, score, metadata in results: - formatted_results.append((text_content, score)) - - return formatted_results - - def search_images_by_image(self, query_image: Union[str, Image.Image], top_k: int = 5) -> List[Tuple[str, float]]: - """图搜图:使用图像查询搜索相似图像""" - if self.vdb is None: - logger.warning("VDB不可用,无法执行搜索") - return [] - - logger.info(f"执行图搜图查询") - - # 编码查询图像 - query_vector = self.encode_image_batch([query_image])[0] - - # 在VDB中搜索图像 - results = self.vdb.search_image_vectors(query_vector, top_k) - - # 格式化结果 - formatted_results = [] - for doc_id, image_path, image_name, score, metadata in results: - formatted_results.append((image_path, score)) - - return formatted_results - - # Web应用兼容的方法名称 - def search_text_to_text(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]: - """文搜文:Web应用兼容方法""" - return self.search_text_by_text(query, top_k) - - def search_text_to_image(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]: - """文搜图:Web应用兼容方法""" - return self.search_images_by_text(query, top_k) - - def search_image_to_text(self, query_image: Union[str, Image.Image], top_k: int = 5) -> List[Tuple[str, float]]: - """图搜文:Web应用兼容方法""" - return self.search_text_by_image(query_image, top_k) - - def search_image_to_image(self, query_image: Union[str, Image.Image], top_k: int = 5) -> List[Tuple[str, float]]: - """图搜图:Web应用兼容方法""" - return self.search_images_by_image(query_image, top_k) - - def get_statistics(self) -> Dict[str, Any]: - """获取系统统计信息""" - if self.vdb is None: - return {"error": "VDB不可用"} - return self.vdb.get_statistics() - - def clear_all_data(self): - """清空所有数据""" - if self.vdb is None: - logger.warning("VDB不可用,无法清空数据") - return - self.vdb.clear_all_data() - - def close(self): - """关闭系统""" - if self.vdb: - self.vdb.close() - self._clear_gpu_memory() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() - - -def check_system_info(): - """检查系统信息""" - print("=== 多模态检索系统信息 ===") - - if not torch.cuda.is_available(): - print("❌ CUDA不可用") - return - - gpu_count = torch.cuda.device_count() - print(f"✅ 检测到 {gpu_count} 个GPU") - print(f"CUDA版本: {torch.version.cuda}") - print(f"PyTorch版本: {torch.__version__}") - - for i in range(gpu_count): - gpu_name = torch.cuda.get_device_name(i) - gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3 - print(f"GPU {i}: {gpu_name} ({gpu_memory:.1f}GB)") - - print("========================") - - -if __name__ == "__main__": - # 检查系统环境 - check_system_info() - - # 示例使用 - print("\n正在初始化多模态检索系统...") - - try: - retrieval_system = MultimodalRetrievalVDB() - print("✅ 系统初始化成功!") - - # 显示统计信息 - stats = retrieval_system.get_statistics() - print(f"\n📊 数据库统计信息: {stats}") - - print("\n🚀 多模态检索系统就绪!") - print("支持的检索模式:") - print("1. 文搜文: search_text_by_text()") - print("2. 文搜图: search_images_by_text()") - print("3. 图搜文: search_text_by_image()") - print("4. 图搜图: search_images_by_image()") - print("5. 存储文本: store_texts()") - print("6. 存储图像: store_images()") - - except Exception as e: - print(f"❌ 系统初始化失败: {e}") - import traceback - traceback.print_exc() diff --git a/multimodal_retrieval_vdb_only.py b/multimodal_retrieval_vdb_only.py deleted file mode 100644 index a8ccee5..0000000 --- a/multimodal_retrieval_vdb_only.py +++ /dev/null @@ -1,443 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -纯百度VDB多模态检索系统 - 完全替代FAISS -支持文搜文、文搜图、图搜文、图搜图四种检索模式 -""" - -import torch -import torch.nn as nn -from torch.nn.parallel import DataParallel, DistributedDataParallel -import numpy as np -from PIL import Image -from transformers import AutoModel, AutoProcessor, AutoTokenizer -from typing import List, Union, Tuple, Dict, Any -import os -import json -from pathlib import Path -import logging -import gc -from concurrent.futures import ThreadPoolExecutor, as_completed -import threading - -from baidu_vdb_production import BaiduVDBProduction - -# 设置日志 -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -class MultimodalRetrievalVDBOnly: - """纯百度VDB多模态检索系统,完全替代FAISS""" - - def __init__(self, model_name: str = "OpenSearch-AI/Ops-MM-embedding-v1-7B", - use_all_gpus: bool = True, gpu_ids: List[int] = None, min_memory_gb=12): - """ - 初始化纯VDB多模态检索系统 - - Args: - model_name: 模型名称 - use_all_gpus: 是否使用所有可用GPU - gpu_ids: 指定使用的GPU ID列表 - min_memory_gb: 最小可用内存(GB) - """ - self.model_name = model_name - - # 设置GPU设备 - self._setup_devices(use_all_gpus, gpu_ids, min_memory_gb) - - # 清理GPU内存 - self._clear_all_gpu_memory() - - logger.info(f"正在加载模型到多GPU: {self.device_ids}") - - # 加载模型和处理器 - self.model = None - self.tokenizer = None - self.processor = None - self._load_model_multigpu() - - # 初始化百度VDB后端(替代FAISS索引) - logger.info("初始化百度VDB后端...") - self.vdb = BaiduVDBProduction() - logger.info("✅ 百度VDB后端初始化完成") - - # 线程锁 - self.model_lock = threading.Lock() - - logger.info("✅ 纯VDB多模态检索系统初始化完成") - - def _setup_devices(self, use_all_gpus, gpu_ids, min_memory_gb): - """设置GPU设备""" - if not torch.cuda.is_available(): - raise RuntimeError("CUDA不可用,需要GPU支持") - - total_gpus = torch.cuda.device_count() - logger.info(f"检测到 {total_gpus} 个GPU") - - # 获取可用GPU - available_gpus = [] - for i in range(total_gpus): - memory_gb = torch.cuda.get_device_properties(i).total_memory / (1024**3) - free_memory = torch.cuda.memory_reserved(i) / (1024**3) - available_memory = memory_gb - free_memory - - logger.info(f"GPU {i}: {torch.cuda.get_device_properties(i).name} ({memory_gb:.1f}GB)") - - if available_memory >= min_memory_gb: - available_gpus.append(i) - logger.info(f"GPU {i}: {available_memory:.0f}MB 可用 (合适)") - else: - logger.info(f"GPU {i}: {available_memory:.0f}MB 可用 (不足)") - - if not available_gpus: - raise RuntimeError(f"没有找到满足 {min_memory_gb}GB 内存要求的GPU") - - # 选择使用的GPU - if gpu_ids: - self.device_ids = [gpu_id for gpu_id in gpu_ids if gpu_id in available_gpus] - elif use_all_gpus: - self.device_ids = available_gpus - else: - self.device_ids = [available_gpus[0]] - - if not self.device_ids: - raise RuntimeError("没有可用的GPU设备") - - # 设置主设备 - self.primary_device = f"cuda:{self.device_ids[0]}" - torch.cuda.set_device(self.device_ids[0]) - - logger.info(f"使用GPU: {self.device_ids}, 主设备: {self.primary_device}") - - def _clear_all_gpu_memory(self): - """清理所有GPU内存""" - for device_id in self.device_ids: - with torch.cuda.device(device_id): - torch.cuda.empty_cache() - gc.collect() - logger.info("所有GPU内存已清理") - - def _load_model_multigpu(self): - """加载模型到多GPU""" - try: - # 清理GPU内存 - self._clear_all_gpu_memory() - - logger.info(f"正在加载模型到多GPU: {self.device_ids}") - - # 加载模型 - self.model = AutoModel.from_pretrained( - self.model_name, - torch_dtype=torch.float16, - trust_remote_code=True, - device_map="auto" - ) - - # 加载tokenizer和processor - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - logger.info("Tokenizer加载成功") - - self.processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True) - logger.info("Processor加载成功") - - # 显示设备映射 - if hasattr(self.model, 'hf_device_map'): - logger.info(f"模型已成功加载到设备: {dict(list(self.model.hf_device_map.items())[:10])}") - - self.model.eval() - logger.info("多GPU模型加载完成") - - except Exception as e: - logger.error(f"模型加载失败: {e}") - raise - - def encode_text_batch(self, texts: List[str], batch_size: int = 8) -> np.ndarray: - """批量编码文本""" - try: - with self.model_lock: - all_embeddings = [] - - for i in range(0, len(texts), batch_size): - batch_texts = texts[i:i + batch_size] - - # 使用processor处理文本 - inputs = self.processor( - text=batch_texts, - return_tensors="pt", - padding=True, - truncation=True, - max_length=512 - ) - - # 将输入移动到主设备 - inputs = {k: v.to(self.primary_device) for k, v in inputs.items()} - - with torch.no_grad(): - outputs = self.model(**inputs) - embeddings = outputs.last_hidden_state.mean(dim=1) - embeddings = embeddings.cpu().numpy() - all_embeddings.append(embeddings) - - return np.vstack(all_embeddings) - - except Exception as e: - logger.error(f"文本编码失败: {e}") - return np.zeros((len(texts), 3584), dtype=np.float32) - - def encode_image_batch(self, images: List[Union[str, Image.Image]], batch_size: int = 4) -> np.ndarray: - """批量编码图像""" - try: - with self.model_lock: - processed_images = [] - - # 处理图像输入 - for img in images: - if isinstance(img, str): - if os.path.exists(img): - processed_images.append(Image.open(img).convert('RGB')) - else: - logger.warning(f"图像文件不存在: {img}") - processed_images.append(Image.new('RGB', (224, 224), color='white')) - elif isinstance(img, Image.Image): - processed_images.append(img.convert('RGB')) - else: - logger.warning(f"不支持的图像类型: {type(img)}") - processed_images.append(Image.new('RGB', (224, 224), color='white')) - - all_embeddings = [] - - for i in range(0, len(processed_images), batch_size): - batch_images = processed_images[i:i + batch_size] - - # 使用processor处理图像 - inputs = self.processor( - images=batch_images, - return_tensors="pt", - padding=True - ) - - # 将输入移动到主设备 - inputs = {k: v.to(self.primary_device) for k, v in inputs.items()} - - with torch.no_grad(): - outputs = self.model(**inputs) - embeddings = outputs.last_hidden_state.mean(dim=1) - embeddings = embeddings.cpu().numpy() - all_embeddings.append(embeddings) - - return np.vstack(all_embeddings) - - except Exception as e: - logger.error(f"图像编码失败: {e}") - embedding_dim = 3584 - embeddings = np.zeros((len(images), embedding_dim), dtype=np.float32) - return embeddings - - def build_text_index_parallel(self, texts: List[str], save_path: str = None): - """ - 构建文本索引(使用VDB替代FAISS) - """ - try: - logger.info(f"正在构建文本索引,共 {len(texts)} 条文本") - - # 编码文本 - embeddings = self.encode_text_batch(texts) - - # 使用VDB存储 - self.vdb.build_text_index(texts, embeddings) - - logger.info("文本索引构建完成") - - except Exception as e: - logger.error(f"构建文本索引失败: {e}") - raise - - def build_image_index_parallel(self, image_paths: List[str], save_path: str = None): - """ - 构建图像索引(使用VDB替代FAISS) - """ - try: - logger.info(f"正在构建图像索引,共 {len(image_paths)} 张图像") - - # 编码图像 - embeddings = self.encode_image_batch(image_paths) - - # 使用VDB存储 - self.vdb.build_image_index(image_paths, embeddings) - - logger.info("图像索引构建完成") - - except Exception as e: - logger.error(f"构建图像索引失败: {e}") - raise - - def search_text_by_text(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]: - """文搜文:使用文本查询搜索相似文本""" - try: - query_embedding = self.encode_text_batch([query]) - return self.vdb.search_text_by_text(query_embedding[0], top_k) - except Exception as e: - logger.error(f"文搜文失败: {e}") - return [] - - def search_images_by_text(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]: - """文搜图:使用文本查询搜索相似图像""" - try: - query_embedding = self.encode_text_batch([query]) - return self.vdb.search_images_by_text(query_embedding[0], top_k) - except Exception as e: - logger.error(f"文搜图失败: {e}") - return [] - - def search_images_by_image(self, query_image: Union[str, Image.Image], top_k: int = 5) -> List[Tuple[str, float]]: - """图搜图:使用图像查询搜索相似图像""" - try: - query_embedding = self.encode_image_batch([query_image]) - return self.vdb.search_images_by_image(query_embedding[0], top_k) - except Exception as e: - logger.error(f"图搜图失败: {e}") - return [] - - def search_text_by_image(self, query_image: Union[str, Image.Image], top_k: int = 5) -> List[Tuple[str, float]]: - """图搜文:使用图像查询搜索相似文本""" - try: - query_embedding = self.encode_image_batch([query_image]) - return self.vdb.search_text_by_image(query_embedding[0], top_k) - except Exception as e: - logger.error(f"图搜文失败: {e}") - return [] - - # Web应用兼容方法 - def search_by_text(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]: - """文搜文:Web应用兼容方法""" - return self.search_text_by_text(query, top_k) - - def search_by_image(self, query_image: Union[str, Image.Image], top_k: int = 5) -> List[Tuple[str, float]]: - """图搜图:Web应用兼容方法""" - return self.search_images_by_image(query_image, top_k) - - def search_images_by_text_query(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]: - """文搜图:Web应用兼容方法""" - return self.search_images_by_text(query, top_k) - - def search_texts_by_image_query(self, query_image: Union[str, Image.Image], top_k: int = 5) -> List[Tuple[str, float]]: - """图搜文:Web应用兼容方法""" - return self.search_text_by_image(query_image, top_k) - - def get_statistics(self) -> Dict[str, Any]: - """获取系统统计信息""" - try: - vdb_stats = self.vdb.get_statistics() - - stats = { - "model_name": self.model_name, - "device_ids": self.device_ids, - "primary_device": self.primary_device, - "backend": "Baidu VDB (No FAISS)", - **vdb_stats - } - - return stats - - except Exception as e: - logger.error(f"获取统计信息失败: {e}") - return {"status": "error", "error": str(e)} - - def clear_all_data(self): - """清空所有数据""" - try: - self.vdb.clear_all_data() - logger.info("✅ 所有数据已清空") - except Exception as e: - logger.error(f"❌ 清空数据失败: {e}") - - def get_gpu_memory_info(self): - """获取所有GPU内存使用信息""" - memory_info = {} - for device_id in self.device_ids: - with torch.cuda.device(device_id): - allocated = torch.cuda.memory_allocated() / (1024**3) - reserved = torch.cuda.memory_reserved() / (1024**3) - total = torch.cuda.get_device_properties(device_id).total_memory / (1024**3) - - memory_info[f"GPU_{device_id}"] = { - "allocated_GB": round(allocated, 2), - "reserved_GB": round(reserved, 2), - "total_GB": round(total, 2), - "free_GB": round(total - reserved, 2) - } - - return memory_info - - def cleanup(self): - """清理资源""" - try: - if self.vdb: - self.vdb.close() - - self._clear_all_gpu_memory() - logger.info("✅ 资源清理完成") - except Exception as e: - logger.error(f"❌ 资源清理失败: {e}") - -def test_vdb_only_system(): - """测试纯VDB多模态检索系统""" - print("=" * 60) - print("测试纯百度VDB多模态检索系统") - print("=" * 60) - - system = None - - try: - # 1. 初始化系统 - print("1. 初始化纯VDB多模态检索系统...") - system = MultimodalRetrievalVDBOnly() - print("✅ 系统初始化成功") - - # 2. 构建文本索引 - print("\n2. 构建文本索引...") - test_texts = [ - "人工智能技术的发展趋势", - "机器学习在医疗领域的应用", - "深度学习算法优化方法", - "计算机视觉技术创新", - "自然语言处理最新进展" - ] - - system.build_text_index_parallel(test_texts) - print("✅ 文本索引构建完成") - - # 3. 测试文搜文 - print("\n3. 测试文搜文...") - query = "AI技术" - results = system.search_text_by_text(query, top_k=3) - print(f"查询: {query}") - for i, (text, score) in enumerate(results, 1): - print(f" {i}. {text} (相似度: {score:.3f})") - - # 4. 获取统计信息 - print("\n4. 获取统计信息...") - stats = system.get_statistics() - print("系统统计:") - for key, value in stats.items(): - print(f" {key}: {value}") - - print(f"\n🎉 纯VDB系统测试完成!") - print("✅ 完全移除FAISS依赖") - print("✅ 使用百度VDB作为向量数据库") - print("✅ 支持多模态检索功能") - - return True - - except Exception as e: - print(f"❌ 测试失败: {e}") - import traceback - traceback.print_exc() - return False - - finally: - if system: - system.cleanup() - -if __name__ == "__main__": - test_vdb_only_system() diff --git a/nohup.out b/nohup.out deleted file mode 100644 index b11bb79..0000000 --- a/nohup.out +++ /dev/null @@ -1,49 +0,0 @@ -INFO:baidu_bos_manager:✅ BOS连接测试成功 -INFO:baidu_bos_manager:✅ BOS客户端初始化成功: dmtyz-demo -INFO:mongodb_manager:✅ MongoDB连接成功: mmeb -INFO:mongodb_manager:✅ MongoDB索引创建完成 -INFO:__main__:初始化多模态检索系统... -INFO:multimodal_retrieval_local:使用GPU: [0, 1] -INFO:multimodal_retrieval_local:加载本地模型和处理器: /root/models/Ops-MM-embedding-v1-7B -The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release. -You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0. -INFO:multimodal_retrieval_local:Processor类型: -INFO:multimodal_retrieval_local:Processor方法: ['__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_auto_class', '_check_special_mm_tokens', '_create_repo', '_get_arguments_from_pretrained', '_get_files_timestamps', '_get_num_multimodal_tokens', '_merge_kwargs', '_upload_modified_files', 'apply_chat_template', 'attributes', 'audio_tokenizer', 'batch_decode', 'chat_template', 'check_argument_for_proper_class', 'decode', 'feature_extractor_class', 'from_args_and_dict', 'from_pretrained', 'get_possibly_dynamic_module', 'get_processor_dict', 'image_processor', 'image_processor_class', 'image_token', 'image_token_id', 'model_input_names', 'optional_attributes', 'optional_call_args', 'post_process_image_text_to_text', 'push_to_hub', 'register_for_auto_class', 'save_pretrained', 'to_dict', 'to_json_file', 'to_json_string', 'tokenizer', 'tokenizer_class', 'validate_init_kwargs', 'video_processor', 'video_processor_class', 'video_token', 'video_token_id'] -INFO:multimodal_retrieval_local:Image processor类型: -INFO:multimodal_retrieval_local:Image processor方法: ['__backends', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slotnames__', '__str__', '__subclasshook__', '__weakref__', '_auto_class', '_create_repo', '_further_process_kwargs', '_fuse_mean_std_and_rescale_factor', '_get_files_timestamps', '_prepare_image_like_inputs', '_prepare_images_structure', '_preprocess', '_preprocess_image_like_inputs', '_process_image', '_processor_class', '_set_processor_class', '_upload_modified_files', '_valid_kwargs_names', '_validate_preprocess_kwargs', 'center_crop', 'compile_friendly_resize', 'convert_to_rgb', 'crop_size', 'data_format', 'default_to_square', 'device', 'disable_grouping', 'do_center_crop', 'do_convert_rgb', 'do_normalize', 'do_rescale', 'do_resize', 'fetch_images', 'filter_out_unused_kwargs', 'from_dict', 'from_json_file', 'from_pretrained', 'get_image_processor_dict', 'get_number_of_image_patches', 'image_mean', 'image_processor_type', 'image_std', 'input_data_format', 'max_pixels', 'merge_size', 'min_pixels', 'model_input_names', 'normalize', 'patch_size', 'preprocess', 'push_to_hub', 'register_for_auto_class', 'resample', 'rescale', 'rescale_and_normalize', 'rescale_factor', 'resize', 'return_tensors', 'save_pretrained', 'size', 'temporal_patch_size', 'to_dict', 'to_json_file', 'to_json_string', 'unused_kwargs', 'valid_kwargs'] - Loading checkpoint shards: 0%| | 0/4 [00:00 -INFO:multimodal_retrieval_local:encode_image: 图像列表,长度: 1 -INFO:multimodal_retrieval_local:encode_image: 处理图像输入 -INFO:multimodal_retrieval_local:encode_image: 图像 0 格式: JPEG, 模式: RGB, 大小: (939, 940) -ERROR:multimodal_retrieval_local:encode_image: 处理图像时出错: argument of type 'NoneType' is not iterable -ERROR:multimodal_retrieval_local:add_images: 图像编码失败,返回空数组 -INFO:multimodal_retrieval_local:索引保存成功: /root/mmeb/local_faiss_index.index -INFO:multimodal_retrieval_local:元数据保存成功: /root/mmeb/local_faiss_index_metadata.json -INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 04:02:50] "POST /api/add_image HTTP/1.1" 200 - -INFO:multimodal_retrieval_local:索引保存成功: /root/mmeb/local_faiss_index.index -INFO:multimodal_retrieval_local:元数据保存成功: /root/mmeb/local_faiss_index_metadata.json -INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 04:02:50] "POST /api/save_index HTTP/1.1" 200 - -INFO:werkzeug:127.0.0.1 - - [22/Sep/2025 04:02:51] "GET /api/system_info HTTP/1.1" 200 - diff --git a/quick_test.py b/quick_test.py deleted file mode 100644 index 8d64fd5..0000000 --- a/quick_test.py +++ /dev/null @@ -1,235 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -快速测试脚本 - 验证多模态检索系统功能 -""" - -import os -import sys -import logging -import traceback -from pathlib import Path - -# 设置日志 -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -def test_imports(): - """测试关键模块导入""" - logger.info("🔍 测试模块导入...") - - try: - import torch - logger.info(f"✅ PyTorch {torch.__version__}") - - import transformers - logger.info(f"✅ Transformers {transformers.__version__}") - - import numpy as np - logger.info(f"✅ NumPy {np.__version__}") - - from PIL import Image - logger.info("✅ Pillow") - - import flask - logger.info(f"✅ Flask {flask.__version__}") - - try: - import pymochow - logger.info("✅ PyMochow (百度VDB SDK)") - except ImportError: - logger.warning("⚠️ PyMochow 未安装,需要运行: pip install pymochow") - - try: - import pymongo - logger.info("✅ PyMongo") - except ImportError: - logger.warning("⚠️ PyMongo 未安装,需要运行: pip install pymongo") - - return True - - except Exception as e: - logger.error(f"❌ 模块导入失败: {str(e)}") - return False - -def test_gpu_availability(): - """测试GPU可用性""" - logger.info("🖥️ 检查GPU环境...") - - try: - import torch - - if torch.cuda.is_available(): - gpu_count = torch.cuda.device_count() - logger.info(f"✅ 检测到 {gpu_count} 个GPU") - - for i in range(gpu_count): - gpu_name = torch.cuda.get_device_name(i) - gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3 - logger.info(f" GPU {i}: {gpu_name} ({gpu_memory:.1f}GB)") - - return True - else: - logger.info("ℹ️ 未检测到GPU,将使用CPU") - return False - - except Exception as e: - logger.error(f"❌ GPU检查失败: {str(e)}") - return False - -def test_baidu_vdb_connection(): - """测试百度VDB连接""" - logger.info("🔗 测试百度VDB连接...") - - try: - import pymochow - from pymochow.configuration import Configuration - from pymochow.auth.bce_credentials import BceCredentials - - # 连接配置 - account = "root" - api_key = "vdb$yjr9ln3n0td" - endpoint = "http://180.76.96.191:5287" - - config = Configuration( - credentials=BceCredentials(account, api_key), - endpoint=endpoint - ) - - client = pymochow.MochowClient(config) - - # 测试连接 - 列出数据库 - databases = client.list_databases() - logger.info(f"✅ VDB连接成功,发现 {len(databases)} 个数据库") - - client.close() - return True - - except ImportError: - logger.error("❌ PyMochow 未安装,无法测试VDB连接") - return False - except Exception as e: - logger.error(f"❌ VDB连接失败: {str(e)}") - return False - -def test_model_loading(): - """测试模型加载""" - logger.info("🤖 测试模型加载...") - - try: - from ops_mm_embedding_v1 import OpsMMEmbeddingV1 - - logger.info("正在初始化模型...") - model = OpsMMEmbeddingV1() - - # 测试文本编码 - test_texts = ["测试文本"] - embeddings = model.embed(texts=test_texts) - - logger.info(f"✅ 模型加载成功,向量维度: {embeddings.shape}") - return True - - except Exception as e: - logger.error(f"❌ 模型加载失败: {str(e)}") - logger.error(traceback.format_exc()) - return False - -def test_web_app_import(): - """测试Web应用导入""" - logger.info("🌐 测试Web应用模块...") - - try: - # 测试导入主要模块 - from multimodal_retrieval_vdb_only import MultimodalRetrievalVDBOnly - logger.info("✅ 多模态检索系统模块") - - from baidu_vdb_production import BaiduVDBProduction - logger.info("✅ 百度VDB后端模块") - - # 测试Web应用文件存在 - web_app_file = Path("web_app_vdb_production.py") - if web_app_file.exists(): - logger.info("✅ Web应用文件存在") - else: - logger.error("❌ Web应用文件不存在") - return False - - return True - - except Exception as e: - logger.error(f"❌ Web应用模块测试失败: {str(e)}") - return False - -def create_test_directories(): - """创建必要的测试目录""" - logger.info("📁 创建测试目录...") - - directories = ["uploads", "sample_images", "text_data"] - - for dir_name in directories: - dir_path = Path(dir_name) - dir_path.mkdir(exist_ok=True) - logger.info(f"✅ 目录已创建: {dir_name}") - -def main(): - """主测试函数""" - logger.info("🚀 开始快速测试...") - logger.info("=" * 50) - - test_results = {} - - # 1. 测试模块导入 - test_results["imports"] = test_imports() - - # 2. 测试GPU环境 - test_results["gpu"] = test_gpu_availability() - - # 3. 测试VDB连接 - test_results["vdb"] = test_baidu_vdb_connection() - - # 4. 测试Web应用模块 - test_results["web_modules"] = test_web_app_import() - - # 5. 创建测试目录 - create_test_directories() - - # 6. 尝试测试模型加载(可选) - if test_results["imports"]: - logger.info("\n⚠️ 模型加载测试需要较长时间,是否跳过?") - logger.info("如需测试模型,请单独运行模型测试") - # test_results["model"] = test_model_loading() - - # 输出测试结果 - logger.info("\n" + "=" * 50) - logger.info("📊 测试结果汇总:") - logger.info("=" * 50) - - for test_name, result in test_results.items(): - status = "✅ 通过" if result else "❌ 失败" - test_display = { - "imports": "模块导入", - "gpu": "GPU环境", - "vdb": "VDB连接", - "web_modules": "Web模块", - "model": "模型加载" - }.get(test_name, test_name) - - logger.info(f"{test_display}: {status}") - - # 计算成功率 - success_count = sum(test_results.values()) - total_count = len(test_results) - success_rate = (success_count / total_count) * 100 - - logger.info(f"\n总体成功率: {success_count}/{total_count} ({success_rate:.1f}%)") - - if success_rate >= 75: - logger.info("🎉 系统基本就绪!可以启动Web应用进行完整测试") - logger.info("运行命令: python web_app_vdb_production.py") - else: - logger.warning("⚠️ 系统存在问题,请检查失败的测试项") - - return test_results - -if __name__ == "__main__": - main() diff --git a/requirements.txt b/requirements.txt index 1da40e3..3bd914b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,10 +5,7 @@ accelerate>=0.20.0 faiss-cpu>=1.7.4 numpy>=1.21.0 Pillow>=9.0.0 -scikit-learn>=1.3.0 tqdm>=4.65.0 flask>=2.3.0 werkzeug>=2.3.0 -psutil>=5.9.0 -pymockow>=1.0.0 -pymongo>=4.0.0 +requests>=2.31.0 diff --git a/run_tests.py b/run_tests.py deleted file mode 100644 index 06f0f4c..0000000 --- a/run_tests.py +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -运行系统测试 - 验证多模态检索系统功能 -""" - -import os -import sys -import logging -import traceback -from pathlib import Path - -# 设置日志 -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -def test_imports(): - """测试关键模块导入""" - logger.info("🔍 测试模块导入...") - - try: - import torch - logger.info(f"✅ PyTorch {torch.__version__}") - - import transformers - logger.info(f"✅ Transformers {transformers.__version__}") - - import numpy as np - logger.info(f"✅ NumPy {np.__version__}") - - from PIL import Image - logger.info("✅ Pillow") - - import flask - logger.info(f"✅ Flask {flask.__version__}") - - try: - import pymochow - logger.info("✅ PyMochow (百度VDB SDK)") - return True - except ImportError: - logger.warning("⚠️ PyMochow 未安装") - return False - - except Exception as e: - logger.error(f"❌ 模块导入失败: {str(e)}") - return False - -def test_baidu_vdb_connection(): - """测试百度VDB连接""" - logger.info("🔗 测试百度VDB连接...") - - try: - import pymochow - from pymochow.configuration import Configuration - from pymochow.auth.bce_credentials import BceCredentials - - # 连接配置 - account = "root" - api_key = "vdb$yjr9ln3n0td" - endpoint = "http://180.76.96.191:5287" - - config = Configuration( - credentials=BceCredentials(account, api_key), - endpoint=endpoint - ) - - client = pymochow.MochowClient(config) - - # 测试连接 - databases = client.list_databases() - logger.info(f"✅ VDB连接成功,发现 {len(databases)} 个数据库") - - client.close() - return True - - except Exception as e: - logger.error(f"❌ VDB连接失败: {str(e)}") - return False - -def test_system_modules(): - """测试系统模块""" - logger.info("🔧 测试系统模块...") - - try: - from multimodal_retrieval_vdb_only import MultimodalRetrievalVDBOnly - logger.info("✅ 多模态检索系统") - - from baidu_vdb_production import BaiduVDBProduction - logger.info("✅ 百度VDB后端") - - return True - - except Exception as e: - logger.error(f"❌ 系统模块测试失败: {str(e)}") - return False - -def create_directories(): - """创建必要目录""" - logger.info("📁 创建必要目录...") - - directories = ["uploads", "sample_images", "text_data"] - - for dir_name in directories: - dir_path = Path(dir_name) - dir_path.mkdir(exist_ok=True) - logger.info(f"✅ 目录: {dir_name}") - -def main(): - """主测试函数""" - logger.info("🚀 开始系统测试...") - logger.info("=" * 50) - - # 创建目录 - create_directories() - - # 运行测试 - results = {} - results["imports"] = test_imports() - - if results["imports"]: - results["vdb"] = test_baidu_vdb_connection() - results["modules"] = test_system_modules() - else: - logger.error("❌ 基础模块导入失败,跳过其他测试") - return False - - # 输出结果 - logger.info("\n" + "=" * 50) - logger.info("📊 测试结果:") - logger.info("=" * 50) - - for test_name, result in results.items(): - status = "✅ 通过" if result else "❌ 失败" - logger.info(f"{test_name}: {status}") - - success_count = sum(results.values()) - total_count = len(results) - success_rate = (success_count / total_count) * 100 - - logger.info(f"\n成功率: {success_count}/{total_count} ({success_rate:.1f}%)") - - if success_rate >= 75: - logger.info("🎉 系统测试通过!") - return True - else: - logger.warning("⚠️ 系统存在问题") - return False - -if __name__ == "__main__": - success = main() - sys.exit(0 if success else 1) diff --git a/run_web_server.py b/run_web_server.py deleted file mode 100644 index 9192b49..0000000 --- a/run_web_server.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -后台启动Web服务器脚本 -""" - -import os -import sys -import subprocess -import signal -import time -import logging - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -def start_web_server(): - """在后台启动Web服务器""" - try: - logger.info("🚀 启动优化版Web服务器...") - - # 启动Web应用进程 - process = subprocess.Popen([ - sys.executable, 'web_app_vdb_production.py' - ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - logger.info(f"✅ Web服务器已启动,PID: {process.pid}") - logger.info("🌐 服务地址: http://127.0.0.1:5000") - - # 等待几秒让服务器完全启动 - time.sleep(5) - - return process - - except Exception as e: - logger.error(f"❌ 启动Web服务器失败: {e}") - return None - -def stop_web_server(process): - """停止Web服务器""" - if process: - try: - process.terminate() - process.wait(timeout=5) - logger.info("✅ Web服务器已停止") - except subprocess.TimeoutExpired: - process.kill() - logger.info("🔥 强制停止Web服务器") - except Exception as e: - logger.error(f"❌ 停止Web服务器失败: {e}") - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description='Web服务器管理') - parser.add_argument('action', choices=['start', 'test'], - help='操作: start(启动服务器) 或 test(启动并运行测试)') - - args = parser.parse_args() - - if args.action == 'start': - # 只启动服务器 - process = start_web_server() - if process: - try: - logger.info("按 Ctrl+C 停止服务器") - process.wait() - except KeyboardInterrupt: - logger.info("🛑 用户停止服务") - stop_web_server(process) - - elif args.action == 'test': - # 启动服务器并运行测试 - process = start_web_server() - if process: - try: - # 运行测试 - logger.info("🧪 运行优化系统测试...") - test_result = subprocess.run([ - sys.executable, 'test_optimized_system.py' - ], capture_output=True, text=True) - - print(test_result.stdout) - if test_result.stderr: - print("STDERR:", test_result.stderr) - - logger.info(f"测试完成,退出码: {test_result.returncode}") - - finally: - # 停止服务器 - stop_web_server(process) diff --git a/start_test.sh b/start_test.sh deleted file mode 100644 index 4964f7d..0000000 --- a/start_test.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# 启动多模态检索系统测试 - -echo "🚀 启动多模态检索系统测试" -echo "================================" - -# 设置Python路径 -export PYTHONPATH=/root/mmeb:$PYTHONPATH - -# 1. 安装依赖包 -echo "📦 步骤1: 安装依赖包" -pip install pymochow pymongo --quiet - -# 2. 运行快速测试 -echo "🔍 步骤2: 运行快速测试" -python quick_test.py - -# 3. 测试百度VDB连接 -echo "🔗 步骤3: 测试百度VDB连接" -python test_baidu_vdb_connection.py - -# 4. 启动Web应用(可选) -echo "🌐 步骤4: 是否启动Web应用?(y/n)" -read -p "输入选择: " choice -if [ "$choice" = "y" ] || [ "$choice" = "Y" ]; then - echo "启动Web应用..." - python web_app_vdb_production.py -else - echo "跳过Web应用启动" -fi - -echo "✅ 测试完成!" diff --git a/start_web_app.py b/start_web_app.py deleted file mode 100644 index 3cb5acd..0000000 --- a/start_web_app.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -启动Web应用测试脚本 -""" - -import os -import sys -import logging -import subprocess -from pathlib import Path - -# 设置日志 -logging.basicConfig(level=logging.INFO, format='%(message)s') -logger = logging.getLogger(__name__) - -def check_dependencies(): - """检查依赖包""" - logger.info("📦 检查依赖包...") - - required_packages = [ - 'torch', 'transformers', 'numpy', 'PIL', 'flask', 'pymochow' - ] - - missing_packages = [] - for package in required_packages: - try: - if package == 'PIL': - from PIL import Image - else: - __import__(package) - logger.info(f"✅ {package}") - except ImportError: - missing_packages.append(package) - logger.error(f"❌ {package} 未安装") - - if missing_packages: - logger.info("安装缺失的包:") - for pkg in missing_packages: - if pkg == 'PIL': - logger.info("pip install Pillow") - elif pkg == 'pymochow': - logger.info("pip install pymochow") - else: - logger.info(f"pip install {pkg}") - return False - - return True - -def test_vdb_connection(): - """测试VDB连接""" - logger.info("🔗 测试百度VDB连接...") - - try: - import pymochow - from pymochow.configuration import Configuration - from pymochow.auth.bce_credentials import BceCredentials - - config = Configuration( - credentials=BceCredentials("root", "vdb$yjr9ln3n0td"), - endpoint="http://180.76.96.191:5287" - ) - - client = pymochow.MochowClient(config) - databases = client.list_databases() - client.close() - - logger.info(f"✅ VDB连接成功,发现 {len(databases)} 个数据库") - return True - - except Exception as e: - logger.error(f"❌ VDB连接失败: {e}") - return False - -def prepare_directories(): - """准备必要目录""" - logger.info("📁 准备目录...") - - directories = ["uploads", "sample_images", "text_data", "templates"] - - for dir_name in directories: - Path(dir_name).mkdir(exist_ok=True) - logger.info(f"✅ {dir_name}") - -def start_web_app(): - """启动Web应用""" - logger.info("🌐 启动Web应用...") - - try: - # 设置环境变量 - os.environ['FLASK_APP'] = 'web_app_vdb_production.py' - os.environ['FLASK_ENV'] = 'development' - - # 启动Flask应用 - logger.info("启动地址: http://localhost:5000") - logger.info("按 Ctrl+C 停止服务") - - # 直接运行Python文件 - subprocess.run([sys.executable, 'web_app_vdb_production.py'], check=True) - - except KeyboardInterrupt: - logger.info("🛑 用户停止服务") - except Exception as e: - logger.error(f"❌ Web应用启动失败: {e}") - -def main(): - """主函数""" - logger.info("🚀 启动多模态检索系统Web应用") - logger.info("=" * 50) - - # 1. 检查依赖 - if not check_dependencies(): - logger.error("❌ 依赖包检查失败,请先安装缺失的包") - return False - - # 2. 测试VDB连接 - if not test_vdb_connection(): - logger.error("❌ VDB连接失败,请检查网络和配置") - return False - - # 3. 准备目录 - prepare_directories() - - # 4. 启动Web应用 - start_web_app() - - return True - -if __name__ == "__main__": - main() diff --git a/static/favicon.ico b/static/favicon.ico new file mode 100644 index 0000000..ec31688 Binary files /dev/null and b/static/favicon.ico differ diff --git a/templates/index.html b/templates/index.html deleted file mode 100644 index 1a38d79..0000000 --- a/templates/index.html +++ /dev/null @@ -1,971 +0,0 @@ - - - - - - 多模态检索系统 - - - - - - -
-
- 未初始化 -
-
- -
-
- -
-

多模态检索系统

-

支持文搜图、文搜文、图搜图、图搜文四种检索模式

-
- -
- -
- -
- - -
-
-
- -
文搜文
-

文本查找相似文本

-
-
-
-
- -
文搜图
-

文本查找相关图片

-
-
-
-
- -
图搜文
-

图片查找相关文本

-
-
-
-
- -
图搜图
-

图片查找相似图片

-
-
-
- - -
-
-
-
-
数据管理
- 上传和管理检索数据库 -
-
-
- -
-
-
批量上传图片
-
- -

拖拽多张图片到此处或点击选择

- - -
- -
-
- - -
-
-
批量上传文本
-
- -
-
- - - -
-
-
-
- - -
-
-
- - - -
-
-
-
- - 图片: 0 张 | - 文本: 0 条 - -
-
-
-
-
-
-
- - - - - -
-
- Loading... -
-

正在搜索中...

-
- - -
-
-
-
- - - - - diff --git a/templates/local_index.html b/templates/local_index.html index e279750..1071f2f 100644 --- a/templates/local_index.html +++ b/templates/local_index.html @@ -4,6 +4,7 @@ 本地多模态检索系统 - FAISS +