651 lines
22 KiB
Python
651 lines
22 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
生产级Web应用 - 使用纯百度VDB系统,完全移除FAISS依赖
|
||
优化版本:支持自动清理、内存处理和流式上传
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import base64
|
||
import time
|
||
import logging
|
||
from io import BytesIO
|
||
from pathlib import Path
|
||
from typing import Dict, List, Any, Optional
|
||
|
||
import numpy as np
|
||
from PIL import Image
|
||
from flask import Flask, request, jsonify, render_template, send_from_directory
|
||
from werkzeug.utils import secure_filename
|
||
import threading
|
||
|
||
from multimodal_retrieval_vdb_only import MultimodalRetrievalVDBOnly
|
||
from mongodb_manager import get_mongodb_manager
|
||
from baidu_bos_manager import get_bos_manager
|
||
from optimized_file_handler import get_file_handler
|
||
|
||
# Logging: configure the root logger at INFO and use a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Flask application; uploads are capped at 16 MiB via MAX_CONTENT_LENGTH.
app = Flask(__name__)
app.config.update(MAX_CONTENT_LENGTH=16 * 1024 * 1024)

# Process-wide state: the retrieval backend is created lazily (or at startup)
# and every call into it is serialized through system_lock.
retrieval_system = None
system_lock = threading.Lock()

# Upload configuration.
UPLOAD_FOLDER = 'uploads'
ALLOWED_EXTENSIONS = {
    'png',
    'jpg',
    'jpeg',
    'gif',
    'bmp',
    'webp',
}

# Create the upload directory up front; an existing directory is not an error.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
||
|
||
def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively.
    Names without a dot are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
|
||
|
||
def encode_image_to_base64(image_path: str) -> str:
    """Encode an image file as a base64 ``data:`` URI.

    The MIME type is guessed from the file extension so that PNG/GIF/WebP
    images are no longer mislabelled as JPEG. When the extension is unknown
    or does not map to an image type, ``image/jpeg`` is used (the value that
    was previously hard-coded for every file).

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        A ``data:<mime>;base64,<payload>`` string, or ``""`` when the file
        cannot be read or encoded (the failure is logged).
    """
    # Function-scope import: keeps this block usable without touching the
    # file-level import section.
    import mimetypes

    try:
        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith('image/'):
            mime_type = 'image/jpeg'

        with open(image_path, 'rb') as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
        return f"data:{mime_type};base64,{encoded_string}"
    except Exception as e:
        # Best-effort helper: callers embed the result in JSON, so a failure
        # yields an empty string rather than an exception.
        logger.error(f"图像编码失败: {e}")
        return ""
|
||
|
||
@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
|
||
|
||
@app.route('/api/init', methods=['POST'])
def init_system():
    """Create the retrieval backend if it does not exist yet.

    The check-and-create step runs under ``system_lock`` so that concurrent
    requests construct at most one ``MultimodalRetrievalVDBOnly`` instance.
    Responds with a JSON status document, or HTTP 500 plus the error message
    when construction fails.
    """
    global retrieval_system

    try:
        with system_lock:
            already_initialized = retrieval_system is not None
            if not already_initialized:
                logger.info("🚀 初始化纯VDB多模态检索系统...")
                retrieval_system = MultimodalRetrievalVDBOnly()
                logger.info("✅ 系统初始化完成")

        ready_payload = {
            'success': True,
            'message': '系统初始化成功',
            'backend': 'Baidu VDB (No FAISS)',
            'status': 'ready',
        }
        return jsonify(ready_payload)
    except Exception as exc:
        logger.error(f"❌ 系统初始化失败: {exc}")
        failure_payload = {
            'success': False,
            'message': f'系统初始化失败: {str(exc)}',
        }
        return jsonify(failure_payload), 500
|
||
|
||
@app.route('/api/upload/texts', methods=['POST'])
@app.route('/api/upload_texts', methods=['POST'])
def upload_texts():
    """Accept a JSON list of texts, process it in memory and index it.

    Request body: ``{"texts": [...]}``.

    Responses:
        400 - system not initialized, body missing/malformed/not an object,
              ``texts`` missing, empty or not a list, or the file handler
              returned nothing.
        200 - texts were processed; ``index_status`` describes the indexing
              outcome and ``auto_indexed`` is True only when the index build
              and the metadata writes both completed.
        500 - any other unexpected failure.
    """
    global retrieval_system

    try:
        if retrieval_system is None:
            return jsonify({
                'success': False,
                'message': '系统未初始化,请先调用 /api/init'
            }), 400

        # silent=True: a malformed body or wrong content type yields None
        # instead of raising, so it is reported as a client error (400) below
        # rather than falling through to the generic 500 handler.
        data = request.get_json(silent=True)
        texts = data.get('texts', []) if isinstance(data, dict) else []

        if not isinstance(texts, list) or not texts:
            return jsonify({
                'success': False,
                'message': '没有提供文本数据'
            }), 400

        file_handler = get_file_handler()

        logger.info(f"📝 开始内存处理 {len(texts)} 条文本...")
        processed_texts = file_handler.process_text_in_memory(texts)

        if not processed_texts:
            return jsonify({
                'success': False,
                'message': '文本处理失败'
            }), 400

        # Index build is best-effort: a failure is reported through
        # index_status / auto_indexed but does not fail the upload itself.
        indexed = False
        index_status = "索引构建失败"
        try:
            with system_lock:
                retrieval_system.build_text_index_parallel(texts)

            # Record one metadata entry per processed text.
            mongodb_mgr = get_mongodb_manager()
            for text_info in processed_texts:
                mongodb_mgr.store_vector_metadata(
                    file_id=text_info["file_id"],
                    vector_type="text_vector",
                    vdb_id=text_info["bos_key"],
                    vector_info={"text_content": text_info["text_content"]}
                )

            logger.info("✅ 文本索引自动构建完成")
            index_status = "索引已自动构建"
            indexed = True
        except Exception as e:
            logger.error(f"❌ 自动构建文本索引失败: {e}")
            index_status = f"索引构建失败: {str(e)}"

        return jsonify({
            'success': True,
            'message': f'成功处理 {len(texts)} 条文本(内存模式)',
            'count': len(texts),
            'processed_texts': len(processed_texts),
            'index_status': index_status,
            # Previously hard-coded to True even when indexing had failed.
            'auto_indexed': indexed,
            'processing_method': 'memory',
            'storage_info': {
                'bos_stored': len(processed_texts),
                'mongodb_stored': len(processed_texts)
            }
        })

    except Exception as e:
        logger.error(f"❌ 文本上传失败: {e}")
        return jsonify({
            'success': False,
            'message': f'文本上传失败: {str(e)}'
        }), 500
|
||
|
||
@app.route('/api/upload/images', methods=['POST'])
@app.route('/api/upload_images', methods=['POST'])
def upload_images():
    """Accept image uploads (multipart field ``files``) and index them.

    Each allowed file is passed to the file handler. Files for which the
    handler reports ``processing_method == 'memory'`` get a separate temp
    file created for the model; otherwise the handler's own ``temp_path`` is
    used. All collected temp files are removed after the indexing attempt,
    whether it succeeded or not.

    Responses:
        400 - system not initialized, no ``files`` field, no file selected,
              or no file could be processed.
        200 - at least one file was processed; ``index_status`` describes the
              indexing outcome and ``auto_indexed`` is True only when the
              index build and the metadata writes both completed.
        500 - any other unexpected failure (all temp files are cleaned up).
    """
    global retrieval_system

    try:
        if retrieval_system is None:
            return jsonify({
                'success': False,
                'message': '系统未初始化,请先调用 /api/init'
            }), 400

        if 'files' not in request.files:
            return jsonify({
                'success': False,
                'message': '没有文件上传'
            }), 400

        files = request.files.getlist('files')
        if not files or all(file.filename == '' for file in files):
            return jsonify({
                'success': False,
                'message': '没有选择文件'
            }), 400

        file_handler = get_file_handler()
        processed_files = []
        temp_files_for_model = []

        for file in files:
            if not (file and allowed_file(file.filename)):
                continue

            filename = secure_filename(file.filename)
            logger.info(f"📷 处理图像文件: {filename}")

            # One bad file must not abort the whole batch.
            try:
                result = file_handler.process_image_smart(file, filename)
                if not result:
                    logger.error(f"❌ 图像处理失败: {filename}")
                    continue

                processed_files.append(result)

                if result.get('processing_method') == 'memory':
                    # Memory-processed files have no path on disk yet; the
                    # model needs one, so create a temp file for it.
                    temp_path = file_handler.get_temp_file_for_model(file, filename)
                    if temp_path:
                        temp_files_for_model.append({
                            'temp_path': temp_path,
                            'file_info': result
                        })
                else:
                    # The handler already wrote a temp file; reuse its path.
                    temp_files_for_model.append({
                        'temp_path': result.get('temp_path'),
                        'file_info': result
                    })

                logger.info(f"✅ 图像处理成功: {filename} ({result['processing_method']})")
            except Exception as e:
                logger.error(f"❌ 处理图像文件失败 {filename}: {e}")

        if not processed_files:
            return jsonify({
                'success': False,
                'message': '没有有效的图像文件'
            }), 400

        # Index build is best-effort: a failure is reported through
        # index_status / auto_indexed but does not fail the upload itself.
        indexed = False
        index_status = "索引构建失败"
        try:
            image_paths = [item['temp_path'] for item in temp_files_for_model if item['temp_path']]

            if image_paths:
                with system_lock:
                    retrieval_system.build_image_index_parallel(image_paths)

                # Record one metadata entry per processed file.
                mongodb_mgr = get_mongodb_manager()
                for file_info in processed_files:
                    mongodb_mgr.store_vector_metadata(
                        file_id=file_info["file_id"],
                        vector_type="image_vector",
                        vdb_id=file_info["bos_key"],
                        vector_info={"filename": file_info["filename"]}
                    )

                logger.info("✅ 图像索引自动构建完成")
                index_status = "索引已自动构建"
                indexed = True
        except Exception as e:
            logger.error(f"❌ 自动构建图像索引失败: {e}")
            index_status = f"索引构建失败: {str(e)}"
        finally:
            # Single cleanup path for success and failure. Each removal is
            # best-effort so one stuck file cannot leak the remaining ones.
            for item in temp_files_for_model:
                temp_path = item['temp_path']
                if not temp_path:
                    continue
                try:
                    file_handler.cleanup_temp_file(temp_path)
                except Exception as cleanup_error:
                    logger.warning(f"⚠️ 临时文件清理失败 {temp_path}: {cleanup_error}")

        return jsonify({
            'success': True,
            'message': f'成功上传 {len(processed_files)} 个图像文件',
            'count': len(processed_files),
            'processed_files': len(processed_files),
            'index_status': index_status,
            # Previously hard-coded to True even when indexing had failed.
            'auto_indexed': indexed,
            'processing_methods': [f['processing_method'] for f in processed_files],
            'storage_info': {
                'bos_stored': len(processed_files),
                'mongodb_stored': len(processed_files)
            }
        })

    except Exception as e:
        logger.error(f"❌ 图像上传失败: {e}")
        # Last-resort sweep so an unexpected failure does not leave temp files.
        file_handler = get_file_handler()
        file_handler.cleanup_all_temp_files()
        return jsonify({
            'success': False,
            'message': f'图像上传失败: {str(e)}'
        }), 500
|
||
|
||
@app.route('/api/search/image_to_text', methods=['POST'])
def search_image_to_text():
    """Search indexed texts using an uploaded query image.

    Expects a multipart form with an ``image`` file and an optional integer
    ``top_k`` (default 5). The upload is written to a temp file for the
    model, and that file is removed once the response has been built.
    Responds with the matches (text + score) and the query image as a
    base64 data URI; 400 for invalid input, 500 on failure.
    """
    global retrieval_system

    try:
        if retrieval_system is None:
            not_ready = {'success': False, 'message': '系统未初始化'}
            return jsonify(not_ready), 400

        top_k = request.form.get('top_k', 5, type=int)

        if 'image' not in request.files:
            missing = {'success': False, 'message': '没有上传图像'}
            return jsonify(missing), 400

        query_file = request.files['image']
        if not (query_file and allowed_file(query_file.filename)):
            invalid = {'success': False, 'message': '无效的图像文件'}
            return jsonify(invalid), 400

        handler = get_file_handler()

        # The model reads from disk, so materialize the upload as a temp file.
        safe_name = secure_filename(query_file.filename)
        temp_path = handler.get_temp_file_for_model(query_file, safe_name)
        if not temp_path:
            no_temp = {'success': False, 'message': '临时文件创建失败'}
            return jsonify(no_temp), 500

        try:
            with system_lock:
                matches = retrieval_system.search_image_to_text(temp_path, top_k=top_k)

            formatted_results = [
                {'text': text, 'score': float(score), 'type': 'text'}
                for text, score in matches
            ]

            # Echo the query image back so the client can display it.
            query_image_base64 = encode_image_to_base64(temp_path)

            payload = {
                'success': True,
                'query_image': query_image_base64,
                'results': formatted_results,
                'count': len(formatted_results),
                'search_type': 'image_to_text',
                'processing_method': 'optimized_temp_file',
            }
            return jsonify(payload)
        finally:
            # Always remove the temp file, including on search errors.
            handler.cleanup_temp_file(temp_path)

    except Exception as exc:
        logger.error(f"❌ 图搜文失败: {exc}")
        failure = {'success': False, 'message': f'图搜文失败: {str(exc)}'}
        return jsonify(failure), 500
|
||
|
||
@app.route('/api/search/image_to_image', methods=['POST'])
def search_image_to_image():
    """Search indexed images using an uploaded query image.

    Expects a multipart form with an ``image`` file and an optional integer
    ``top_k`` (default 5). The upload is written to a temp file for the
    model, and that file is removed once the response has been built.
    Each match is returned with its path, a base64 data URI of the image and
    its score; 400 for invalid input, 500 on failure.
    """
    global retrieval_system

    try:
        if retrieval_system is None:
            not_ready = {'success': False, 'message': '系统未初始化'}
            return jsonify(not_ready), 400

        top_k = request.form.get('top_k', 5, type=int)

        if 'image' not in request.files:
            missing = {'success': False, 'message': '没有上传图像'}
            return jsonify(missing), 400

        query_file = request.files['image']
        if not (query_file and allowed_file(query_file.filename)):
            invalid = {'success': False, 'message': '无效的图像文件'}
            return jsonify(invalid), 400

        handler = get_file_handler()

        # The model reads from disk, so materialize the upload as a temp file.
        safe_name = secure_filename(query_file.filename)
        temp_path = handler.get_temp_file_for_model(query_file, safe_name)
        if not temp_path:
            no_temp = {'success': False, 'message': '临时文件创建失败'}
            return jsonify(no_temp), 500

        try:
            with system_lock:
                matches = retrieval_system.search_image_to_image(temp_path, top_k=top_k)

            formatted_results = [
                {
                    'image_path': image_path,
                    'image_base64': encode_image_to_base64(image_path),
                    'score': float(score),
                    'type': 'image',
                }
                for image_path, score in matches
            ]

            # Echo the query image back so the client can display it.
            query_image_base64 = encode_image_to_base64(temp_path)

            payload = {
                'success': True,
                'query_image': query_image_base64,
                'results': formatted_results,
                'count': len(formatted_results),
                'search_type': 'image_to_image',
                'processing_method': 'optimized_temp_file',
            }
            return jsonify(payload)
        finally:
            # Always remove the temp file, including on search errors.
            handler.cleanup_temp_file(temp_path)

    except Exception as exc:
        logger.error(f"❌ 图搜图失败: {exc}")
        failure = {'success': False, 'message': f'图搜图失败: {str(exc)}'}
        return jsonify(failure), 500
|
||
|
||
@app.route('/api/data/list', methods=['GET'])
def list_data():
    """List stored images and texts from the MongoDB file metadata.

    Image records contribute a display name: the basename of ``file_path``
    when present, otherwise ``bos_key``. Text records contribute
    ``additional_info.text_content`` when it is non-empty.

    Responses:
        400 - system not initialized.
        200 - ``images``, ``texts`` and their counts plus ``total_files``.
        500 - the metadata could not be read.
    """
    global retrieval_system

    try:
        if retrieval_system is None:
            return jsonify({
                'success': False,
                'message': '系统未初始化'
            }), 400

        mongodb_mgr = get_mongodb_manager()
        files_data = mongodb_mgr.get_all_files()

        images = []
        texts = []

        for file_data in files_data:
            file_type = file_data.get('file_type')
            if file_type == 'image':
                file_path = file_data.get('file_path', '')
                filename = os.path.basename(file_path) if file_path else file_data.get('bos_key', '')
                images.append(filename)
            elif file_type == 'text':
                # `or {}` also covers records where additional_info is stored
                # as null; .get(key, {}) alone would return None and the
                # following .get would fail the whole listing.
                additional_info = file_data.get('additional_info') or {}
                text_content = additional_info.get('text_content', '')
                if text_content:
                    texts.append(text_content)

        return jsonify({
            'success': True,
            'images': images,
            'texts': texts,
            'total_files': len(files_data),
            'image_count': len(images),
            'text_count': len(texts)
        })

    except Exception as e:
        logger.error(f"❌ 获取数据列表失败: {e}")
        return jsonify({
            'success': False,
            'message': f'获取数据列表失败: {str(e)}'
        }), 500
|
||
|
||
@app.route('/api/stats', methods=['GET'])
@app.route('/api/status', methods=['GET'])
@app.route('/api/data/stats', methods=['GET'])
def get_stats():
    """Report retrieval-system, MongoDB and GPU statistics.

    The retrieval-system statistics are extended with the MongoDB stats and
    a fixed description of the storage backends. Responds with 400 when the
    system is not initialized and 500 when any lookup fails.
    """
    global retrieval_system

    try:
        if retrieval_system is None:
            not_ready = {'success': False, 'message': '系统未初始化'}
            return jsonify(not_ready), 400

        mongodb_stats = get_mongodb_manager().get_stats()

        # Both backend reads happen under the lock shared with the writers.
        with system_lock:
            stats = retrieval_system.get_statistics()
            gpu_info = retrieval_system.get_gpu_memory_info()

        # Start from the backend stats, then layer the extra sections on top.
        enhanced_stats = {**stats}
        enhanced_stats.update({
            'mongodb_stats': mongodb_stats,
            'storage_backend': {
                'metadata': 'MongoDB',
                'files': 'Baidu BOS',
            },
        })

        payload = {
            'success': True,
            'stats': enhanced_stats,
            'gpu_info': gpu_info,
            'backend': 'Baidu VDB + MongoDB + BOS',
        }
        return jsonify(payload)

    except Exception as exc:
        logger.error(f"❌ 获取统计信息失败: {exc}")
        failure = {'success': False, 'message': f'获取统计信息失败: {str(exc)}'}
        return jsonify(failure), 500
|
||
|
||
@app.route('/api/clear', methods=['POST'])
def clear_data():
    """Clear the retrieval system's data and remove all temp files.

    Calls ``clear_all_data()`` on the retrieval system under ``system_lock``
    and then asks the file handler to remove every temp file. Responds with
    400 when the system is not initialized and 500 on failure.
    """
    global retrieval_system

    try:
        if retrieval_system is None:
            not_ready = {'success': False, 'message': '系统未初始化'}
            return jsonify(not_ready), 400

        with system_lock:
            retrieval_system.clear_all_data()

        # Temp files are swept as well so nothing stale survives the reset.
        get_file_handler().cleanup_all_temp_files()

        done = {'success': True, 'message': '所有数据已清空,临时文件已清理'}
        return jsonify(done)

    except Exception as exc:
        logger.error(f"❌ 清空数据失败: {exc}")
        failure = {'success': False, 'message': f'清空数据失败: {str(exc)}'}
        return jsonify(failure), 500
|
||
|
||
@app.route('/uploads/<filename>')
def uploaded_file(filename):
    """Serve *filename* from the upload directory (``UPLOAD_FOLDER``)."""
    response = send_from_directory(UPLOAD_FOLDER, filename)
    return response
|
||
|
||
@app.errorhandler(413)
def too_large(e):
    """Answer HTTP 413 with a JSON body stating the 16MB upload limit."""
    body = {'success': False, 'message': '文件过大,最大支持16MB'}
    return jsonify(body), 413
|
||
|
||
@app.errorhandler(500)
def internal_error(e):
    """Answer HTTP 500 with a generic JSON error body."""
    body = {'success': False, 'message': '内部服务器错误'}
    return jsonify(body), 500
|
||
|
||
def init_app():
    """Print the startup banner and eagerly create the retrieval system.

    GPU details are printed when ``torch`` is importable and CUDA is
    available. ``torch`` is imported here rather than read as a global,
    because the only other import of it sits under the ``__main__`` guard;
    calling this function from an importing process (e.g. a WSGI server)
    would otherwise raise ``NameError``.

    A failure to construct the retrieval system is logged and swallowed, so
    the app still starts and ``/api/init`` can create it later.
    """
    global retrieval_system

    separator = "=" * 60
    print(separator)
    print("生产级多模态检索Web应用 (优化版)")
    print("Backend: 纯百度VDB (无FAISS)")
    print("Features: 自动清理 + 内存处理 + 流式上传")
    print(separator)

    # GPU report is informational only; a missing torch must not block startup.
    try:
        import torch
    except ImportError:
        torch = None
        logger.warning("torch 未安装,跳过GPU信息显示")

    if torch is not None and torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        print(f"检测到 {gpu_count} 个GPU:")
        for i in range(gpu_count):
            # Query the device properties once per GPU.
            props = torch.cuda.get_device_properties(i)
            gpu_memory = props.total_memory / (1024**3)
            print(f" GPU {i}: {props.name} ({gpu_memory:.1f}GB)")

    print(separator)

    try:
        logger.info("🚀 启动时自动初始化优化版VDB检索系统...")
        retrieval_system = MultimodalRetrievalVDBOnly()
        logger.info("✅ 优化版系统自动初始化成功")

        # Called for its side effect of creating the handler up front; the
        # return value is not needed here.
        get_file_handler()
        logger.info("✅ 优化文件处理器初始化成功")

    except Exception as e:
        logger.error(f"❌ 系统自动初始化失败: {e}")
        logger.info("系统将在首次API调用时初始化")
|
||
|
||
if __name__ == '__main__':
    # Imported at module level on purpose: init_app() looks up `torch` as a
    # module global when it prints the GPU summary.
    import torch

    # Banner + eager backend initialization.
    init_app()

    # Listen on all interfaces, port 5000, threaded, debugger off.
    server_options = {
        'host': '0.0.0.0',
        'port': 5000,
        'debug': False,
        'threaded': True,
    }
    app.run(**server_options)
|