#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ 优化的文件处理器 支持自动清理、内存处理和流式上传 """ import os import io import tempfile import logging import uuid from contextlib import contextmanager from typing import Dict, List, Optional, Any, Union, BinaryIO from pathlib import Path from PIL import Image import numpy as np from baidu_bos_manager import get_bos_manager from mongodb_manager import get_mongodb_manager logger = logging.getLogger(__name__) class OptimizedFileHandler: """优化的文件处理器""" # 小文件阈值 (5MB) SMALL_FILE_THRESHOLD = 5 * 1024 * 1024 # 支持的图像格式 SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'} def __init__(self): self.bos_manager = get_bos_manager() self.mongodb_manager = get_mongodb_manager() self.temp_files = set() # 跟踪临时文件 @contextmanager def temp_file_context(self, suffix: str = None, delete_on_exit: bool = True): """临时文件上下文管理器,确保自动清理""" temp_fd, temp_path = tempfile.mkstemp(suffix=suffix) self.temp_files.add(temp_path) try: os.close(temp_fd) # 关闭文件描述符 yield temp_path finally: if delete_on_exit and os.path.exists(temp_path): try: os.unlink(temp_path) self.temp_files.discard(temp_path) logger.debug(f"🗑️ 临时文件已清理: {temp_path}") except Exception as e: logger.warning(f"⚠️ 临时文件清理失败: {temp_path}, {e}") def cleanup_all_temp_files(self): """清理所有跟踪的临时文件""" for temp_path in list(self.temp_files): if os.path.exists(temp_path): try: os.unlink(temp_path) logger.debug(f"🗑️ 清理临时文件: {temp_path}") except Exception as e: logger.warning(f"⚠️ 清理临时文件失败: {temp_path}, {e}") self.temp_files.clear() def get_file_size(self, file_obj) -> int: """获取文件大小""" if hasattr(file_obj, 'content_length') and file_obj.content_length: return file_obj.content_length # 通过读取内容获取大小 current_pos = file_obj.tell() file_obj.seek(0, 2) # 移动到文件末尾 size = file_obj.tell() file_obj.seek(current_pos) # 恢复原位置 return size def is_small_file(self, file_obj) -> bool: """判断是否为小文件""" return self.get_file_size(file_obj) <= self.SMALL_FILE_THRESHOLD def process_image_in_memory(self, file_obj, filename: str) -> Optional[Dict[str, Any]]: """在内存中处理小图像文件""" try: # 读取文件内容到内存 file_obj.seek(0) file_content = file_obj.read() file_obj.seek(0) # 验证图像格式 try: image = Image.open(io.BytesIO(file_content)) image.verify() # 验证图像完整性 except Exception as e: logger.error(f"❌ 图像验证失败: {filename}, {e}") return None # 生成唯一ID和BOS键 file_id = str(uuid.uuid4()) bos_key = f"images/memory_{file_id}_{filename}" # 直接上传到BOS(从内存) bos_result = self._upload_to_bos_from_memory( file_content, bos_key, filename ) if not bos_result: return None # 存储元数据到MongoDB metadata = { "_id": file_id, "filename": filename, "file_type": "image", "file_size": len(file_content), "processing_method": "memory", "bos_key": bos_key, "bos_url": bos_result["url"] } self.mongodb_manager.store_file_metadata(metadata=metadata) logger.info(f"✅ 内存处理图像成功: {filename} ({len(file_content)} bytes)") return { "file_id": file_id, "filename": filename, "bos_key": bos_key, "bos_result": bos_result, "processing_method": "memory" } except Exception as e: logger.error(f"❌ 内存处理图像失败: {filename}, {e}") return None def process_image_with_temp_file(self, file_obj, filename: str) -> Optional[Dict[str, Any]]: """使用临时文件处理大图像文件""" try: # 获取文件扩展名 ext = os.path.splitext(filename)[1].lower() with self.temp_file_context(suffix=ext) as temp_path: # 保存到临时文件 file_obj.seek(0) with open(temp_path, 'wb') as temp_file: temp_file.write(file_obj.read()) # 验证图像 try: with Image.open(temp_path) as image: image.verify() except Exception as e: logger.error(f"❌ 图像验证失败: {filename}, {e}") return None # 生成唯一ID和BOS键 file_id = str(uuid.uuid4()) bos_key = f"images/temp_{file_id}_{filename}" # 上传到BOS bos_result = self.bos_manager.upload_file(temp_path, bos_key) # 存储元数据到MongoDB file_stat = os.stat(temp_path) metadata = { "_id": file_id, "filename": filename, "file_type": "image", "file_size": file_stat.st_size, "processing_method": "temp_file", "bos_key": bos_key, "bos_url": bos_result["url"] } self.mongodb_manager.store_file_metadata(metadata=metadata) logger.info(f"✅ 临时文件处理图像成功: {filename} ({file_stat.st_size} bytes)") return { "file_id": file_id, "filename": filename, "bos_key": bos_key, "bos_result": bos_result, "processing_method": "temp_file", "temp_path": temp_path # 返回临时路径供模型处理 } except Exception as e: logger.error(f"❌ 临时文件处理图像失败: {filename}, {e}") return None def process_image_smart(self, file_obj, filename: str) -> Optional[Dict[str, Any]]: """智能处理图像文件(自动选择内存或临时文件)""" if self.is_small_file(file_obj): logger.info(f"📦 小文件内存处理: {filename}") return self.process_image_in_memory(file_obj, filename) else: logger.info(f"📁 大文件临时处理: {filename}") return self.process_image_with_temp_file(file_obj, filename) def process_text_in_memory(self, texts: List[str]) -> List[Dict[str, Any]]: """在内存中处理文本数据""" processed_texts = [] for i, text in enumerate(texts): try: # 生成唯一ID和BOS键 file_id = str(uuid.uuid4()) bos_key = f"texts/memory_{file_id}.txt" # 将文本转换为字节 text_bytes = text.encode('utf-8') # 直接上传到BOS bos_result = self._upload_to_bos_from_memory( text_bytes, bos_key, f"text_{i}.txt" ) if bos_result: # 存储元数据到MongoDB metadata = { "_id": file_id, "filename": f"text_{i}.txt", "file_type": "text", "file_size": len(text_bytes), "processing_method": "memory", "bos_key": bos_key, "bos_url": bos_result["url"], "text_content": text } self.mongodb_manager.store_file_metadata(metadata=metadata) processed_texts.append({ "file_id": file_id, "text_content": text, "bos_key": bos_key, "bos_result": bos_result }) logger.info(f"✅ 内存处理文本成功: text_{i} ({len(text_bytes)} bytes)") except Exception as e: logger.error(f"❌ 内存处理文本失败 {i}: {e}") return processed_texts def download_from_bos_for_processing(self, bos_key: str, local_filename: str = None) -> Optional[str]: """从BOS下载文件用于模型处理""" try: # 生成临时文件路径 if local_filename: ext = os.path.splitext(local_filename)[1] else: ext = os.path.splitext(bos_key)[1] with self.temp_file_context(suffix=ext, delete_on_exit=False) as temp_path: # 从BOS下载文件 success = self.bos_manager.download_file(bos_key, temp_path) if success: logger.info(f"✅ 从BOS下载文件用于处理: {bos_key}") return temp_path else: logger.error(f"❌ 从BOS下载文件失败: {bos_key}") return None except Exception as e: logger.error(f"❌ 从BOS下载文件异常: {bos_key}, {e}") return None def _upload_to_bos_from_memory(self, content: bytes, bos_key: str, filename: str) -> Optional[Dict[str, Any]]: """从内存直接上传到BOS""" try: # 创建临时文件用于上传 with self.temp_file_context() as temp_path: with open(temp_path, 'wb') as temp_file: temp_file.write(content) # 上传到BOS result = self.bos_manager.upload_file(temp_path, bos_key) return result except Exception as e: logger.error(f"❌ 内存上传到BOS失败: {filename}, {e}") return None def get_temp_file_for_model(self, file_obj, filename: str) -> Optional[str]: """为模型处理获取临时文件路径(确保文件存在于本地)""" try: ext = os.path.splitext(filename)[1].lower() # 创建临时文件(不自动删除,供模型使用) temp_fd, temp_path = tempfile.mkstemp(suffix=ext) self.temp_files.add(temp_path) try: # 写入文件内容 file_obj.seek(0) with os.fdopen(temp_fd, 'wb') as temp_file: temp_file.write(file_obj.read()) logger.debug(f"📁 为模型创建临时文件: {temp_path}") return temp_path except Exception as e: os.close(temp_fd) raise e except Exception as e: logger.error(f"❌ 为模型创建临时文件失败: {filename}, {e}") return None def cleanup_temp_file(self, temp_path: str): """清理指定的临时文件""" if temp_path and os.path.exists(temp_path): try: os.unlink(temp_path) self.temp_files.discard(temp_path) logger.debug(f"🗑️ 清理临时文件: {temp_path}") except Exception as e: logger.warning(f"⚠️ 清理临时文件失败: {temp_path}, {e}") # 全局实例 file_handler = None def get_file_handler() -> OptimizedFileHandler: """获取优化文件处理器实例""" global file_handler if file_handler is None: file_handler = OptimizedFileHandler() return file_handler