#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Baidu Object Storage (BOS) manager.

Stores and manages the raw data of multimodal files in a BOS bucket.
"""

import copy
import json
import logging
import os
import shutil
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from baidubce import bce_base_client, bce_client_configuration
from baidubce.auth import bce_credentials
from baidubce.exception import BceError
from baidubce.services.bos.bos_client import BosClient

logger = logging.getLogger(__name__)

# SECURITY: these credentials were hard-coded in the original source and are
# kept ONLY as a last-resort fallback so existing callers keep working.
# They should be rotated and removed; supply credentials through constructor
# arguments or the BOS_ACCESS_KEY / BOS_SECRET_KEY environment variables.
_LEGACY_DEFAULT_AK = "ALTAKmzKDy1OqhqmepD2OeXqbN"
_LEGACY_DEFAULT_SK = "b79c5fbc26344868916ec6e9e2ff65f0"
_DEFAULT_ENDPOINT = "https://bj.bcebos.com"
_DEFAULT_BUCKET = "dmtyz-demo"


class BaiduBOSManager:
    """Thin wrapper around ``BosClient`` bound to a single bucket."""

    # Extension -> MIME type table used by ``_detect_content_type``.
    _CONTENT_TYPES = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.webp': 'image/webp',
        '.txt': 'text/plain',
        '.json': 'application/json',
        '.pdf': 'application/pdf',
    }
    # Extensions routed to the "images/" and "texts/" key prefixes.
    _IMAGE_EXTS = frozenset({'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'})
    _TEXT_EXTS = frozenset({'.txt', '.json', '.csv', '.md'})

    def __init__(self, ak: Optional[str] = None, sk: Optional[str] = None,
                 endpoint: Optional[str] = None, bucket: Optional[str] = None):
        """Create the manager and eagerly connect to BOS.

        Each setting is resolved in this order: explicit argument, then
        environment variable, then the built-in default.

        Args:
            ak: Access Key (env: ``BOS_ACCESS_KEY``).
            sk: Secret Key (env: ``BOS_SECRET_KEY``).
            endpoint: BOS service endpoint (env: ``BOS_ENDPOINT``).
            bucket: Bucket name (env: ``BOS_BUCKET``).

        Raises:
            Exception: whatever the SDK raises if the client cannot be
                created or the connection test fails.
        """
        env = os.environ
        self.ak = ak or env.get("BOS_ACCESS_KEY") or _LEGACY_DEFAULT_AK
        self.sk = sk or env.get("BOS_SECRET_KEY") or _LEGACY_DEFAULT_SK
        self.endpoint = endpoint or env.get("BOS_ENDPOINT") or _DEFAULT_ENDPOINT
        self.bucket = bucket or env.get("BOS_BUCKET") or _DEFAULT_BUCKET

        if self.ak == _LEGACY_DEFAULT_AK or self.sk == _LEGACY_DEFAULT_SK:
            logger.warning(
                "Using credentials embedded in source code; pass ak/sk or set "
                "BOS_ACCESS_KEY / BOS_SECRET_KEY instead."
            )

        self.client = None
        self._init_client()

    def _build_config(self):
        """Build an SDK client configuration from the stored settings."""
        return bce_client_configuration.BceClientConfiguration(
            credentials=bce_credentials.BceCredentials(self.ak, self.sk),
            endpoint=self.endpoint,
        )

    def _object_url(self, key: str) -> str:
        """Return the ``endpoint/bucket/key`` URL string for *key*."""
        return f"{self.endpoint.rstrip('/')}/{self.bucket}/{key}"

    def _init_client(self):
        """Create the BOS client and verify that it can reach the service."""
        try:
            self.client = BosClient(self._build_config())
            # Fail fast: surface bad credentials/endpoint at construction time.
            self._test_connection()
            logger.info("✅ BOS客户端初始化成功: %s", self.bucket)
        except Exception as e:
            logger.error("❌ BOS客户端初始化失败: %s", e)
            raise

    def _test_connection(self):
        """Check connectivity by listing buckets; re-raise on failure."""
        try:
            self.client.list_buckets()
            logger.info("✅ BOS连接测试成功")
        except Exception as e:
            logger.error("❌ BOS连接测试失败: %s", e)
            raise

    def upload_file(self, local_path: str, bos_key: str = None,
                    content_type: str = None) -> Dict[str, Any]:
        """Upload a local file to BOS.

        Args:
            local_path: Path of the local file.
            bos_key: Object key; generated from the file name when ``None``.
            content_type: MIME type; detected from the extension when ``None``.

        Returns:
            Dict with ``bos_key``, ``bucket``, ``file_size``, ``content_type``,
            ``upload_time`` (naive UTC, ISO format), ``etag`` and ``url``.

        Raises:
            FileNotFoundError: if *local_path* does not exist.
            Exception: any SDK error is logged and re-raised.
        """
        try:
            if not os.path.exists(local_path):
                raise FileNotFoundError(f"文件不存在: {local_path}")

            if bos_key is None:
                bos_key = self._generate_bos_key(local_path)
            if content_type is None:
                content_type = self._detect_content_type(local_path)

            response = self.client.put_object_from_file(
                self.bucket,
                bos_key,
                local_path,
                content_type=content_type,
            )

            result = {
                "bos_key": bos_key,
                "bucket": self.bucket,
                "file_size": os.stat(local_path).st_size,
                "content_type": content_type,
                # Same string as the deprecated datetime.utcnow().isoformat().
                "upload_time": datetime.now(timezone.utc)
                .replace(tzinfo=None)
                .isoformat(),
                "etag": response.metadata.etag if hasattr(response, 'metadata') else None,
                "url": self._object_url(bos_key),
            }
            logger.info("✅ 文件上传成功: %s", bos_key)
            return result
        except Exception as e:
            logger.error("❌ 文件上传失败: %s", e)
            raise

    def download_file(self, bos_key: str, local_path: str) -> bool:
        """Download an object from BOS to a local file.

        Args:
            bos_key: Object key.
            local_path: Destination path; parent directories are created.

        Returns:
            ``True`` on success, ``False`` if any error occurred (logged).
        """
        try:
            # A bare file name has no directory part; os.makedirs("") raises.
            parent_dir = os.path.dirname(local_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            response = self.client.get_object(self.bucket, bos_key)
            stream = response.data
            try:
                with open(local_path, 'wb') as f:
                    # Chunked copy; avoids line-splitting binary payloads.
                    shutil.copyfileobj(stream, f)
            finally:
                # Release the underlying HTTP connection.
                close = getattr(stream, "close", None)
                if callable(close):
                    close()

            logger.info("✅ 文件下载成功: %s -> %s", bos_key, local_path)
            return True
        except Exception as e:
            logger.error("❌ 文件下载失败: %s", e)
            return False

    def get_object_metadata(self, bos_key: str) -> Optional[Dict]:
        """Fetch an object's metadata.

        Args:
            bos_key: Object key.

        Returns:
            Dict with ``bos_key``, ``bucket``, ``content_length``,
            ``content_type``, ``etag``, ``last_modified`` and ``url``, or
            ``None`` if the lookup failed (logged).
        """
        try:
            meta = self.client.get_object_meta_data(self.bucket, bos_key).metadata
            return {
                "bos_key": bos_key,
                "bucket": self.bucket,
                "content_length": meta.content_length,
                "content_type": meta.content_type,
                "etag": meta.etag,
                "last_modified": meta.last_modified,
                "url": self._object_url(bos_key),
            }
        except Exception as e:
            logger.error("❌ 获取对象元数据失败: %s", e)
            return None

    def delete_object(self, bos_key: str) -> bool:
        """Delete an object.

        Args:
            bos_key: Object key.

        Returns:
            ``True`` on success, ``False`` if any error occurred (logged).
        """
        try:
            self.client.delete_object(self.bucket, bos_key)
            logger.info("✅ 对象删除成功: %s", bos_key)
            return True
        except Exception as e:
            logger.error("❌ 对象删除失败: %s", e)
            return False

    def list_objects(self, prefix: str = "", max_keys: int = 1000) -> List[Dict]:
        """List objects in the bucket.

        Args:
            prefix: Object key prefix filter.
            max_keys: Maximum number of entries requested from the service.

        Returns:
            List of dicts with ``key``, ``size``, ``last_modified``, ``etag``
            and ``url``; an empty list if the call failed (logged).
        """
        try:
            response = self.client.list_objects(
                bucket_name=self.bucket,
                prefix=prefix,
                max_keys=max_keys,
            )
            if not hasattr(response, 'contents'):
                return []
            return [
                {
                    "key": obj.key,
                    "size": obj.size,
                    "last_modified": obj.last_modified,
                    "etag": obj.etag,
                    "url": self._object_url(obj.key),
                }
                for obj in response.contents
            ]
        except Exception as e:
            logger.error("❌ 列出对象失败: %s", e)
            return []

    def restore_archive_object(self, bos_key: str, days: int = 1,
                               tier: str = "Standard") -> bool:
        """Request restoration of an archived object.

        Args:
            bos_key: Object key.
            days: Number of days the restored copy is requested for.
            tier: Restore tier (Expedited/Standard/Bulk).

        Returns:
            ``True`` if the request was sent, ``False`` on any error (logged).
        """
        try:
            # Restore goes through a custom client rather than ``self.client``.
            restore_client = ArchiveRestoreClient(self._build_config())
            restore_client.restore_object(self.bucket, bos_key, days, tier)
            logger.info("✅ 归档恢复请求已发送: %s", bos_key)
            return True
        except Exception as e:
            logger.error("❌ 归档恢复失败: %s", e)
            return False

    def _generate_bos_key(self, local_path: str) -> str:
        """Build ``<category>/<UTC timestamp>_<file name>`` for *local_path*.

        The category is ``images``, ``texts`` or ``files`` depending on the
        file extension.
        """
        filename = os.path.basename(local_path)
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

        if self._is_image_file(local_path):
            category = "images"
        elif self._is_text_file(local_path):
            category = "texts"
        else:
            category = "files"
        return f"{category}/{timestamp}_{filename}"

    @staticmethod
    def _extension(file_path: str) -> str:
        """Return the lower-cased extension of *file_path* (with the dot)."""
        return os.path.splitext(file_path)[1].lower()

    def _detect_content_type(self, file_path: str) -> str:
        """Map the file extension to a MIME type.

        Unknown extensions yield ``application/octet-stream``.
        """
        return self._CONTENT_TYPES.get(
            self._extension(file_path), 'application/octet-stream'
        )

    def _is_image_file(self, file_path: str) -> bool:
        """Return whether the extension is a known image extension."""
        return self._extension(file_path) in self._IMAGE_EXTS

    def _is_text_file(self, file_path: str) -> bool:
        """Return whether the extension is a known text extension."""
        return self._extension(file_path) in self._TEXT_EXTS


class ArchiveRestoreClient(bce_base_client.BceBaseClient):
    """Client that sends the archive-restore request for an object."""

    def __init__(self, config):
        """Merge *config* over a copy of the SDK default configuration.

        NOTE(review): the base-class ``__init__`` is intentionally not called
        here, matching the original code — confirm against the SDK whether
        that is required for a client defined outside the SDK package.
        """
        self.config = copy.deepcopy(bce_client_configuration.DEFAULT_CONFIG)
        self.config.merge_non_none_values(config)

    def restore_object(self, bucket: str, key: str, days: int = 1,
                       tier: str = "Standard"):
        """Send ``POST /<bucket>/<key>?restore`` with restore headers.

        Args:
            bucket: Bucket name.
            key: Object key.
            days: Value of the ``x-bce-restore-days`` header.
            tier: Value of the ``x-bce-restore-tier`` header.

        Returns:
            Whatever ``_send_request`` returns.

        NOTE(review): ``_send_request`` is not defined in this file; it is
        presumably expected from the SDK base class — verify it exists there.
        """
        path = f'/{bucket}/{key}'.encode('utf-8')
        headers = {
            b'x-bce-restore-days': str(days).encode('utf-8'),
            b'x-bce-restore-tier': tier.encode('utf-8'),
            b'Accept': b'application/json',
        }
        params = {"restore": ""}
        payload = json.dumps({}, ensure_ascii=False)
        return self._send_request(
            b'POST', path, headers, params, payload.encode('utf-8')
        )


# Module-level singleton, created lazily by get_bos_manager().
bos_manager = None


def get_bos_manager() -> BaiduBOSManager:
    """Return the shared ``BaiduBOSManager``, creating it on first use."""
    global bos_manager
    if bos_manager is None:
        bos_manager = BaiduBOSManager()
    return bos_manager