mmeb/baidu_bos_manager.py
2025-09-01 11:24:01 +00:00

343 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
百度对象存储BOS管理器
用于存储和管理多模态文件的原始数据
"""
import os
import logging
import json
import copy
from datetime import datetime
from typing import Dict, List, Optional, Any, Union
from pathlib import Path
from baidubce.auth import bce_credentials
from baidubce import bce_base_client, bce_client_configuration
from baidubce.services.bos.bos_client import BosClient
from baidubce.exception import BceError
logger = logging.getLogger(__name__)
class BaiduBOSManager:
    """Manager for Baidu Object Storage (BOS).

    Wraps a ``BosClient`` bound to a single bucket and offers upload,
    download, metadata, delete, list and archive-restore helpers for the raw
    data of multimodal files.

    Credentials are never embedded in source code: pass ``ak``/``sk``
    explicitly, or export them through the ``BOS_ACCESS_KEY_ID`` and
    ``BOS_SECRET_ACCESS_KEY`` environment variables.
    """

    # Environment variables consulted when ak/sk are not passed explicitly.
    ENV_ACCESS_KEY = "BOS_ACCESS_KEY_ID"
    ENV_SECRET_KEY = "BOS_SECRET_ACCESS_KEY"

    # Non-secret defaults used when endpoint/bucket are not supplied.
    DEFAULT_ENDPOINT = "https://bj.bcebos.com"
    DEFAULT_BUCKET = "dmtyz-demo"

    # Extension tables shared by the classification helpers below.
    _IMAGE_EXTENSIONS = frozenset({'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'})
    _TEXT_EXTENSIONS = frozenset({'.txt', '.json', '.csv', '.md'})
    _CONTENT_TYPES = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.webp': 'image/webp',
        '.txt': 'text/plain',
        '.json': 'application/json',
        '.pdf': 'application/pdf',
    }
    _FALLBACK_CONTENT_TYPE = 'application/octet-stream'

    def __init__(self, ak: Optional[str] = None, sk: Optional[str] = None,
                 endpoint: Optional[str] = None, bucket: Optional[str] = None):
        """Create the manager and eagerly connect to BOS.

        Args:
            ak: Access key. Falls back to ``$BOS_ACCESS_KEY_ID``.
            sk: Secret key. Falls back to ``$BOS_SECRET_ACCESS_KEY``.
            endpoint: BOS service endpoint. Defaults to ``DEFAULT_ENDPOINT``.
            bucket: Bucket name. Defaults to ``DEFAULT_BUCKET``.

        Raises:
            ValueError: If no access key or secret key can be resolved.
            Exception: Whatever the SDK raises while creating the client or
                running the connection test.
        """
        self.ak = ak or os.environ.get(self.ENV_ACCESS_KEY)
        self.sk = sk or os.environ.get(self.ENV_SECRET_KEY)
        # Fail fast with an actionable message instead of shipping secrets
        # as in-source defaults.
        if not self.ak or not self.sk:
            raise ValueError(
                "BOS credentials are missing: pass ak/sk or set "
                f"{self.ENV_ACCESS_KEY} and {self.ENV_SECRET_KEY}"
            )
        self.endpoint = endpoint or self.DEFAULT_ENDPOINT
        self.bucket = bucket or self.DEFAULT_BUCKET
        self.client = None
        self._init_client()

    def _build_config(self):
        """Build the BCE client configuration from the stored credentials."""
        return bce_client_configuration.BceClientConfiguration(
            credentials=bce_credentials.BceCredentials(self.ak, self.sk),
            endpoint=self.endpoint,
        )

    def _object_url(self, bos_key: str) -> str:
        """Return the ``endpoint/bucket/key`` URL reported for an object."""
        # Strip a trailing slash so a configured "https://host/" does not
        # produce "https://host//bucket/...".
        return f"{self.endpoint.rstrip('/')}/{self.bucket}/{bos_key}"

    def _init_client(self):
        """Create the ``BosClient`` and verify that it can reach the service.

        Raises:
            Exception: Re-raised after logging if creation or the
                connection test fails.
        """
        try:
            self.client = BosClient(self._build_config())
            # Probe the service right away so bad credentials surface here.
            self._test_connection()
            logger.info(f"✅ BOS客户端初始化成功: {self.bucket}")
        except Exception as e:
            logger.error(f"❌ BOS客户端初始化失败: {e}")
            raise

    def _test_connection(self):
        """Check connectivity by listing buckets; re-raise on failure."""
        try:
            self.client.list_buckets()
            logger.info("✅ BOS连接测试成功")
        except Exception as e:
            logger.error(f"❌ BOS连接测试失败: {e}")
            raise

    def upload_file(self, local_path: str, bos_key: str = None,
                    content_type: str = None) -> Dict[str, Any]:
        """Upload a local file to the configured bucket.

        Args:
            local_path: Path of the file to upload.
            bos_key: Object key; generated from the file name when ``None``.
            content_type: MIME type; derived from the extension when ``None``.

        Returns:
            A dict with ``bos_key``, ``bucket``, ``file_size``,
            ``content_type``, ``upload_time``, ``etag`` and ``url``.

        Raises:
            FileNotFoundError: If ``local_path`` does not exist.
            Exception: Any SDK error, re-raised after logging.
        """
        try:
            if not os.path.exists(local_path):
                raise FileNotFoundError(f"文件不存在: {local_path}")
            if bos_key is None:
                bos_key = self._generate_bos_key(local_path)
            if content_type is None:
                content_type = self._detect_content_type(local_path)
            # Stat once, before the upload, so the reported size describes
            # the file that was handed to the SDK.
            file_size = os.stat(local_path).st_size
            response = self.client.put_object_from_file(
                self.bucket,
                bos_key,
                local_path,
                content_type=content_type,
            )
            # A response without metadata (or metadata without an etag) must
            # not turn a successful upload into an error.
            response_meta = getattr(response, 'metadata', None)
            result = {
                "bos_key": bos_key,
                "bucket": self.bucket,
                "file_size": file_size,
                "content_type": content_type,
                "upload_time": datetime.utcnow().isoformat(),
                "etag": getattr(response_meta, 'etag', None),
                "url": self._object_url(bos_key),
            }
            logger.info(f"✅ 文件上传成功: {bos_key}")
            return result
        except Exception as e:
            logger.error(f"❌ 文件上传失败: {e}")
            raise

    def download_file(self, bos_key: str, local_path: str) -> bool:
        """Download an object to a local file.

        Args:
            bos_key: Object key to fetch.
            local_path: Destination path; missing parent directories are
                created.

        Returns:
            ``True`` on success, ``False`` if anything failed (the error is
            logged, not raised).
        """
        try:
            # os.makedirs("") raises, so only create a parent directory when
            # the destination actually has one (e.g. not for "file.bin").
            parent_dir = os.path.dirname(local_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            # Let the SDK stream the body to disk; this avoids iterating the
            # raw response and leaving it unclosed.
            self.client.get_object_to_file(self.bucket, bos_key, local_path)
            logger.info(f"✅ 文件下载成功: {bos_key} -> {local_path}")
            return True
        except Exception as e:
            logger.error(f"❌ 文件下载失败: {e}")
            return False

    def get_object_metadata(self, bos_key: str) -> Optional[Dict]:
        """Fetch metadata for an object.

        Args:
            bos_key: Object key.

        Returns:
            A dict with ``bos_key``, ``bucket``, ``content_length``,
            ``content_type``, ``etag``, ``last_modified`` and ``url``, or
            ``None`` if the lookup failed (the error is logged).
        """
        try:
            response = self.client.get_object_meta_data(self.bucket, bos_key)
            meta = response.metadata
            return {
                "bos_key": bos_key,
                "bucket": self.bucket,
                "content_length": meta.content_length,
                "content_type": meta.content_type,
                "etag": meta.etag,
                "last_modified": meta.last_modified,
                "url": self._object_url(bos_key),
            }
        except Exception as e:
            logger.error(f"❌ 获取对象元数据失败: {e}")
            return None

    def delete_object(self, bos_key: str) -> bool:
        """Delete an object.

        Args:
            bos_key: Object key.

        Returns:
            ``True`` on success, ``False`` on failure (the error is logged).
        """
        try:
            self.client.delete_object(self.bucket, bos_key)
            logger.info(f"✅ 对象删除成功: {bos_key}")
            return True
        except Exception as e:
            logger.error(f"❌ 对象删除失败: {e}")
            return False

    def list_objects(self, prefix: str = "", max_keys: int = 1000) -> List[Dict]:
        """List objects in the bucket.

        Only a single listing request is made; results beyond ``max_keys``
        are not fetched.

        Args:
            prefix: Only keys starting with this prefix are requested.
            max_keys: Maximum number of entries requested.

        Returns:
            A list of dicts with ``key``, ``size``, ``last_modified``,
            ``etag`` and ``url``; empty on failure (the error is logged).
        """
        try:
            response = self.client.list_objects(
                bucket_name=self.bucket,
                prefix=prefix,
                max_keys=max_keys,
            )
            # Treat a missing or empty `contents` attribute as "no objects".
            entries = getattr(response, 'contents', None) or []
            return [
                {
                    "key": obj.key,
                    "size": obj.size,
                    "last_modified": obj.last_modified,
                    "etag": obj.etag,
                    "url": self._object_url(obj.key),
                }
                for obj in entries
            ]
        except Exception as e:
            logger.error(f"❌ 列出对象失败: {e}")
            return []

    def restore_archive_object(self, bos_key: str, days: int = 1, tier: str = "Standard") -> bool:
        """Request restoration of an archived object.

        Args:
            bos_key: Object key.
            days: Number of days the restored copy should be kept.
            tier: Restore tier (Expedited/Standard/Bulk).

        Returns:
            ``True`` if the request was sent without error, ``False``
            otherwise (the error is logged).
        """
        try:
            # The restore call goes through the dedicated client defined in
            # this module rather than through self.client.
            restore_client = ArchiveRestoreClient(self._build_config())
            restore_client.restore_object(self.bucket, bos_key, days, tier)
            logger.info(f"✅ 归档恢复请求已发送: {bos_key}")
            return True
        except Exception as e:
            logger.error(f"❌ 归档恢复失败: {e}")
            return False

    def _generate_bos_key(self, local_path: str) -> str:
        """Build ``<category>/<UTC timestamp>_<file name>`` for a local file.

        The category is ``images``, ``texts`` or ``files`` depending on the
        file extension. The timestamp has one-second resolution.
        """
        filename = os.path.basename(local_path)
        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        if self._is_image_file(local_path):
            category = "images"
        elif self._is_text_file(local_path):
            category = "texts"
        else:
            category = "files"
        return f"{category}/{timestamp}_{filename}"

    def _detect_content_type(self, file_path: str) -> str:
        """Map the file extension to a MIME type.

        Unknown extensions yield ``application/octet-stream``.
        """
        ext = os.path.splitext(file_path)[1].lower()
        return self._CONTENT_TYPES.get(ext, self._FALLBACK_CONTENT_TYPE)

    def _is_image_file(self, file_path: str) -> bool:
        """Return True if the extension is a known image type."""
        return os.path.splitext(file_path)[1].lower() in self._IMAGE_EXTENSIONS

    def _is_text_file(self, file_path: str) -> bool:
        """Return True if the extension is a known text type."""
        return os.path.splitext(file_path)[1].lower() in self._TEXT_EXTENSIONS
class ArchiveRestoreClient(bce_base_client.BceBaseClient):
    """Small BCE client used only to send the archive-restore request."""

    def __init__(self, config):
        """Store a copy of the SDK default configuration overlaid with ``config``.

        NOTE(review): the base-class ``__init__`` is not called here, exactly
        as in the original code; confirm this is intended for the SDK version
        in use.
        """
        merged = copy.deepcopy(bce_client_configuration.DEFAULT_CONFIG)
        merged.merge_non_none_values(config)
        self.config = merged

    def restore_object(self, bucket: str, key: str, days: int = 1, tier: str = "Standard"):
        """Send ``POST /<bucket>/<key>?restore`` with the restore headers.

        Args:
            bucket: Bucket name.
            key: Object key.
            days: Value sent in the ``x-bce-restore-days`` header.
            tier: Value sent in the ``x-bce-restore-tier`` header.

        Returns:
            Whatever ``self._send_request`` returns.
        """
        request_path = f'/{bucket}/{key}'.encode('utf-8')
        request_headers = {
            b'x-bce-restore-days': str(days).encode('utf-8'),
            b'x-bce-restore-tier': tier.encode('utf-8'),
            b'Accept': b'application/json',
        }
        query = {"restore": ""}
        # The request body is an empty JSON object.
        body = json.dumps({}, ensure_ascii=False).encode('utf-8')
        # NOTE(review): relies on `_send_request` being provided by the base
        # class with this positional order — verify against the installed SDK.
        return self._send_request(b'POST', request_path, request_headers, query, body)
# Module-level shared instance, created lazily by get_bos_manager().
bos_manager = None


def get_bos_manager() -> BaiduBOSManager:
    """Return the shared BaiduBOSManager, constructing it on first use.

    The instance is built with no arguments and cached in the module-level
    ``bos_manager`` variable; later calls return that same object.
    """
    global bos_manager
    if bos_manager is not None:
        return bos_manager
    bos_manager = BaiduBOSManager()
    return bos_manager