first update

This commit is contained in:
skyxz 2025-11-12 00:59:35 +08:00
commit c88bfcf840
264 changed files with 43806 additions and 0 deletions

174
.gitignore vendored Normal file
View File

@ -0,0 +1,174 @@
input/
output/
Temp/
weights/
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

View File

@ -0,0 +1,2 @@
input/*
output/*

40
ACT/act_export/Dockerfile Normal file
View File

@ -0,0 +1,40 @@
# Build image for exporting ACT policies to BPU-ready ONNX (ACT/act_export).
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
# Alternative mirror of the same base image:
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai
# Switch apt to the Tsinghua mirrors for faster builds on CN networks
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
    sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
# Python 3.10 from the deadsnakes PPA, plus shared libraries needed by
# OpenCV (libgl1, libglib, libsm6, libxext6) and video decoding (ffmpeg)
RUN apt-get update --allow-unauthenticated && apt-get install -y \
    software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y \
    python3.10 \
    python3.10-dev \
    python3-pip \
    python3.10-distutils \
    libgl1-mesa-glx \
    libglib2.0-0 \
    wget \
    ffmpeg \
    libsm6 \
    libxext6 \
    && rm -rf /var/lib/apt/lists/*
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
COPY . /app/
# Cache torch hub / torchvision weights inside the image
ENV TORCH_HOME=/app/weights/
RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install --ignore-installed -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
ENTRYPOINT ["python3", "export.py"]

465
ACT/act_export/export.py Normal file
View File

@ -0,0 +1,465 @@
import logging
import os
import sys
import shutil
import cv2
import numpy as np
import torch
import torch.nn as nn
import argparse
import onnx
import json
import yaml
from copy import deepcopy
from termcolor import colored
from onnxsim import simplify
from pprint import pformat
import time
from lerobot.policies.act.modeling_act import ACTPolicy
from lerobot.datasets.factory import make_dataset
from lerobot.utils.utils import get_safe_torch_device, init_logging
from lerobot.configs import parser
from lerobot.configs.train import TrainPipelineConfig
_global_config = None
BPU_VisionEncoder = "BPU_ACTPolicy_VisionEncoder"
BPU_TransformerLayers = "BPU_ACTPolicy_TransformerLayers"
def onnx_sim(onnx_path, onnx_sim):
    """Optionally simplify the ONNX model at ``onnx_path`` in place.

    The second argument is a flag (kept with its historical name): when it is
    falsy the model file is left untouched.
    """
    if not onnx_sim:
        return
    model = onnx.load(onnx_path)
    onnx.checker.check_model(model)
    simplified, ok = simplify(
        model,
        dynamic_input_shape=False,
        input_shapes=None)
    assert ok, 'assert check failed'
    onnx.save(simplified, onnx_path)
def load_config(config_path):
    """Load the task's JSON config and rewrite ``sys.argv`` for LeRobot's parser.

    Translates the ``export`` section of config.json into the ``--dataset.*``
    / ``--policy.*`` CLI flags that the ``@parser.wrap()``-decorated ``main``
    expects, then replaces ``sys.argv`` in place.

    Returns the parsed config dict, or None when no ``export`` section exists
    (in which case ``sys.argv`` is left unchanged).
    """
    # The config is plain JSON regardless of extension
    with open(config_path, 'r', encoding='utf-8') as f:
        config_dict = json.load(f)
    args = []
    if 'export' in config_dict:
        export_cfg = config_dict['export']
        if 'repo_id' in export_cfg:
            args.extend(['--dataset.repo_id', str(export_cfg['repo_id'])])
        if 'dataset_path' in export_cfg:
            args.extend(['--dataset.root', str(export_cfg['dataset_path'])])
        args.extend(['--policy.type', 'act'])
        # Any 'gpu_id' key at the top level selects CUDA; otherwise CPU
        args.extend(['--policy.device', 'cpu' if 'gpu_id' not in config_dict else f"cuda"])
        args.extend(['--policy.repo_id', str(export_cfg['repo_id'])])
        # Use pyav as the video backend to avoid torchcodec's FFmpeg
        # requirement (the original comment said "opencv" but the value set
        # here is 'pyav')
        args.extend(['--dataset.video_backend', 'pyav'])
        args.extend(['--wandb.enable', 'false'])
        # Keep the original script name as sys.argv[0], then append the flags
        sys.argv = [sys.argv[0]] + args
        logging.info(f"Loaded config from {config_path}")
        logging.info(f"Config: {sys.argv}")
        return config_dict
    return None
class BPU_ACTPolicy_VisionEncoder(nn.Module):
    """Standalone vision encoder extracted from an ACT policy for BPU export.

    Wraps deep copies of the policy's image backbone and the feature
    projection layer so the encoder can be traced/exported to ONNX
    independently of the full policy.
    """

    def __init__(self, act_policy):
        super().__init__()
        # deepcopy so ONNX export/tracing cannot mutate the original policy
        self.backbone = deepcopy(act_policy.model.backbone)
        self.encoder_img_feat_input_proj = deepcopy(act_policy.model.encoder_img_feat_input_proj)

    def forward(self, images):
        """Return the projected camera feature map for a batch of images."""
        cam_features = self.backbone(images)["feature_map"]
        # Fix: removed the redundant no-op self-assignment the original had
        # between projection and return.
        return self.encoder_img_feat_input_proj(cam_features)
class BPU_ACTPolicy_TransformerLayers(nn.Module):
    """Transformer encoder/decoder portion of an ACT policy, made ONNX-exportable.

    Consumes the robot state plus pre-computed per-camera vision features and
    returns the predicted action chunk.  The VAE latent is fixed to zeros,
    matching ACT's inference path where no latent is sampled.
    """

    def __init__(self, act_policy, camera_names):
        super().__init__()
        # deepcopy so export-time tracing cannot mutate the original policy
        self.model = deepcopy(act_policy.model)
        self.camera_names = camera_names

    def forward(self, states, *vision_features):
        """Return actions for one state vector and one feature map per camera."""
        # Zero latent: inference behaviour of ACT when the VAE latent is unused
        latent_sample = torch.zeros([1, self.model.config.latent_dim], dtype=torch.float32)
        encoder_in_tokens = [self.model.encoder_latent_input_proj(latent_sample)]
        encoder_in_pos_embed = self.model.encoder_1d_feature_pos_embed.weight.unsqueeze(1).unbind(dim=0)
        encoder_in_tokens.append(self.model.encoder_robot_state_input_proj(states))
        all_cam_features = []
        all_cam_pos_embeds = []
        # Handle the vision features of every camera dynamically
        for vision_feature in vision_features:
            cam_pos_embed = self.model.encoder_cam_feat_pos_embed(vision_feature)
            all_cam_features.append(vision_feature)
            all_cam_pos_embeds.append(cam_pos_embed)
        # Reshape the scalar tokens (latent, state) to (1, 1, dim_model) so
        # they concatenate with the flattened image tokens below
        tokens = []
        for token in encoder_in_tokens:
            tokens.append(token.view(1,1,self.model.config.dim_model))
        # Flatten the concatenated camera maps into a (seq, 1, dim_model) run
        all_cam_features = torch.cat(all_cam_features, axis=-1).permute(2, 3, 0, 1).view(-1,1,self.model.config.dim_model)
        tokens.append(all_cam_features)
        encoder_in_tokens = torch.cat(tokens, axis=0)
        # Mirror the same layout for the positional embeddings
        pos_embeds = []
        for pos_embed in encoder_in_pos_embed:
            pos_embeds.append(pos_embed.view(1,1,self.model.config.dim_model))
        all_cam_pos_embeds = torch.cat(all_cam_pos_embeds, axis=-1).permute(2, 3, 0, 1).view(-1,1,self.model.config.dim_model)
        pos_embeds.append(all_cam_pos_embeds)
        encoder_in_pos_embed = torch.cat(pos_embeds, axis=0)
        encoder_out = self.model.encoder(encoder_in_tokens, pos_embed=encoder_in_pos_embed)
        # Decoder queries start as zeros; positions come from learned embeddings
        decoder_in = torch.zeros(
            (self.model.config.chunk_size, 1, self.model.config.dim_model),
            dtype=encoder_in_pos_embed.dtype,
            device=encoder_in_pos_embed.device,
        )
        decoder_out = self.model.decoder(
            decoder_in,
            encoder_out,
            encoder_pos_embed=encoder_in_pos_embed,
            decoder_pos_embed=self.model.decoder_pos_embed.weight.unsqueeze(1),
        )
        # (seq, batch, dim) -> (batch, seq, dim) before the action head
        decoder_out = decoder_out.transpose(0, 1)
        actions = self.model.action_head(decoder_out)
        return actions
def lerobotTensor2cvmat(tensor):
    """Convert a LeRobot image tensor to a uint8 BGR OpenCV image.

    Expects a float tensor laid out (batch, C, H, W) and uses the first batch
    element.  Values are scaled by 255 -- presumably the input is normalized
    to [0, 1]; confirm against the dataset loader.
    """
    frames = (tensor * 255).permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8)
    rgb_frame = frames[0, :, :, :]
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
def onnx_sim(onnx_path, onnx_sim):
    # NOTE(review): this is a byte-identical duplicate of the onnx_sim defined
    # earlier in this file; this later definition shadows the first one.
    # Consider deleting one of the two.
    """Optionally simplify the ONNX model at ``onnx_path`` in place."""
    if onnx_sim:
        model_onnx = onnx.load(onnx_path)  # load onnx model
        onnx.checker.check_model(model_onnx)  # check onnx model
        model_onnx, check = simplify(
            model_onnx,
            dynamic_input_shape=False,
            input_shapes=None)
        assert check, 'assert check failed'
        onnx.save(model_onnx, onnx_path)
@parser.wrap()
def main(cfg: TrainPipelineConfig):
    """Export an ACT policy to two BPU-ready ONNX models plus calibration data.

    The policy is split into a vision encoder and the transformer layers, each
    exported to its own ONNX file.  Calibration samples are then dumped from
    the dataset in a march-specific format: ``.npy`` files for "nash" targets,
    raw binary ``.nchw`` dumps for "bayes" targets.  Requires the module-level
    ``_global_config`` to be populated by ``load_config`` before being called.
    """
    # cfg.validate() is intentionally skipped: this config drives an export
    # run, not a training run.
    # cfg.validate()
    logging.info(pformat(cfg.to_dict()))
    # BPU export parameters - read from the global config or the command line
    global _global_config
    class BPUOptions:
        # NOTE(review): these class attributes dereference _global_config at
        # class-creation time, so main() crashes right here if the config is
        # None -- the `if _global_config:` guard below can never be False.
        act_path = _global_config['export']['model_path']
        export_path = _global_config['export']['output_path'] + "/" + _global_config['task_id']
        cal_num = _global_config['export']['calibration_num']
        onnx_sim = True
        combine_jobs = 6
    opt = BPUOptions()
    if _global_config:
        opt.act_path = _global_config['export']['model_path']
        opt.export_path = _global_config['export']['output_path'] + "/" + _global_config['task_id']
        opt.cal_num = _global_config['export']['calibration_num']
        opt.onnx_sim = True
        # march string (contains "nash" or "bayes") selects the ONNX opset
        # and the calibration-dump format below
        opt.march = _global_config['export']['march']
        opt.combine_jobs = 6
        logging.info("BPU parameters loaded from config file")
    logging.info("="*80)
    logging.info(colored("BPU Export Configuration:", 'light_cyan'))
    logging.info(f" ACT Model Path: {opt.act_path}")
    logging.info(f" Export Path: {opt.export_path}")
    logging.info(f" Calibration Samples: {opt.cal_num}")
    logging.info(f" ONNX Simplify: {opt.onnx_sim}")
    logging.info(f" March: {opt.march}")
    logging.info(f" Compiler Jobs: {opt.combine_jobs}")
    logging.info(f" Dataset Root: {cfg.dataset.root}")
    logging.info("="*80)
    if not os.path.exists(opt.export_path):
        os.makedirs(opt.export_path)
    # Per-model workspaces and ONNX output paths
    visionEncoder_ws = os.path.join(opt.export_path, BPU_VisionEncoder)
    transformersLayers_ws = os.path.join(opt.export_path, BPU_TransformerLayers)
    onnx_name_BPU_ACTPolicy_VisionEncoder = BPU_VisionEncoder + ".onnx"
    onnx_path_BPU_ACTPolicy_VisionEncoder = os.path.join(visionEncoder_ws, onnx_name_BPU_ACTPolicy_VisionEncoder)
    onnx_name_BPU_ACTPolicy_TransformerLayers = BPU_TransformerLayers + ".onnx"
    onnx_path_BPU_ACTPolicy_TransformerLayers = os.path.join(transformersLayers_ws, onnx_name_BPU_ACTPolicy_TransformerLayers)
    ## Calibration data paths for the export
    calbrate_data_name_BPU_ACTPolicy_VisionEncoder = "calibration_data_" + BPU_VisionEncoder
    calbrate_data_path_BPU_ACTPolicy_VisionEncoder = os.path.join(visionEncoder_ws, calbrate_data_name_BPU_ACTPolicy_VisionEncoder)
    calbrate_data_name_BPU_ACTPolicy_TransformerLayers = "calibration_data_" + BPU_TransformerLayers
    calbrate_data_path_BPU_ACTPolicy_TransformerLayers = os.path.join(transformersLayers_ws, calbrate_data_name_BPU_ACTPolicy_TransformerLayers)
    state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, "state")
    ## Script paths inside the published output folder
    bpu_output_name = "bpu_output"
    bpu_output_path = os.path.join(opt.export_path, bpu_output_name)
    bash_build_all_path = os.path.join(opt.export_path, "build_all.sh")
    ## Pre/post-processing parameter file paths
    action_std_path = os.path.join(bpu_output_path, "action_std.npy")
    action_mean_path = os.path.join(bpu_output_path, "action_mean.npy")
    action_std_unnormalize_path = os.path.join(bpu_output_path, "action_std_unnormalize.npy")
    action_mean_unnormalize_path = os.path.join(bpu_output_path, "action_mean_unnormalize.npy")
    ## Create the working directories
    os.makedirs(visionEncoder_ws, exist_ok=True)
    logging.info(colored(f"mkdir: {visionEncoder_ws} Success.", 'green'))
    os.makedirs(transformersLayers_ws, exist_ok=True)
    logging.info(colored(f"mkdir: {transformersLayers_ws} Success.", 'green'))
    os.makedirs(calbrate_data_path_BPU_ACTPolicy_VisionEncoder, exist_ok=True)
    logging.info(colored(f"mkdir: {calbrate_data_path_BPU_ACTPolicy_VisionEncoder} Success.", 'green'))
    os.makedirs(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, exist_ok=True)
    logging.info(colored(f"mkdir: {calbrate_data_path_BPU_ACTPolicy_TransformerLayers} Success.", 'green'))
    os.makedirs(state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers, exist_ok=True)
    logging.info(colored(f"mkdir: {state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers} Success.", 'green'))
    os.makedirs(bpu_output_path, exist_ok=True)
    logging.info(colored(f"mkdir: {bpu_output_path} Success.", 'green'))
    # Load the policy on CPU in eval mode for deterministic export
    policy = ACTPolicy.from_pretrained(opt.act_path).cpu().eval()
    logging.info(colored(f"Load ACT Policy Model: {opt.act_path} Success.", 'light_red'))
    device = get_safe_torch_device(cfg.policy.device, log=True)
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    # Load the dataset used for calibration samples
    dataset = make_dataset(cfg)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        num_workers=0,
        batch_size=1,
        shuffle=True,
        sampler=None,
        pin_memory=device.type != "cpu",
        drop_last=False,
    )
    logging.info(colored(f"Load ACT Policy Dataset: \n{dataset} Success.", 'light_red'))
    batch = next(iter(dataloader))
    # Discover camera names from the observation keys of the first batch
    image_keys = [key for key in batch.keys() if key.startswith('observation.images.')]
    camera_names = [key.split('.')[-1] for key in image_keys]
    logging.info(colored(f"Camera Names: {camera_names} Success.", 'light_red'))
    logging.info(colored(f"Image Keys: {image_keys} Success.", 'light_red'))
    logging.info(colored(f"Batch: {batch} Success.", 'light_red'))
    # Sanity-check a forward pass through the original policy
    outputs = policy.select_action(deepcopy(batch))
    ## Dynamically extract pre/post-processing parameters
    # Save the normalization parameters of every camera
    for camera_name in camera_names:
        buffer_name = f"buffer_observation_images_{camera_name}"
        if hasattr(policy.normalize_inputs, buffer_name):
            buffer = getattr(policy.normalize_inputs, buffer_name)
            camera_std = buffer.std.data.detach().cpu().numpy()
            camera_mean = buffer.mean.data.detach().cpu().numpy()
            camera_std_path = os.path.join(bpu_output_path, f"{camera_name}_std.npy")
            camera_mean_path = os.path.join(bpu_output_path, f"{camera_name}_mean.npy")
            np.save(camera_std_path, camera_std)
            np.save(camera_mean_path, camera_mean)
            logging.info(f"Saved {camera_name} normalization parameters")
    # Save the state/action normalization parameters
    action_std = policy.normalize_inputs.buffer_observation_state.std.data.detach().cpu().numpy()
    action_mean = policy.normalize_inputs.buffer_observation_state.mean.data.detach().cpu().numpy()
    action_std_unnormalize = policy.unnormalize_outputs.buffer_action.std.data.detach().cpu().numpy()
    action_mean_unnormalize = policy.unnormalize_outputs.buffer_action.mean.data.detach().cpu().numpy()
    np.save(action_std_path, action_std)
    np.save(action_mean_path, action_mean)
    np.save(action_std_unnormalize_path, action_std_unnormalize)
    np.save(action_mean_unnormalize_path, action_mean_unnormalize)
    ## Vision Encoder
    batch = policy.normalize_inputs(batch)
    m_VisionEncoder = BPU_ACTPolicy_VisionEncoder(policy)
    m_VisionEncoder.eval()
    # Run the encoder once per camera to obtain example feature tensors
    vision_features = []
    for camera_name in camera_names:
        input_tensor = batch[f'observation.images.{camera_name}']
        vision_feature = m_VisionEncoder(input_tensor)
        vision_features.append(vision_feature)
        logging.info(f"Generated vision features for {camera_name}: {vision_feature.shape}")
    # Pick the ONNX opset per target march
    opset_version = 11 if "bayes" in opt.march else 19
    logging.info(f"Using ONNX opset version: {opset_version} for type: {opt.march}")
    onnx_path = onnx_path_BPU_ACTPolicy_VisionEncoder
    # NOTE(review): input_tensor is the LAST camera's tensor from the loop
    # above; all cameras are assumed to share one image shape -- TODO confirm.
    torch.onnx.export(
        m_VisionEncoder,  # model to convert
        input_tensor,  # example model input
        onnx_path,  # output file name
        export_params=True,  # store the trained parameters
        opset_version=opset_version,  # march-dependent ONNX opset
        do_constant_folding=True,  # run constant-folding optimization
        input_names=['images'],  # input node name
        output_names=['Vision_Features'],  # output node name
        dynamic_axes=None
    )
    onnx_sim(onnx_path, opt.onnx_sim)
    logging.info(colored(f"Export {onnx_path} Success.", 'green'))
    m_TransformerLayers = BPU_ACTPolicy_TransformerLayers(policy, camera_names)
    m_TransformerLayers.eval()
    state = batch["observation.state"]
    actions = m_TransformerLayers(state, *vision_features)
    # np.save(f"new_actions.npy", actions.detach().cpu().numpy())
    input_names = ['states'] + [f'{camera_name}_features' for camera_name in camera_names]
    logging.info(f"Transformer input names: {input_names}")
    onnx_path = onnx_path_BPU_ACTPolicy_TransformerLayers
    torch.onnx.export(
        m_TransformerLayers,  # model to convert
        (state, *vision_features),  # example model inputs
        onnx_path,  # output file name
        export_params=True,  # store the trained parameters
        opset_version=opset_version,  # march-dependent ONNX opset
        do_constant_folding=True,  # run constant-folding optimization
        input_names=input_names,  # dynamic input node names
        output_names=['Actions'],  # output node name
        dynamic_axes=None
    )
    onnx_sim(onnx_path, opt.onnx_sim)
    logging.info(colored(f"Export {onnx_path} Success.", 'green'))
    if "nash" in opt.march:
        ## Calibration data: create one directory per transformer input
        input_names_TransformerLayers = camera_names + ["state"]
        input_cal_path = []
        for input_name in input_names_TransformerLayers:
            p = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, input_name)
            input_cal_path.append(p)
            os.makedirs(p, exist_ok=True)
            logging.info(colored(f"mkdir: {p} Success.", 'green'))
        for i, batch in enumerate(dataloader):
            name = "%.10d.npy"%i
            batch = policy.normalize_inputs(batch)
            # Collect every camera input of this sample
            camera_inputs = {}
            for camera_name in camera_names:
                camera_inputs[camera_name] = batch[f'observation.images.{camera_name}']
            state_input = batch["observation.state"]
            ## VisionEncoder: save every 4th sample's raw camera tensors
            if i%4 == 0:
                for camera_name in camera_names:
                    p = os.path.join(calbrate_data_path_BPU_ACTPolicy_VisionEncoder, f"{camera_name}_" + name)
                    np.save(p, camera_inputs[camera_name].detach().cpu().numpy())
                    logging.info(colored(f"save to: {p}", 'light_blue'))
            ## TransformerLayers: save encoder features per camera plus the state
            for camera_name in camera_names:
                vision_feature = m_VisionEncoder(camera_inputs[camera_name])
                camera_cal_path = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, camera_name)
                p = os.path.join(camera_cal_path, name)
                np.save(p, vision_feature.detach().cpu().numpy())
                logging.info(colored(f"save to: {p}", 'light_magenta'))
            p = os.path.join(state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers, name)
            np.save(p, state_input.detach().cpu().numpy())
            logging.info(colored(f"save to: {p}", 'light_magenta'))
            if i >= opt.cal_num:
                break
    if "bayes" in opt.march:
        ## Calibration data: create one directory per transformer input
        input_names_TransformerLayers = camera_names + ["state"]
        input_cal_path = []
        for input_name in input_names_TransformerLayers:
            p = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, input_name)
            input_cal_path.append(p)
            os.makedirs(p, exist_ok=True)
            logging.info(colored(f"mkdir: {p} Success.", 'green'))
        for i, batch in enumerate(dataloader):
            # The bayes toolchain consumes raw binary dumps (.nchw), not .npy
            name = "%.10d.nchw"%i
            batch = policy.normalize_inputs(batch)
            # Collect every camera input of this sample
            camera_inputs = {}
            for camera_name in camera_names:
                camera_inputs[camera_name] = batch[f'observation.images.{camera_name}']
            state_input = batch["observation.state"]
            ## VisionEncoder: save every 4th sample's raw camera tensors (Bayes format)
            if i%4 == 0:
                for camera_name in camera_names:
                    p = os.path.join(calbrate_data_path_BPU_ACTPolicy_VisionEncoder, f"{camera_name}_" + name)
                    camera_inputs[camera_name].detach().cpu().numpy().tofile(p)
                    logging.info(colored(f"save to: {p}", 'light_blue'))
            ## TransformerLayers: save encoder features per camera plus the state (Bayes format)
            for camera_name in camera_names:
                vision_feature = m_VisionEncoder(camera_inputs[camera_name])
                camera_cal_path = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, camera_name)
                p = os.path.join(camera_cal_path, name)
                vision_feature.detach().cpu().numpy().tofile(p)
                logging.info(colored(f"save to: {p}", 'light_magenta'))
            p = os.path.join(state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers, name)
            state_input.detach().cpu().numpy().tofile(p)
            logging.info(colored(f"save to: {p}", 'light_magenta'))
            if i >= opt.cal_num:
                break
def generate_output_config(time_cost):
    """Write ``output.json`` into the export folder describing the artifacts.

    Records the task name, target march, wall-clock export duration, and the
    paths of the two exported ONNX models and their calibration-data folders.
    Requires the module-level ``_global_config`` to be populated.
    """
    global _global_config
    export_path = _global_config['export']['output_path'] + "/" + _global_config['task_id']
    # Per-model workspace directories mirror the layout created by main()
    transformer_dir = export_path + "/" + BPU_TransformerLayers
    vision_dir = export_path + "/" + BPU_VisionEncoder
    output_config = {
        "task_name": _global_config['task_id'],
        "march": _global_config['export']['march'],
        "time_cost": time_cost,
        "export_path": export_path,
        "TransformerLayers": transformer_dir + "/" + BPU_TransformerLayers + ".onnx",
        "TransformerLayers_calibration_data": transformer_dir + "/" + "calibration_data_" + BPU_TransformerLayers,
        "VisionEncoder": vision_dir + "/" + BPU_VisionEncoder + ".onnx",
        "VisionEncoder_calibration_data": vision_dir + "/" + "calibration_data_" + BPU_VisionEncoder,
    }
    with open(os.path.join(export_path, "output.json"), "w") as fh:
        json.dump(output_config, fh)
if __name__ == "__main__":
    init_logging()
    # Config is always read from the mounted input/ folder (see Dockerfile)
    config_path = "input/config.json"
    _global_config = load_config(config_path)
    # Time the whole export so it can be recorded in output.json
    time_start = time.time()
    main()
    time_end = time.time()
    time_cost = time_end - time_start
    logging.info(colored(f"Time Cost: {time_cost} seconds", 'light_red'))
    generate_output_config(time_cost)

View File

@ -0,0 +1,5 @@
lerobot==0.3.3
onnx
onnxsim
onnxruntime
av

View File

17
ACT/act_quant/Dockerfile Normal file
View File

@ -0,0 +1,17 @@
# Quantization image for ACT (ACT/act_quant): wraps the vendor AI toolchain.
# ARG BASE_IMAGE=ccr-29eug8s3-pub.cnc.bj.baidubce.com/deliver/ai_toolchain_ubuntu_20_x5_cpu:v1.2.8
ARG BASE_IMAGE=ccr-29eug8s3-pub.cnc.bj.baidubce.com/aitools/ai_toolchain_ubuntu_22_j6_gpu:v3.3.0
# The base image can be replaced via --build-arg BASE_IMAGE=...
FROM ${BASE_IMAGE}
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai
# Switch apt to the Tsinghua mirrors for faster builds on CN networks
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
    sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
COPY . /app/
ENTRYPOINT ["bash", "convert.sh"]

24
ACT/act_quant/convert.sh Normal file
View File

@ -0,0 +1,24 @@
# Compile the exported ONNX models into BPU binaries using the march-specific
# toolchain (hb_compile for nash targets, hb_mapper for bayes targets).
CONFIG=input/config.json
TASKID=$(python3 read_json.py $CONFIG task_id)
MARCH=$(python3 read_json.py $CONFIG quant.march)
OUTPUT=output/$TASKID
# Generate the resolved PTQ YAML files under $OUTPUT/ptq_yaml
python3 load_config.py $CONFIG
echo "Convert PTQ YAML Has Been Prepared"
VISIONENCODER_YAML=$OUTPUT/ptq_yaml/VisionEncoder.yaml
TRANSFORMERLAYERS_YAML=$OUTPUT/ptq_yaml/TransformerLayers.yaml
if [[ "$MARCH" == *"nash"* ]]; then
    echo -e "\033[44;37m===== Start Compiling TRANSFORMERLAYERS =====\033[0m"
    hb_compile --config $TRANSFORMERLAYERS_YAML
    # Fix: this banner previously said TRANSFORMERLAYERS again even though the
    # step below compiles the VisionEncoder.
    echo -e "\033[44;37m===== Start Compiling VISIONENCODER =====\033[0m"
    hb_compile --config $VISIONENCODER_YAML
    echo -e "\033[44;37m===== End Compiling Nash Model =====\033[0m"
else
    echo -e "\033[44;37m===== Start Compiling TRANSFORMERLAYERS =====\033[0m"
    hb_mapper makertbin --model-type onnx --config $TRANSFORMERLAYERS_YAML
    echo -e "\033[44;37m===== Start Compiling VISIONENCODER =====\033[0m"
    hb_mapper makertbin --model-type onnx --config $VISIONENCODER_YAML
    echo -e "\033[44;37m===== End Compiling Bayes Model =====\033[0m"
fi

View File

@ -0,0 +1,76 @@
import json
import yaml
import sys
import os
def load_config(config_path):
    """Prepare the PTQ YAML files for the BPU compiler from the task config.

    Reads the ``quant`` section of the config, fills the march-specific YAML
    templates for the VisionEncoder and TransformerLayers models, and writes
    the resolved YAMLs to ``<output_path>/<task_id>/ptq_yaml/``.

    NOTE(review): returns None implicitly; the ``__main__`` caller ignores the
    result.
    """
    with open(config_path, "r") as file:
        # JSON is a subset of YAML, so safe_load also parses config.json
        config = yaml.safe_load(file)
    if "quant" in config:
        quant_info = config["quant"]
        if "output_path" in quant_info:
            output_path = config["task_id"] and os.path.join(quant_info["output_path"], config["task_id"])
        if "output_path" in quant_info:
            output_path = os.path.join(quant_info["output_path"], config["task_id"])
        if "march" in quant_info:
            # Normalize the march string to the template folder name
            march = "nash" if "nash" in quant_info["march"] else "bayes"
        # NOTE(review): the template dir is spelled "pyq_yaml" while the output
        # dir below is "ptq_yaml" -- confirm the source directory name is
        # intended and not a typo.
        convert_yaml_path = f"pyq_yaml/{march}/"
        # prepare the nash and bayes bpu
        ## first prepare the VisionEncoder yaml
        VisionEncoder_yaml_path = os.path.join(convert_yaml_path, "VisionEncoder.yaml")
        with open(VisionEncoder_yaml_path, "r") as file:
            VisionEncoder_yaml = yaml.safe_load(file)
        VisionEncoder_yaml["model_parameters"]["onnx_model"] = quant_info["VisionEncoder"]["onnx_model"]
        VisionEncoder_yaml["calibration_parameters"]["cal_data_dir"] = quant_info["VisionEncoder"]["calibration_data"]
        VisionEncoder_yaml["model_parameters"]["march"] = quant_info["march"]
        # Make sure output ptq_yaml directory exists
        output_ptq_yaml_dir = os.path.join(output_path, "ptq_yaml")
        os.makedirs(output_ptq_yaml_dir, exist_ok=True)
        # Save VisionEncoder yaml to output/ptq_yaml
        VisionEncoder_yaml_save_path = os.path.join(output_ptq_yaml_dir, "VisionEncoder.yaml")
        with open(VisionEncoder_yaml_save_path, "w") as file:
            yaml.safe_dump(VisionEncoder_yaml, file)
        ## second prepare the TransformerLayers yaml
        TransformerLayers_yaml_path = os.path.join(convert_yaml_path, "TransformerLayers.yaml")
        with open(TransformerLayers_yaml_path, "r") as file:
            TransformerLayers_yaml = yaml.safe_load(file)
        TransformerLayers_yaml["model_parameters"]["onnx_model"] = quant_info["TransformerLayers"]["onnx_model"]
        TransformerLayers_yaml["model_parameters"]["march"] = quant_info["march"]
        TransformerLayers_Cal_dir = quant_info["TransformerLayers"]["calibration_data"]
        # (Fix cal_data_dir variable)
        cal_data_dir = TransformerLayers_Cal_dir
        # One calibration sub-directory exists per transformer input
        sub_dirs = [d for d in os.listdir(cal_data_dir) if os.path.isdir(os.path.join(cal_data_dir, d))]
        input_names = []
        for name in sub_dirs:
            if name == "state":
                input_names.append("states")
            else:
                input_names.append(f"{name}_features")
        # The toolchain expects ';'-separated lists with a trailing separator
        input_name_str = ";".join(input_names) + ";"
        TransformerLayers_yaml["input_parameters"]["input_name"] = input_name_str
        TransformerLayers_yaml["input_parameters"]["input_type_rt"] = "featuremap;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["input_layout_rt"] = "NCHW;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["input_type_train"] = "featuremap;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["input_layout_train"] = "NCHW;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["norm_type"] = "no_preprocess;" * len(input_names)
        TransformerLayers_yaml["calibration_parameters"]["cal_data_dir"] = ";".join([os.path.join(TransformerLayers_Cal_dir, name) for name in sub_dirs]) + ";"
        TransformerLayers_yaml["calibration_parameters"]["cal_data_type"] = "float32;" * len(input_names)
        # Save TransformerLayers yaml to output/ptq_yaml
        TransformerLayers_yaml_save_path = os.path.join(output_ptq_yaml_dir, "TransformerLayers.yaml")
        with open(TransformerLayers_yaml_save_path, "w") as file:
            yaml.safe_dump(TransformerLayers_yaml, file)
if __name__ == "__main__":
    # Usage: python load_config.py <config.json>
    config_path = sys.argv[1]
    config = load_config(config_path)  # NOTE(review): load_config returns None; result unused

View File

@ -0,0 +1,23 @@
# PTQ config template for the BPU_TransformerLayers model.
# The '{...}' values are placeholders: load_config.py loads this template and
# overwrites onnx_model, march and all input/calibration fields before the
# toolchain compiles the model.
model_parameters:
  onnx_model: '{onnx_name_BPU_ACTPolicy_TransformerLayers}'
  march: "{opt.type}"
  layer_out_dump: False
  working_dir: 'bpu_model_output'
  output_model_file_prefix: 'BPU_TransformerLayers'
input_parameters:
  input_name: "{input_name_str}"
  input_type_rt: '{input_type_str}'
  input_layout_rt: '{nchw_str}'
  input_type_train: '{input_type_str}'
  input_layout_train: '{nchw_str}'
  norm_type: '{norm_type_str}'
calibration_parameters:
  cal_data_dir: '{cal_data_dir_str}'
  cal_data_type: '{cal_data_type_str}'
  calibration_type: 'default'
  # Keep all nodes (incl. Softmax in/out) quantized at int16 for accuracy
  optimization: set_all_nodes_int16;set_Softmax_input_int16;set_Softmax_output_int16;
compiler_parameters:
  jobs: 6
  compile_mode: 'latency'
  debug: False
  optimize_level: 'O3'

View File

@ -0,0 +1,23 @@
# PTQ config template for the BPU_VisionEncoder model.
# NOTE(review): unlike the sibling templates, onnx_model and cal_data_dir here
# are plain strings without '{...}' braces; load_config.py overwrites both
# fields anyway, so this only matters if the template is ever used directly --
# confirm the inconsistency is intentional.
model_parameters:
  onnx_model: 'onnx_name_BPU_ACTPolicy_VisionEncoder'
  march: "opt.type"
  layer_out_dump: False
  working_dir: 'bpu_model_output'
  output_model_file_prefix: 'BPU_VisionEncoder'
input_parameters:
  # Empty input_name keeps the ONNX model's own input name
  input_name: ""
  input_type_rt: 'featuremap'
  input_layout_rt: 'NCHW'
  input_type_train: 'featuremap'
  input_layout_train: 'NCHW'
  norm_type: 'no_preprocess'
calibration_parameters:
  cal_data_dir: 'calbrate_data_name_BPU_ACTPolicy_VisionEncoder'
  cal_data_type: 'float32'
  calibration_type: 'default'
  # Keep all nodes (incl. Softmax in/out) quantized at int16 for accuracy
  optimization: set_all_nodes_int16;set_Softmax_input_int16;set_Softmax_output_int16;
compiler_parameters:
  jobs: 6
  compile_mode: 'latency'
  # NOTE(review): debug is true here but False in the TransformerLayers
  # template -- confirm this asymmetry is intended.
  debug: true
  optimize_level: 'O3'

View File

@ -0,0 +1,24 @@
# PTQ config template for the BPU_TransformerLayers model (O2 variant).
# The '{...}' values are placeholders: load_config.py loads this template and
# overwrites onnx_model, march and all input/calibration fields before the
# toolchain compiles the model.
model_parameters:
  onnx_model: '{onnx_name_BPU_ACTPolicy_TransformerLayers}'
  march: "{opt.type}"
  layer_out_dump: False
  working_dir: 'bpu_model_output'
  output_model_file_prefix: 'BPU_TransformerLayers'
input_parameters:
  input_name: "{input_name_str}"
  input_type_rt: '{input_type_str}'
  input_layout_rt: '{nchw_str}'
  input_type_train: '{input_type_str}'
  input_layout_train: '{nchw_str}'
  norm_type: '{norm_type_str}'
calibration_parameters:
  cal_data_dir: '{cal_data_dir_str}'
  cal_data_type: '{cal_data_type_str}'
  calibration_type: 'default'
  optimization: set_all_nodes_int16
compiler_parameters:
  # Skip input/output padding so buffers map 1:1 to the ONNX tensor layout
  extra_params: {'input_no_padding': True, 'output_no_padding': True}
  jobs: 6
  compile_mode: 'latency'
  debug: False
  optimize_level: 'O2'

View File

@ -0,0 +1,24 @@
# PTQ config template for the BPU_VisionEncoder model (O2 variant).
# The '{...}' values are placeholders: load_config.py loads this template and
# overwrites onnx_model, march and cal_data_dir before compilation.
model_parameters:
  onnx_model: '{onnx_name_BPU_ACTPolicy_VisionEncoder}'
  march: "{opt.type}"
  layer_out_dump: False
  working_dir: 'bpu_model_output'
  output_model_file_prefix: 'BPU_VisionEncoder'
input_parameters:
  # Empty input_name keeps the ONNX model's own input name
  input_name: ""
  input_type_rt: 'featuremap'
  input_layout_rt: 'NCHW'
  input_type_train: 'featuremap'
  input_layout_train: 'NCHW'
  norm_type: 'no_preprocess'
calibration_parameters:
  cal_data_dir: '{calbrate_data_name_BPU_ACTPolicy_VisionEncoder}'
  cal_data_type: 'float32'
  calibration_type: 'default'
  optimization: set_all_nodes_int16
compiler_parameters:
  # Skip input/output padding so buffers map 1:1 to the ONNX tensor layout
  extra_params: {'input_no_padding': True, 'output_no_padding': True}
  jobs: 6
  compile_mode: 'latency'
  # NOTE(review): debug is true here but False in the TransformerLayers
  # template -- confirm this asymmetry is intended.
  debug: true
  optimize_level: 'O2'

View File

@ -0,0 +1,42 @@
import json
import sys
def read_config(config_file, key_path):
    """Look up a (possibly nested) value in a JSON config file.

    Args:
        config_file: Path to the JSON config file.
        key_path: Dot-separated path to the key, e.g.
            "evaluation.checkpoint_path".

    Returns:
        The value found at the key path, or None when any intermediate node
        is missing or is not an object.
    """
    with open(config_file, 'r') as fh:
        node = json.load(fh)
    # Walk the dotted path one segment at a time
    for part in key_path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(part)
    return node
if __name__ == "__main__":
    # CLI: read_json.py <config_file> <key_path>
    # Prints the value on stdout, or exits 1 when the key is missing so shell
    # callers can detect the failure via the exit status.
    if len(sys.argv) < 3:
        print("Usage: python read_config.py <config_file> <key_path>", file=sys.stderr)
        sys.exit(1)
    config_file = sys.argv[1]
    key_path = sys.argv[2]
    value = read_config(config_file, key_path)
    if value is not None:
        print(value)
    else:
        # Empty stderr line plus nonzero exit signals "key not found"
        print("", file=sys.stderr)
        sys.exit(1)

2
RDT/README.md Normal file
View File

@ -0,0 +1,2 @@
# d-robotics-rdt

View File

@ -0,0 +1,2 @@
input/*
output/*

View File

@ -0,0 +1,43 @@
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
RUN apt-get update --allow-unauthenticated && apt-get install -y \
software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update \
&& apt-get install -y \
python3.10 \
python3.10-dev \
python3-pip \
python3.10-distutils \
libgl1-mesa-glx \
libglib2.0-0 \
wget \
libsm6 \
libxext6 \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
COPY . /app/
RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
# RUN pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
RUN pip install torch==2.1.0 torchvision==0.16.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install packaging==24.0
# RUN mkdir -p /app/dataset/input /app/dataset/output
ENTRYPOINT ["bash", "convert.sh"]

View File

@ -0,0 +1,60 @@
# convert.sh — read settings from input/config.json, run the LeRobot->RDT
# conversion, and write a result manifest on success.
BEGIN_TIME=$(date +%s)

CONFIG_FILE="input/config.json"
echo "CONFIG_FILE_PATH: $CONFIG_FILE"

# Read values directly from the config.json using python - no more nested key error by using a helper script
TASK_ID=$(python3 read_json.py "$CONFIG_FILE" "task_id")
DATA_DIR=$(python3 read_json.py "$CONFIG_FILE" "data_dir")
OUTPUT_DIR=$(python3 read_json.py "$CONFIG_FILE" "output_dir")
EPISODE_NUM=$(python3 read_json.py "$CONFIG_FILE" "episode_num")
GPU=$(python3 read_json.py "$CONFIG_FILE" "gpu")
T5_PATH="/weights/t5-v1_1-xxl"
NO_LANGUAGE=$(python3 read_json.py "$CONFIG_FILE" "no_language")

# For the camera keys, extract them in a way that avoids the error about 'images_info.key.*' not found
CAM_HIGH_KEY=$(python3 -c "import json; print(json.load(open('$CONFIG_FILE'))['images_info']['key'].get('cam_high', ''))")
CAM_RIGHT_WRIST_KEY=$(python3 -c "import json; print(json.load(open('$CONFIG_FILE'))['images_info']['key'].get('cam_right_wrist', ''))")

# create output path
if [ ! -d "$OUTPUT_DIR/$TASK_ID" ]; then
    mkdir -p "$OUTPUT_DIR/$TASK_ID"
    echo "Created output directory: $OUTPUT_DIR/$TASK_ID"
else
    echo "Output directory already exists: $OUTPUT_DIR/$TASK_ID"
fi

# Fix: quote every expansion below. Unquoted values were word-split, so a
# path containing spaces broke the command line, and an EMPTY camera key
# vanished entirely — making argparse consume the NEXT flag as its value.
if [ "$NO_LANGUAGE" = "true" ]; then
    python3 lerobot2rdt.py \
        --data_dir "$DATA_DIR" \
        --output_dir "$OUTPUT_DIR/$TASK_ID" \
        --episode_num "$EPISODE_NUM" \
        --gpu "$GPU" \
        --t5_path "$T5_PATH" \
        --cam_high_key "$CAM_HIGH_KEY" \
        --cam_right_wrist_key "$CAM_RIGHT_WRIST_KEY" \
        --no_language
    status=$?
else
    python3 lerobot2rdt.py \
        --data_dir "$DATA_DIR" \
        --output_dir "$OUTPUT_DIR/$TASK_ID" \
        --episode_num "$EPISODE_NUM" \
        --gpu "$GPU" \
        --t5_path "$T5_PATH" \
        --cam_high_key "$CAM_HIGH_KEY" \
        --cam_right_wrist_key "$CAM_RIGHT_WRIST_KEY"
    status=$?
fi

END_TIME=$(date +%s)
echo "END_TIME: $END_TIME"
echo "TOTAL_TIME: $((END_TIME - BEGIN_TIME))"

# Only generate the output manifest when the conversion succeeded.
if [ $status -eq 0 ]; then
    python3 generate_output.py "$CONFIG_FILE" $((END_TIME - BEGIN_TIME))
else
    echo "lerobot2rdt.py exited with status $status, skipping generate_output.py"
fi

View File

@ -0,0 +1,26 @@
import json
import os
import sys
def generate_output(input_config, time):
    """Write an output.json manifest summarizing a finished conversion run.

    Reads the run's input config, builds <output_dir>/<task_id>/ (created if
    missing), and writes output.json there echoing the run parameters plus
    the elapsed conversion time.

    Args:
        input_config: Path to the input config.json for the run.
        time: Total conversion time in seconds.
    """
    with open(input_config, "r") as f:
        data = json.load(f)
    output_dir_with_taskid = os.path.join(data["output_dir"], str(data["task_id"]))
    # Ensure the output directory exists before writing the output file
    os.makedirs(output_dir_with_taskid, exist_ok=True)
    output_data = {
        "task_id": data["task_id"],
        "convert_time": time,
        "data_dir": data["data_dir"],
        "output_dir": output_dir_with_taskid,
        "episode_num": data["episode_num"],
        # Robustness: "no_language" is optional in the config (convert.sh
        # treats anything but "true" as false) — don't KeyError when absent.
        "no_language": data.get("no_language", False),
    }
    output_json_path = os.path.join(output_dir_with_taskid, "output.json")
    with open(output_json_path, "w") as f:
        json.dump(output_data, f)
if __name__ == "__main__":
    # CLI: generate_output.py <config.json> <elapsed-seconds>
    config_path = sys.argv[1]
    elapsed_seconds = int(sys.argv[2])
    generate_output(config_path, elapsed_seconds)

View File

@ -0,0 +1,368 @@
#!/usr/bin/env python3
"""
LeRobot到RDT数据转换脚本
LeRobot机器人结构
- 5个关节 (shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll)
- 1个夹爪 (gripper)
- 总计6个自由度 (6DOF)
维度映射匹配RDT训练代码
- left_arm_dim = 0 (单臂机器人左臂不存在)
- right_arm_dim = 6 (5关节 + 1夹爪映射到RDT的right_arm部分)
- 状态向量6 [joint1, joint2, joint3, joint4, joint5, gripper]
- RDT索引映射right_arm_joint_0_pos到right_arm_joint_5_pos (索引0-5)
"""
import sys
import os
import h5py
import numpy as np
import cv2
import argparse
import yaml
import json
import subprocess
from pathlib import Path
import pandas as pd
import torch
current_dir = os.path.dirname(__file__)
sys.path.append(os.path.join(current_dir, ".."))
from models.multimodal_encoder.t5_encoder import T5Embedder
def extract_frames_from_video(video_path, output_dir, episode_idx):
    """Decode an mp4 into a list of 640x480 BGR frames via ffmpeg.

    Frames are dumped at 30 fps into a temporary directory under output_dir,
    read back with OpenCV, resized, and returned; the temporary files are
    always removed.

    Args:
        video_path: Path to the source video file.
        output_dir: Directory under which the temp frame folder is created.
        episode_idx: Episode index used to name the temp folder.

    Returns:
        List of numpy arrays (480x640x3); empty list on any failure.
    """
    if not os.path.exists(video_path):
        print(f" No video file: {video_path}")
        return []
    temp_dir = os.path.join(output_dir, f"temp_frames_{episode_idx}")
    os.makedirs(temp_dir, exist_ok=True)
    output_pattern = os.path.join(temp_dir, "frame_%04d.jpg")
    try:
        cmd = [
            'ffmpeg', '-i', video_path,
            '-vf', 'fps=30',
            '-q:v', '2',
            output_pattern,
            '-y'
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print(f" Failed to extract frames with ffmpeg: {result.stderr}")
            return []
        frames = []
        frame_files = sorted(f for f in os.listdir(temp_dir) if f.endswith('.jpg'))
        for frame_file in frame_files:
            frame = cv2.imread(os.path.join(temp_dir, frame_file))
            if frame is not None:
                frames.append(cv2.resize(frame, (640, 480)))
        print(f" Successfully extracted {len(frames)} frames")
        return frames
    except Exception as e:
        print(f" Error extracting frames: {e}")
        return []
    finally:
        # Fix: always remove the temp frame directory. The original cleaned
        # up only on the success path, leaking frames when ffmpeg failed or
        # an exception was raised mid-loop.
        if os.path.isdir(temp_dir):
            for leftover in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, leftover))
            os.rmdir(temp_dir)
def load_lerobot_episode(data_dir, episode_idx, output_dir, cam_high_key="high", cam_right_wrist_key="arm"):
    """Load one LeRobot episode: actions, joint states, and camera frames.

    LeRobot data layout:
    - action: 6-dim [shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll, gripper]
    - observation.state: 6-dim, same ordering
    - images: overhead ("high") camera + wrist ("arm") camera videos

    Args:
        data_dir: LeRobot dataset root (contains data/, videos/, meta/).
        episode_idx: Zero-based episode index.
        output_dir: Scratch location passed to extract_frames_from_video.
        cam_high_key / cam_right_wrist_key: Suffixes of the
            observation.images.* video folders to read from.

    Returns:
        Dict with 'actions' (N,6), 'qpos' (N,6), 'high_images', 'arm_images'
        and 'episode_length', or None when the episode parquet is missing.
    """
    parquet_path = os.path.join(data_dir, "data/chunk-000", f"episode_{episode_idx:06d}.parquet")
    if not os.path.exists(parquet_path):
        print(f"Episode {episode_idx} parquet file does not exist: {parquet_path}")
        return None
    df = pd.read_parquet(parquet_path)
    actions = []
    qpos = []
    # Normalize each per-step action/state to a float32 numpy vector
    # (parquet cells may come back as ndarrays or plain lists).
    for i in range(len(df)):
        action = df['action'].iloc[i]
        state = df['observation.state'].iloc[i]
        if isinstance(action, np.ndarray):
            actions.append(action.astype(np.float32))
        else:
            actions.append(np.array(action, dtype=np.float32))
        if isinstance(state, np.ndarray):
            qpos.append(state.astype(np.float32))
        else:
            qpos.append(np.array(state, dtype=np.float32))
    high_cam_path = os.path.join(data_dir, f"videos/chunk-000/observation.images.{cam_high_key}", f"episode_{episode_idx:06d}.mp4")
    arm_cam_path = os.path.join(data_dir, f"videos/chunk-000/observation.images.{cam_right_wrist_key}", f"episode_{episode_idx:06d}.mp4")
    print(f" Extracting high camera frames...")
    high_images = extract_frames_from_video(high_cam_path, output_dir, episode_idx)
    print(f" Extracting arm camera frames...")
    arm_images = extract_frames_from_video(arm_cam_path, output_dir, episode_idx)
    # Align frame counts with the parquet row count: truncate extras, and pad
    # short streams by repeating the last frame (only when any frame exists).
    target_frames = len(df)
    if len(high_images) > target_frames:
        high_images = high_images[:target_frames]
    if len(arm_images) > target_frames:
        arm_images = arm_images[:target_frames]
    while len(high_images) < target_frames and high_images:
        high_images.append(high_images[-1])
    while len(arm_images) < target_frames and arm_images:
        arm_images.append(arm_images[-1])
    return {
        'actions': np.array(actions),
        'qpos': np.array(qpos),
        'high_images': high_images,
        'arm_images': arm_images,
        'episode_length': len(df)
    }
def images_encoding(imgs):
    """JPEG-encode images and pad every buffer to a common length.

    Each image is encoded with OpenCV; buffers are right-padded with NUL
    bytes up to the longest encoding so they fit a fixed-width HDF5 string
    dataset (dtype S{max_len}).

    Args:
        imgs: List of BGR image arrays (may be empty).

    Returns:
        Tuple (padded_jpeg_buffers, max_len); ([], 0) for empty input.
    """
    if not imgs:
        return [], 0
    encode_data = []
    max_len = 0
    for i in range(len(imgs)):
        success, encoded_image = cv2.imencode(".jpg", imgs[i])
        if success:
            jpeg_data = encoded_image.tobytes()
            encode_data.append(jpeg_data)
            max_len = max(max_len, len(jpeg_data))
        else:
            print(f" Image encoding failed: {i}")
            encode_data.append(b"")
    # Bug fix: the padded buffers were computed but the UNPADDED list was
    # returned, so the padding work was dead code and the returned entries
    # did not honor the max_len contract. Return the padded buffers.
    padded_data = [data.ljust(max_len, b"\0") for data in encode_data]
    return padded_data, max_len
def load_task_instructions(data_dir):
    """Read task instruction strings from meta/tasks.jsonl.

    Returns the list of "task" fields in file order, or None when the
    tasks file does not exist.
    """
    tasks_file = os.path.join(data_dir, "meta/tasks.jsonl")
    if not os.path.exists(tasks_file):
        print(f"Warning: tasks file not found: {tasks_file}")
        return None
    instructions = []
    with open(tasks_file, 'r') as f:
        for raw_line in f:
            record = raw_line.strip()
            if not record:
                continue  # ignore blank lines
            instructions.append(json.loads(record)["task"])
    print(f" 加载了 {len(instructions)} 个任务指令")
    return instructions
def encode_language_instruction(instruction_text, t5_embedder, device):
    """Encode a single instruction with T5; fall back to zeros on failure.

    Returns a (num_valid_tokens, 4096) float numpy array, or a (1, 4096)
    zero array if the embedder raises.
    """
    try:
        embeds, mask = t5_embedder.get_text_embeddings([instruction_text])
        # keep only the non-padded token embeddings of the first sequence
        valid = embeds[0][mask[0]].float()
        return valid.cpu().numpy()
    except Exception as e:
        print(f" Language encoding failed: {e}")
        return np.zeros((1, 4096))
def convert_lerobot_to_rdt(data_dir, output_dir, episode_num, gpu=0, no_language=False, t5_path=None, cam_high_key="high", cam_right_wrist_key="arm"):
    """Convert LeRobot episodes 0..episode_num-1 into RDT-format HDF5 files.

    For each episode, writes output_dir/episode_i/episode_i.hdf5 holding the
    action sequence, qpos, JPEG-encoded camera frames, and per-step arm
    dimension metadata; optionally T5-encodes the first task instruction and
    saves it as instructions/lang_embed_0.pt beside the HDF5.

    Args:
        data_dir: LeRobot dataset root.
        output_dir: Destination directory for converted episodes.
        episode_num: Number of episodes to process, starting at index 0.
        gpu: CUDA device index for the T5 encoder (CPU if CUDA unavailable).
        no_language: Skip instruction encoding entirely.
        t5_path: Local path of pretrained T5 weights.
        cam_high_key / cam_right_wrist_key: Video folder name suffixes.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print(f"Start converting LeRobot data to RDT format...")
    print(f"Data source: {data_dir}")
    print(f"Output directory: {output_dir}")
    print(f"Processing episode number: {episode_num}")
    print(f"GPU device: {gpu}")
    scene_name = os.path.basename(data_dir)  # NOTE(review): never used below — TODO confirm it can be dropped
    instructions = None
    if not no_language:
        instructions = load_task_instructions(data_dir)
    t5_embedder = None
    if not no_language and instructions:
        try:
            print(f" Initializing T5 encoder...")
            t5_embedder = T5Embedder(
                from_pretrained=t5_path,
                device=f"cuda:{gpu}" if torch.cuda.is_available() else "cpu",
                model_max_length=1024,
                use_offload_folder=None,
            )
            print(f" T5 encoder initialized successfully")
        except Exception as e:
            # fall back to a language-free conversion instead of aborting
            print(f" T5 encoder initialization failed: {e}")
            print(f" Will skip language processing")
            no_language = True
    for i in range(episode_num):
        print(f"Processing episode {i}...")
        episode_data = load_lerobot_episode(data_dir, i, output_dir, cam_high_key=cam_high_key, cam_right_wrist_key=cam_right_wrist_key)
        if episode_data is None:
            print(f"Skipping episode {i}")
            continue
        episode_output_dir = os.path.join(output_dir, f"episode_{i}")
        if not os.path.exists(episode_output_dir):
            os.makedirs(episode_output_dir)
        hdf5_path = os.path.join(episode_output_dir, f"episode_{i}.hdf5")
        with h5py.File(hdf5_path, "w") as f:
            f.create_dataset("action", data=episode_data['actions'])
            obs = f.create_group("observations")
            obs.create_dataset("qpos", data=episode_data['qpos'])
            image = obs.create_group("images")
            if episode_data['high_images']:
                print(f" Encoding high camera images...")
                high_enc, len_high = images_encoding(episode_data['high_images'])
                if high_enc and len_high > 0:
                    # fixed-width byte strings; h5py NUL-pads to S{len_high}
                    image.create_dataset("cam_high", data=high_enc, dtype=f"S{len_high}")
                    print(f" Saved high camera images: {len(episode_data['high_images'])} frames")
                else:
                    print(f" Warning: High camera images encoding failed")
            if episode_data['arm_images']:
                print(f" Encoding arm camera images...")
                arm_enc, len_arm = images_encoding(episode_data['arm_images'])
                if arm_enc and len_arm > 0:
                    image.create_dataset("cam_right_wrist", data=arm_enc, dtype=f"S{len_arm}")
                    print(f" Saved arm camera images: {len(episode_data['arm_images'])} frames")
                else:
                    print(f" Warning: Arm camera images encoding failed")
            # Robot dimension info (LeRobot: 5 joints + 1 gripper).
            # Following process_data.py, dimensions are recorded per timestep.
            # LeRobot is a single-arm robot: right arm = 5 joints + 1 gripper
            # = 6 dims; left arm = 0 dims (absent on a single-arm robot).
            left_arm_dim = [0] * len(episode_data['actions'])   # left arm absent
            right_arm_dim = [6] * len(episode_data['actions'])  # 5 joints + gripper
            obs.create_dataset("left_arm_dim", data=np.array(left_arm_dim))
            obs.create_dataset("right_arm_dim", data=np.array(right_arm_dim))
        print(f" Episode {i} converted successfully: {hdf5_path}")
        print(f" Data length: {episode_data['episode_length']}")
        print(f" Action shape: {episode_data['actions'].shape}")
        print(f" Qpos shape: {episode_data['qpos'].shape}")
        print(f" High camera frames: {len(episode_data['high_images'])}")
        print(f" Arm camera frames: {len(episode_data['arm_images'])}")
        if not no_language and t5_embedder and instructions:
            print(f" Processing language instructions...")
            try:
                # every episode uses the first task's instruction
                instruction = instructions[0]
                language_features = encode_language_instruction(instruction, t5_embedder, f"cuda:{gpu}")
                instructions_dir = os.path.join(episode_output_dir, "instructions")
                if not os.path.exists(instructions_dir):
                    os.makedirs(instructions_dir)
                lang_embed_path = os.path.join(instructions_dir, "lang_embed_0.pt")
                torch.save(torch.from_numpy(language_features), lang_embed_path)
                print(f" Language instruction encoded successfully: {instruction}")
                print(f" Language features saved to: {lang_embed_path}")
                print(f" Language features shape: {language_features.shape}, data type: {language_features.dtype}")
            except Exception as e:
                print(f" Language instruction processing failed: {e}")
    print(f"\nConversion completed! Processed {episode_num} episodes")
    print(f"Output directory: {output_dir}")
def main():
    """CLI entry: validate inputs, check ffmpeg, and run the conversion.

    Exits early (returning None) when the data directory, meta file, or
    ffmpeg binary is missing; clamps --episode_num to the dataset size.
    """
    parser = argparse.ArgumentParser(description="Convert LeRobot data to RDT format")
    parser.add_argument("--data_dir", type=str, required=True,
                        help="LeRobot data directory path")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="Output directory path")
    parser.add_argument("--episode_num", type=int, default=10,
                        help="Number of episodes to process")
    parser.add_argument("--gpu", type=int, default=0,
                        help="GPU device ID")
    parser.add_argument("--no_language", action="store_true",
                        help="Skip language processing")
    parser.add_argument("--cam_high_key", type=str, default="cam_high",
                        help="High camera key")
    parser.add_argument("--cam_right_wrist_key", type=str, default="cam_right_wrist",
                        help="Right wrist camera key")
    # NOTE(review): accepted for CLI compatibility but currently unused —
    # convert_lerobot_to_rdt has no left-wrist camera parameter.
    parser.add_argument("--cam_left_wrist_key", type=str, default="cam_left_wrist",
                        help="Left wrist camera key")
    parser.add_argument("--t5_path", type=str, required=True,
                        help="T5 model path")
    args = parser.parse_args()
    if not os.path.exists(args.data_dir):
        print(f"Error: Data directory does not exist: {args.data_dir}")
        return
    meta_file = os.path.join(args.data_dir, "meta/info.json")
    if not os.path.exists(meta_file):
        print(f"Error: Meta information file not found: {meta_file}")
        return
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        print("ffmpeg is available, will use ffmpeg to extract video frames")
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Warning: ffmpeg is not available, image data may not be extracted correctly")
        print("Please install ffmpeg: conda install -c conda-forge ffmpeg=6.1")
        return
    # Fix: info.json is a JSON file — parse it with the json module instead
    # of yaml.safe_load (which only happened to work because YAML is a
    # superset of JSON).
    with open(meta_file, 'r') as f:
        meta_info = json.load(f)
    total_episodes = meta_info.get('total_episodes', 10)
    if args.episode_num > total_episodes:
        print(f"Warning: Requested episode number ({args.episode_num}) exceeds available number ({total_episodes})")
        args.episode_num = total_episodes
    # keyword arguments make the gpu/no_language/t5_path positions explicit
    convert_lerobot_to_rdt(
        args.data_dir,
        args.output_dir,
        args.episode_num,
        gpu=args.gpu,
        no_language=args.no_language,
        t5_path=args.t5_path,
        cam_high_key=args.cam_high_key,
        cam_right_wrist_key=args.cam_right_wrist_key,
    )


if __name__ == "__main__":
    main()

View File

View File

@ -0,0 +1,82 @@
# Reference: DiffusionPolicy [https://github.com/real-stanford/diffusion_policy]
import torch
from torch.nn.modules.batchnorm import _BatchNorm
class EMAModel:
    """
    Exponential Moving Average of models weights
    """

    def __init__(self, model, update_after_step=0, inv_gamma=1.0, power=2 / 3, min_value=0.0, max_value=0.9999):
        """
        @crowsonkb's notes on EMA Warmup:
        If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
        to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
        gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
        at 215.4k steps).

        Args:
            model: module whose weights are maintained as the EMA copy; it is
                put in eval mode, frozen, and mutated in place by step().
            update_after_step (int): steps to wait before EMA updates begin.
            inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
            power (float): Exponential factor of EMA warmup. Default: 2/3.
            min_value (float): The minimum EMA decay rate. Default: 0.
            max_value (float): The maximum EMA decay rate. Default: 0.9999.
        """
        self.averaged_model = model
        self.averaged_model.eval()
        self.averaged_model.requires_grad_(False)
        self.update_after_step = update_after_step
        self.inv_gamma = inv_gamma
        self.power = power
        self.min_value = min_value
        self.max_value = max_value
        self.decay = 0.0
        self.optimization_step = 0

    def get_decay(self, optimization_step):
        """
        Compute the decay factor for the exponential moving average.
        """
        step = max(0, optimization_step - self.update_after_step - 1)
        if step <= 0:
            # still warming up: the EMA copy simply tracks the live model
            return 0.0
        value = 1 - (1 + step / self.inv_gamma)**-self.power
        return max(self.min_value, min(value, self.max_value))

    @torch.no_grad()
    def step(self, new_model):
        """Blend new_model's parameters into the averaged model in place.

        BatchNorm modules and non-trainable parameters are copied verbatim;
        trainable parameters are updated as ema = decay*ema + (1-decay)*new.
        (Dead debug code tracking parameter data pointers was removed.)
        """
        self.decay = self.get_decay(self.optimization_step)
        for module, ema_module in zip(new_model.modules(), self.averaged_model.modules()):
            for param, ema_param in zip(module.parameters(recurse=False), ema_module.parameters(recurse=False)):
                # iterate over immediate parameters only.
                if isinstance(param, dict):
                    raise RuntimeError('Dict parameter not supported')
                if isinstance(module, _BatchNorm):
                    # skip batchnorms: copy their params instead of averaging
                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
                elif not param.requires_grad:
                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
                else:
                    ema_param.mul_(self.decay)
                    ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=1 - self.decay)
        self.optimization_step += 1

View File

@ -0,0 +1,75 @@
import os
from pathlib import Path
from typing import Dict, Optional, Union
from huggingface_hub import PyTorchModelHubMixin
from huggingface_hub.constants import (PYTORCH_WEIGHTS_NAME, SAFETENSORS_SINGLE_FILE)
from huggingface_hub.file_download import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError, is_torch_available
if is_torch_available():
import torch # type: ignore
class CompatiblePyTorchModelHubMixin(PyTorchModelHubMixin):
    """Mixin class to load Pytorch models from the Hub.

    Differs from the stock mixin in that saving always writes a pickle
    checkpoint (pytorch_model.bin) rather than a safetensors file, while
    loading prefers safetensors and falls back to the pickle file.
    """

    def _save_pretrained(self, save_directory: Path) -> None:
        """Save weights from a Pytorch model to a local directory."""
        # To bypass saving into safetensor by default
        model_to_save = self.module if hasattr(self, "module") else self  # type: ignore
        torch.save(model_to_save.state_dict(), save_directory / PYTORCH_WEIGHTS_NAME)

    @classmethod
    def _from_pretrained(
        cls,
        *,
        model_id: str,
        revision: Optional[str],
        cache_dir: Optional[Union[str, Path]],
        force_download: bool,
        proxies: Optional[Dict],
        resume_download: Optional[bool],
        local_files_only: bool,
        token: Union[str, bool, None],
        map_location: str = "cpu",
        strict: bool = False,
        **model_kwargs,
    ):
        """Load Pytorch pretrained weights and return the loaded model.

        `model_id` may be a local directory or a Hub repo id; in both cases
        a safetensors checkpoint is tried first, then the pickle weights.
        """
        # model_kwargs must fully specify the architecture constructor.
        model = cls(**model_kwargs)
        if os.path.isdir(model_id):
            print("Loading weights from local directory")
            try:
                model_file = os.path.join(model_id, SAFETENSORS_SINGLE_FILE)
                return cls._load_as_safetensor(model, model_file, map_location, strict)
            except FileNotFoundError:
                # no model.safetensors present -> fall back to pytorch_model.bin
                model_file = os.path.join(model_id, PYTORCH_WEIGHTS_NAME)
                return cls._load_as_pickle(model, model_file, map_location, strict)
        else:
            try:
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=SAFETENSORS_SINGLE_FILE,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_safetensor(model, model_file, map_location, strict)
            except EntryNotFoundError:
                # repo has no safetensors file -> download the pickle weights
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=PYTORCH_WEIGHTS_NAME,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_pickle(model, model_file, map_location, strict)

View File

@ -0,0 +1,159 @@
import torch
import torch.nn as nn
from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
class CLIPVisionTower(nn.Module):
    """Frozen CLIP ViT image encoder exposing intermediate-layer features.

    Features are taken from the hidden state at ``args.mm_vision_select_layer``
    and either stripped of the CLS token ('patch') or kept whole ('cls_patch').
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()
        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            # tower will be trained, so its weights must be materialized now
            self.load_model()
        else:
            # defer loading; keep only the config so shape properties work
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the image processor and the frozen CLIP vision model."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Select the configured hidden layer and token subset."""
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            # drop the CLS token, keep patch tokens only
            image_features = image_features[:, 1:]
        elif self.select_feature == 'cls_patch':
            image_features = image_features
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor — or a list of single images — to features."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
                                                      output_hidden_states=True)
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                                   output_hidden_states=True)
            image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @property
    def dummy_feature(self):
        # zero feature with the right width/device/dtype (placeholder input)
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # before load_model(), fall back to the standalone config
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2
class CLIPVisionTowerS2(CLIPVisionTower):
    """CLIP tower with S2 multi-scale forward (scaling_on_scales wrapper).

    Images are encoded at each resolution in ``args.s2_scales`` (split into
    tiles of the smallest scale) and the per-scale features are combined by
    the s2wrapper, so hidden_size grows by len(s2_scales).
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__(vision_tower, args, delay_load)
        # e.g. '336,672,1008' -> [336, 672, 1008] (sorted ascending)
        self.s2_scales = getattr(args, 's2_scales', '336,672,1008')
        self.s2_scales = list(map(int, self.s2_scales.split(',')))
        self.s2_scales.sort()
        self.s2_split_size = self.s2_scales[0]   # tile size = smallest scale
        self.s2_image_size = self.s2_scales[-1]  # input size = largest scale
        try:
            from s2wrapper import forward as multiscale_forward
        except ImportError:
            raise ImportError(
                'Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git'
            )
        self.multiscale_forward = multiscale_forward
        # change resize/crop size in preprocessing to the largest image size in s2_scale
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.image_processor.size['shortest_edge'] = self.s2_image_size
            self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

    def load_model(self, device_map=None):
        """Load processor and frozen model, resized for the largest S2 scale."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)
        self.image_processor.size['shortest_edge'] = self.s2_image_size
        self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size
        self.is_loaded = True

    @torch.no_grad()
    def forward_feature(self, images):
        """Single-scale encode used as the callback for multiscale_forward."""
        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                               output_hidden_states=True)
        image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Multi-scale encode of a batched tensor or a list of single images."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_feature = self.multiscale_forward(self.forward_feature,
                                                        image.unsqueeze(0),
                                                        img_sizes=self.s2_scales,
                                                        max_split_size=self.s2_split_size)
                image_features.append(image_feature)
        else:
            image_features = self.multiscale_forward(self.forward_feature,
                                                    images,
                                                    img_sizes=self.s2_scales,
                                                    max_split_size=self.s2_split_size)
        return image_features

    @property
    def hidden_size(self):
        # per-scale features are concatenated along the channel dimension
        return self.config.hidden_size * len(self.s2_scales)

View File

@ -0,0 +1,87 @@
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoImageProcessor, AutoModel, Dinov2Model
class DinoV2VisionTower(nn.Module):
    """Frozen DINOv2 image encoder returning last-hidden-state features.

    'patch' strips the first token from last_hidden_state; 'cls_patch'
    keeps all tokens.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()
        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            # tower will be trained, so it must be materialized now
            self.load_model()
        else:
            # defer loading; keep only the config so shape properties work
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the image processor and the frozen DINOv2 model."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return
        self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)  # FIXME:
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Select token subset from the model's last hidden state."""
        image_features = image_forward_outs.last_hidden_state
        if self.select_feature == 'patch':
            # drop the leading (CLS) token — presumably; verify for this checkpoint
            image_features = image_features[:, 1:]  # (B, 1369, 1536)
        elif self.select_feature == 'cls_patch':
            image_features = image_features  # (B, 1, 1536)
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor — or a list of single images — to features."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
            image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @property
    def dummy_feature(self):
        # zero feature with the right width/device/dtype (placeholder input)
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # before load_model(), fall back to the standalone config
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2

View File

@ -0,0 +1,86 @@
import torch
import torch.nn as nn
from transformers import AutoConfig, SiglipImageProcessor, SiglipVisionModel
class SiglipVisionTower(nn.Module):
    """Frozen SigLIP image encoder.

    'patch' returns the full last_hidden_state; 'cls_patch' returns the
    pooled output instead.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()
        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            # tower will be trained, so it must be materialized now
            self.load_model()
        else:
            # defer loading; keep only the config so shape properties work
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the image processor and the SigLIP vision model.

        NOTE(review): unlike the CLIP/DINOv2 towers, this one only calls
        eval() and does not requires_grad_(False) — confirm intentional.
        """
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return
        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.eval()
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Select either token features or the pooled embedding."""
        if self.select_feature == 'patch':
            image_features = image_forward_outs.last_hidden_state  # (B, 729, 1536)
        elif self.select_feature == 'cls_patch':
            image_features = image_forward_outs.pooler_output  # (B, 1, 1536)
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor — or a list of single images — to features."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
            image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @property
    def dummy_feature(self):
        # zero feature with the right width/device/dtype (placeholder input)
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # before load_model(), fall back to the standalone config
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2

View File

@ -0,0 +1,111 @@
import torch
from transformers import AutoTokenizer, T5EncoderModel
class T5Embedder:
    """HuggingFace T5 encoder + tokenizer wrapper for text embeddings.

    When ``use_offload_folder`` is given, the upper encoder blocks are
    offloaded to disk via the device_map so the model fits on small GPUs.
    """
    # available_models = ["google/t5-v1_1-xxl"]

    def __init__(
        self,
        device,
        from_pretrained=None,
        *,
        cache_dir=None,
        hf_token=None,
        use_text_preprocessing=True,
        t5_model_kwargs=None,
        torch_dtype=None,
        use_offload_folder=None,
        model_max_length=120,
        local_files_only=False,
    ):
        """
        Args:
            device: torch device spec for the encoder (e.g. 'cuda:0').
            from_pretrained: model path or hub id of the T5 checkpoint.
            cache_dir: HF cache directory.
            hf_token: HF auth token (stored; not forwarded here).
            use_text_preprocessing: stored flag; not used in this class.
            t5_model_kwargs: overrides for T5EncoderModel.from_pretrained;
                when None a default (low_cpu_mem_usage + device_map) is built.
            torch_dtype: weight dtype; defaults to bfloat16 to halve memory.
            use_offload_folder: directory for disk offload of upper blocks.
            model_max_length: tokenizer truncation length.
            local_files_only: forbid network downloads when True.
        """
        # from_pretrained="google/t5-v1_1-xxl" # zijian
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        self.cache_dir = cache_dir
        if t5_model_kwargs is None:
            t5_model_kwargs = {
                "low_cpu_mem_usage": True,
                "torch_dtype": self.torch_dtype,
            }
            if use_offload_folder is not None:
                # Keep embeddings and the first 12 encoder blocks on the
                # target device; page the remaining blocks out to disk.
                t5_model_kwargs["offload_folder"] = use_offload_folder
                t5_model_kwargs["device_map"] = {
                    "shared": self.device,
                    "encoder.embed_tokens": self.device,
                    "encoder.block.0": self.device,
                    "encoder.block.1": self.device,
                    "encoder.block.2": self.device,
                    "encoder.block.3": self.device,
                    "encoder.block.4": self.device,
                    "encoder.block.5": self.device,
                    "encoder.block.6": self.device,
                    "encoder.block.7": self.device,
                    "encoder.block.8": self.device,
                    "encoder.block.9": self.device,
                    "encoder.block.10": self.device,
                    "encoder.block.11": self.device,
                    "encoder.block.12": "disk",
                    "encoder.block.13": "disk",
                    "encoder.block.14": "disk",
                    "encoder.block.15": "disk",
                    "encoder.block.16": "disk",
                    "encoder.block.17": "disk",
                    "encoder.block.18": "disk",
                    "encoder.block.19": "disk",
                    "encoder.block.20": "disk",
                    "encoder.block.21": "disk",
                    "encoder.block.22": "disk",
                    "encoder.block.23": "disk",
                    "encoder.final_layer_norm": "disk",
                    "encoder.dropout": "disk",
                }
            else:
                # whole encoder on one device
                t5_model_kwargs["device_map"] = {
                    "shared": self.device,
                    "encoder": self.device,
                }
        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token
        # assert from_pretrained in self.available_models
        self.tokenizer = AutoTokenizer.from_pretrained(
            from_pretrained,
            model_max_length=model_max_length,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.model = T5EncoderModel.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            **t5_model_kwargs,
        ).eval()
        self.model_max_length = model_max_length

    def get_text_embeddings(self, texts):
        """Tokenize `texts` and return (last_hidden_state, attention_mask).

        Sequences are padded to the longest in the batch and truncated to
        model_max_length; embeddings are computed under no_grad.
        """
        text_tokens_and_mask = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding="longest",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        input_ids = text_tokens_and_mask["input_ids"].to(self.device)
        attention_mask = text_tokens_and_mask["attention_mask"].to(self.device)
        with torch.no_grad():
            text_encoder_embs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )["last_hidden_state"].detach()
        return text_encoder_embs, attention_mask
if __name__ == "__main__":
    # Smoke test: instantiate the embedder from the public checkpoint on GPU 7.
    T5Embedder(from_pretrained="google/t5-v1_1-xxl", device='cuda:7')

View File

@ -0,0 +1,304 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT: https://github.com/facebookresearch/DiT
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import math
from collections import OrderedDict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.jit import Final
from timm.models.vision_transformer import Attention, Mlp, RmsNorm, use_fused_attn
#################################################################################
# Embedding Layers for Timesteps and Condition Inputs #
#################################################################################
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.

    A fixed sinusoidal encoding of the timestep is passed through a small
    two-layer SiLU MLP to produce a learnable embedding of ``hidden_size``.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=torch.bfloat16):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.dtype = dtype

    def timestep_embedding(self, t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.

        :param t: a 1-D Tensor of N indices, one per batch element.
            These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings, cast to ``self.dtype``.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half_dim = dim // 2
        exponents = torch.arange(start=0, end=half_dim, dtype=torch.float32, device=t.device) / half_dim
        frequencies = torch.exp(-math.log(max_period) * exponents)
        phase = t[:, None].float() * frequencies[None]
        embedding = torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
        if dim % 2 == 1:
            # Odd target dimension: pad with a single zero column.
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding.to(self.dtype)

    def forward(self, t):
        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
#################################################################################
# Cross Attention Layers #
#################################################################################
class CrossAttention(nn.Module):
    """
    A cross-attention layer with flash attention.

    Queries come from the input sequence ``x``; keys and values come from the
    condition sequence ``c``.  An optional boolean ``mask`` (True = valid
    condition token) restricts which condition tokens may be attended to.
    """
    fused_attn: Final[bool]

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0,
        proj_drop: float = 0,
        norm_layer: nn.Module = nn.LayerNorm,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        # Use PyTorch's fused scaled_dot_product_attention when timm says it is available.
        self.fused_attn = use_fused_attn()
        # Separate projections: queries from x, packed keys/values from c.
        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor, c: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
        # x: (B, N, C) query tokens; c: (B, L, C) condition tokens.
        B, N, C = x.shape
        _, L, _ = c.shape
        # (B, num_heads, N, head_dim)
        q = self.q(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        # (2, B, num_heads, L, head_dim), then unbind into k and v.
        kv = self.kv(c).reshape(B, L, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        k, v = kv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)
        # Prepare attn mask (B, L) to mask the condition:
        # broadcast to (B, 1, N, L) so every head and query row shares it.
        if mask is not None:
            mask = mask.reshape(B, 1, 1, L)
            mask = mask.expand(-1, -1, N, -1)
        if self.fused_attn:
            x = F.scaled_dot_product_attention(query=q,
                                               key=k,
                                               value=v,
                                               dropout_p=self.attn_drop.p if self.training else 0.,
                                               attn_mask=mask)
        else:
            # Manual fallback: scale, score, mask, softmax, dropout, weighted sum.
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            if mask is not None:
                # Invalid (False) condition positions get -inf before softmax.
                attn = attn.masked_fill_(mask.logical_not(), float('-inf'))
            attn = attn.softmax(dim=-1)
            if self.attn_drop.p > 0:
                attn = self.attn_drop(attn)
            x = attn @ v
        # Merge heads back to (B, N, C) and apply the output projection.
        x = x.permute(0, 2, 1, 3).reshape(B, N, C)
        x = self.proj(x)
        if self.proj_drop.p > 0:
            x = self.proj_drop(x)
        return x
#################################################################################
# RDT Block #
#################################################################################
class RDTBlock(nn.Module):
    """
    A RDT block with cross-attention conditioning.

    Pre-norm transformer block: self-attention, then cross-attention over the
    condition tokens, then a feed-forward network — each sub-layer applied to
    the normalized input and added back to the residual stream.
    """

    def __init__(self, hidden_size, num_heads, **block_kwargs):
        super().__init__()
        self.norm1 = RmsNorm(hidden_size, eps=1e-6)
        self.attn = Attention(dim=hidden_size,
                              num_heads=num_heads,
                              qkv_bias=True,
                              qk_norm=True,
                              norm_layer=RmsNorm,
                              **block_kwargs)
        self.cross_attn = CrossAttention(hidden_size,
                                         num_heads=num_heads,
                                         qkv_bias=True,
                                         qk_norm=True,
                                         norm_layer=RmsNorm,
                                         **block_kwargs)
        self.norm2 = RmsNorm(hidden_size, eps=1e-6)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.ffn = Mlp(in_features=hidden_size, hidden_features=hidden_size, act_layer=approx_gelu, drop=0)
        self.norm3 = RmsNorm(hidden_size, eps=1e-6)

    def forward(self, x, c, mask=None):
        # Self-attention, cross-attention, FFN — each with a residual add.
        x = x + self.attn(self.norm1(x))
        x = x + self.cross_attn(self.norm2(x), c, mask)
        x = x + self.ffn(self.norm3(x))
        return x
class FinalLayer(nn.Module):
    """
    The final layer of RDT.

    Normalizes the hidden states and projects them to ``out_channels``
    through a small tanh-GELU MLP.
    """

    def __init__(self, hidden_size, out_channels):
        super().__init__()
        self.norm_final = RmsNorm(hidden_size, eps=1e-6)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.ffn_final = Mlp(in_features=hidden_size,
                             hidden_features=hidden_size,
                             out_features=out_channels,
                             act_layer=approx_gelu,
                             drop=0)

    def forward(self, x):
        return self.ffn_final(self.norm_final(x))
#################################################################################
# Sine/Cosine Positional Embedding Functions #
#################################################################################
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Compute 1-D sine/cosine positional embeddings.

    embed_dim: output dimension for each position (must be even)
    pos: a list/array of positions to be encoded: size (M,)
    out: (M, D) array — first half sine, second half cosine
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000**omega  # (D/2,)
    if not isinstance(pos, np.ndarray):
        pos = np.array(pos, dtype=np.float64)
    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)
    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


def get_nd_sincos_pos_embed_from_grid(embed_dim, grid_sizes):
    """
    Compute N-D sine/cosine positional embeddings.

    embed_dim: output dimension for each position
    grid_sizes: the grid sizes in each dimension (K,); any sequence
    out: (grid_sizes[0], ..., grid_sizes[K-1], D)

    The embedding dimension is split evenly across the dimensions whose grid
    size is larger than 1; size-1 dimensions receive no positional signal.
    """
    # Accept lists as well as tuples (the original `grid_sizes + (embed_dim,)`
    # raised TypeError for a list argument).
    grid_sizes = tuple(grid_sizes)
    num_sizes = len(grid_sizes)
    # For grid size of 1, we do not need to add any positional embedding
    num_valid_sizes = len([x for x in grid_sizes if x > 1])
    emb = np.zeros(grid_sizes + (embed_dim, ))
    # All dimensions trivial: nothing to embed (the original divided by zero).
    if num_valid_sizes == 0:
        return emb
    # Uniformly divide the embedding dimension for each grid size
    dim_for_each_grid = embed_dim // num_valid_sizes
    # To make it even
    if dim_for_each_grid % 2 != 0:
        dim_for_each_grid -= 1
    valid_size_idx = 0
    for size_idx in range(num_sizes):
        grid_size = grid_sizes[size_idx]
        if grid_size <= 1:
            continue
        pos = np.arange(grid_size)
        # Shape the 1-D embedding so it broadcasts along this axis only.
        posemb_shape = [1] * len(grid_sizes) + [dim_for_each_grid]
        posemb_shape[size_idx] = -1
        emb[..., valid_size_idx * dim_for_each_grid:(valid_size_idx + 1) * dim_for_each_grid] += \
            get_1d_sincos_pos_embed_from_grid(dim_for_each_grid, pos).reshape(posemb_shape)
        valid_size_idx += 1
    return emb
def get_multimodal_cond_pos_embed(embed_dim, mm_cond_lens: OrderedDict, embed_modality=True):
    """
    Generate position embeddings for multimodal conditions.
    mm_cond_lens: an OrderedDict containing
        (modality name, modality token length) pairs.
        For `"image"` modality, the value can be a multi-dimensional tuple.
        If the length < 0, it means there is no position embedding for the modality or grid.
    embed_modality: whether to embed the modality information. Default is True.
    return: (total_token_count, embed_dim) numpy array, with all modalities
        concatenated in the OrderedDict's order.
    """
    num_modalities = len(mm_cond_lens)
    modality_pos_embed = np.zeros((num_modalities, embed_dim))
    if embed_modality:
        # Get embeddings for various modalities
        # We put it in the first half
        modality_sincos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, torch.arange(num_modalities))
        modality_pos_embed[:, :embed_dim // 2] = modality_sincos_embed
        # The second half is for position embeddings
        pos_embed_dim = embed_dim // 2
    else:
        # The whole embedding is for position embeddings
        pos_embed_dim = embed_dim
    # Get embeddings for positions inside each modality
    c_pos_emb = np.zeros((0, embed_dim))
    for idx, (modality, cond_len) in enumerate(mm_cond_lens.items()):
        if modality == "image" and \
        (isinstance(cond_len, tuple) or isinstance(cond_len, list)):
            # Multi-dimensional image grid: negative axis sizes keep the token
            # count (abs value) but embed that axis as size 1, i.e. no
            # positional signal along it.
            all_grid_sizes = tuple([abs(x) for x in cond_len])
            embed_grid_sizes = tuple([x if x > 0 else 1 for x in cond_len])
            cond_sincos_embed = get_nd_sincos_pos_embed_from_grid(pos_embed_dim, embed_grid_sizes)
            cond_pos_embed = np.zeros(all_grid_sizes + (embed_dim, ))
            cond_pos_embed[..., -pos_embed_dim:] += cond_sincos_embed
            cond_pos_embed = cond_pos_embed.reshape((-1, embed_dim))
        else:
            # 1-D modality: a negative length produces abs(cond_len) tokens
            # that all share the position-0 embedding (broadcast below).
            cond_sincos_embed = get_1d_sincos_pos_embed_from_grid(pos_embed_dim,
                                                                  torch.arange(cond_len if cond_len > 0 else 1))
            cond_pos_embed = np.zeros((abs(cond_len), embed_dim))
            cond_pos_embed[:, -pos_embed_dim:] += cond_sincos_embed
        # Add the per-modality embedding (zeros when embed_modality=False).
        cond_pos_embed += modality_pos_embed[idx]
        c_pos_emb = np.concatenate([c_pos_emb, cond_pos_embed], axis=0)
    return c_pos_emb

View File

@ -0,0 +1,156 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT: https://github.com/facebookresearch/DiT
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
from collections import OrderedDict
import torch
import torch.nn as nn
from pathlib import Path
import sys, os
# get current workspace
current_file = Path(__file__)
sys.path.append(str(current_file.parent.parent))
from rdt.blocks import (FinalLayer, RDTBlock, TimestepEmbedder, get_1d_sincos_pos_embed_from_grid,
get_multimodal_cond_pos_embed)
class RDT(nn.Module):
    """
    Class for Robotics Diffusion Transformers.

    A transformer that denoises an action token sequence conditioned on
    language tokens, image tokens, the diffusion timestep, and the control
    frequency.  Blocks alternate between language and image cross-attention.
    """

    def __init__(self,
                 output_dim=128,
                 horizon=32,
                 hidden_size=1152,
                 depth=28,
                 num_heads=16,
                 max_lang_cond_len=1024,
                 img_cond_len=4096,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        # output_dim: dimension of each predicted action vector.
        # horizon: number of action tokens to predict.
        # max_lang_cond_len / img_cond_len: sizes of the language and image
        #   positional-embedding tables.
        # lang_pos_embed_config / img_pos_embed_config: optional multimodal
        #   position-embedding layouts ((name, length) pairs); when None a
        #   plain 1-D sin-cos table is used.
        super().__init__()
        self.horizon = horizon
        self.hidden_size = hidden_size
        self.max_lang_cond_len = max_lang_cond_len
        self.img_cond_len = img_cond_len
        self.dtype = dtype
        self.lang_pos_embed_config = lang_pos_embed_config
        self.img_pos_embed_config = img_pos_embed_config

        self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
        self.freq_embedder = TimestepEmbedder(hidden_size, dtype=dtype)

        # We will use trainable sin-cos embeddings
        # [timestep; state; action]
        self.x_pos_embed = nn.Parameter(torch.zeros(1, horizon + 3, hidden_size))
        # Language conditions
        self.lang_cond_pos_embed = nn.Parameter(torch.zeros(1, max_lang_cond_len, hidden_size))
        # Image conditions
        self.img_cond_pos_embed = nn.Parameter(torch.zeros(1, img_cond_len, hidden_size))

        self.blocks = nn.ModuleList([RDTBlock(hidden_size, num_heads) for _ in range(depth)])
        self.final_layer = FinalLayer(hidden_size, output_dim)
        self.initialize_weights()

    def initialize_weights(self):
        """Xavier-init linears, fill positional tables with sin-cos values,
        re-init the timestep/freq MLPs, and zero the final projection."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize pos_embed by sin-cos embedding
        x_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                    mm_cond_lens=OrderedDict([
                                                        ('timestep', 1),
                                                        ('ctrl_freq', 1),
                                                        ('state', 1),
                                                        ('action', self.horizon),
                                                    ]))
        self.x_pos_embed.data.copy_(torch.from_numpy(x_pos_embed).float().unsqueeze(0))

        if self.lang_pos_embed_config is None:
            lang_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size,
                                                                    torch.arange(self.max_lang_cond_len))
        else:
            lang_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                                mm_cond_lens=OrderedDict(self.lang_pos_embed_config),
                                                                embed_modality=False)
        self.lang_cond_pos_embed.data.copy_(torch.from_numpy(lang_cond_pos_embed).float().unsqueeze(0))

        if self.img_pos_embed_config is None:
            img_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.img_cond_len))
        else:
            img_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                               mm_cond_lens=OrderedDict(self.img_pos_embed_config),
                                                               embed_modality=False)
        self.img_cond_pos_embed.data.copy_(torch.from_numpy(img_cond_pos_embed).float().unsqueeze(0))

        # Initialize timestep and control freq embedding MLP
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[2].weight, std=0.02)

        # Initialize the final layer: zero-out the final linear layer
        nn.init.constant_(self.final_layer.ffn_final.fc2.weight, 0)
        nn.init.constant_(self.final_layer.ffn_final.fc2.bias, 0)

        # Move all the params to given data type:
        self.to(self.dtype)

    def forward(self, x, freq, t, lang_c, img_c, lang_mask=None, img_mask=None):
        """
        Forward pass of RDT.

        x: (B, T, D), state + action token sequence, T = horizon + 1,
            dimension D is assumed to be the same as the hidden size.
        freq: (B,), a scalar indicating control frequency.
        t: (B,) or (1,), diffusion timesteps.
        lang_c: (B, L_lang, D) or None, language condition tokens (variable length),
            dimension D is assumed to be the same as the hidden size.
        img_c: (B, L_img, D) or None, image condition tokens (fixed length),
            dimension D is assumed to be the same as the hidden size.
        lang_mask: (B, L_lang) or None, language condition mask (True for valid).
        img_mask: (B, L_img) or None, image condition mask (True for valid).
        return: (B, horizon, output_dim), the denoised action tokens.
        """
        t = self.t_embedder(t).unsqueeze(1)  # (B, 1, D) or (1, 1, D)
        freq = self.freq_embedder(freq).unsqueeze(1)  # (B, 1, D)
        # Append timestep to the input tokens
        if t.shape[0] == 1:
            # Broadcast a shared timestep across the batch.
            t = t.expand(x.shape[0], -1, -1)
        x = torch.cat([t, freq, x], dim=1)  # (B, T+1, D)

        # Add multimodal position embeddings
        x = x + self.x_pos_embed
        # Note the lang is of variable length
        lang_c = lang_c + self.lang_cond_pos_embed[:, :lang_c.shape[1]]
        img_c = img_c + self.img_cond_pos_embed

        # Forward pass: even-indexed blocks cross-attend to language,
        # odd-indexed blocks to images.
        conds = [lang_c, img_c]
        masks = [lang_mask, img_mask]
        for i, block in enumerate(self.blocks):
            c, mask = conds[i % 2], masks[i % 2]
            x = block(x, c, mask)  # (B, T+1, D)
        # Project to the output dimension.
        x = self.final_layer(x)  # (B, T+1, out_channels)

        # Only preserve the action tokens
        x = x[:, -self.horizon:]
        return x

View File

@ -0,0 +1,246 @@
import re, sys, os
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_dpmsolver_multistep import \
DPMSolverMultistepScheduler
from pathlib import Path
# get current workspace
current_file = Path(__file__)
sys.path.append(os.path.join(current_file.parent))
from hub_mixin import CompatiblePyTorchModelHubMixin
from rdt.model import RDT
class RDTRunner(nn.Module,
                CompatiblePyTorchModelHubMixin,
                repo_url="https://huggingface.co/robotics-diffusion-transformer/rdt-1b"):
    """
    Wrapper around the RDT diffusion transformer.

    Holds the condition adaptors (language / image / state), a DDPM scheduler
    used to add noise during training, and a DPM-Solver multistep scheduler
    used for sampling at inference time.
    """

    def __init__(self,
                 *,
                 action_dim,
                 pred_horizon,
                 config,
                 lang_token_dim,
                 img_token_dim,
                 state_token_dim,
                 max_lang_cond_len,
                 img_cond_len,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        # config: dict with 'rdt' (structure), the three adaptor type strings,
        # and a 'noise_scheduler' section (see the YAML configs).
        super(RDTRunner, self).__init__()

        # Create diffusion model
        hidden_size = config['rdt']['hidden_size']
        self.model = RDT(
            output_dim=action_dim,
            horizon=pred_horizon,
            hidden_size=hidden_size,
            depth=config['rdt']['depth'],
            num_heads=config['rdt']['num_heads'],
            max_lang_cond_len=max_lang_cond_len,
            img_cond_len=img_cond_len,
            lang_pos_embed_config=lang_pos_embed_config,
            img_pos_embed_config=img_pos_embed_config,
            dtype=dtype,
        )

        # Create adaptors for various conditional inputs
        self.lang_adaptor = self.build_condition_adapter(config['lang_adaptor'],
                                                         in_features=lang_token_dim,
                                                         out_features=hidden_size)
        self.img_adaptor = self.build_condition_adapter(config['img_adaptor'],
                                                        in_features=img_token_dim,
                                                        out_features=hidden_size)
        # A `state` refers to an action or a proprioception vector
        self.state_adaptor = self.build_condition_adapter(
            config['state_adaptor'],
            in_features=state_token_dim * 2,  # state + state mask (indicator)
            out_features=hidden_size)

        # Create the noise scheduler
        noise_scheduler_config = config['noise_scheduler']
        # DDPM for the training-time forward (noising) process.
        self.noise_scheduler = DDPMScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
            clip_sample=noise_scheduler_config['clip_sample'],
        )
        # DPM-Solver multistep for few-step sampling at inference time.
        self.noise_scheduler_sample = DPMSolverMultistepScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
        )

        self.num_train_timesteps = noise_scheduler_config['num_train_timesteps']
        self.num_inference_timesteps = noise_scheduler_config['num_inference_timesteps']
        self.prediction_type = noise_scheduler_config['prediction_type']

        self.pred_horizon = pred_horizon
        self.action_dim = action_dim

        print("Diffusion params: %e" %
              sum([p.numel() for p in self.model.parameters()] + [p.numel() for p in self.lang_adaptor.parameters()] +
                  [p.numel()
                   for p in self.img_adaptor.parameters()] + [p.numel() for p in self.state_adaptor.parameters()]))

    def build_condition_adapter(self, projector_type, in_features, out_features):
        """Build a projector from `projector_type`: 'linear' or 'mlp<N>x_gelu'
        (N linear layers with tanh-GELU activations in between).

        Raises ValueError for any other projector type string.
        """
        projector = None
        if projector_type == 'linear':
            projector = nn.Linear(in_features, out_features)
        else:
            mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
            if mlp_gelu_match:
                mlp_depth = int(mlp_gelu_match.group(1))
                modules = [nn.Linear(in_features, out_features)]
                for _ in range(1, mlp_depth):
                    modules.append(nn.GELU(approximate="tanh"))
                    modules.append(nn.Linear(out_features, out_features))
                projector = nn.Sequential(*modules)

        if projector is None:
            raise ValueError(f'Unknown projector type: {projector_type}')
        return projector

    def adapt_conditions(self, lang_tokens, img_tokens, state_tokens):
        '''
        Project each condition stream to the model's hidden size.

        lang_tokens: (batch_size, lang_len, lang_token_dim)
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, state_len, state_token_dim)

        return: adapted (..., hidden_size) for all input tokens
        '''
        adpated_lang = self.lang_adaptor(lang_tokens)
        adpated_img = self.img_adaptor(img_tokens)
        adpated_state = self.state_adaptor(state_tokens)
        return adpated_lang, adpated_img, adpated_state

    def conditional_sample(self, lang_cond, lang_attn_mask, img_cond, state_traj, action_mask, ctrl_freqs):
        '''
        Run the reverse diffusion process to sample an action trajectory.

        lang_cond: language conditional data, (batch_size, lang_len, hidden_size).
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_cond: image conditional data, (batch_size, img_len, hidden_size).
        state_traj: (batch_size, 1, hidden_size), state trajectory.
        action_mask: (batch_size, 1, action_dim), a 0-1 **float** tensor
            indicating the valid action dimensions.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: (batch_size, horizon, action_dim)
        '''
        device = state_traj.device
        dtype = state_traj.dtype
        # Start from pure Gaussian noise and iteratively denoise.
        noisy_action = torch.randn(size=(state_traj.shape[0], self.pred_horizon, self.action_dim),
                                   dtype=dtype,
                                   device=device)
        action_mask = action_mask.expand(-1, self.pred_horizon, -1)

        # Set step values
        self.noise_scheduler_sample.set_timesteps(self.num_inference_timesteps)

        for t in self.noise_scheduler_sample.timesteps:
            # Prepare state-action trajectory
            action_traj = torch.cat([noisy_action, action_mask], dim=2)
            action_traj = self.state_adaptor(action_traj)
            state_action_traj = torch.cat([state_traj, action_traj], dim=1)

            # Predict the model output
            # NOTE(review): img_mask is not passed here — all image tokens are
            # treated as valid; confirm this is intended.
            model_output = self.model(state_action_traj,
                                      ctrl_freqs,
                                      t.unsqueeze(-1).to(device),
                                      lang_cond,
                                      img_cond,
                                      lang_mask=lang_attn_mask)

            # Compute previous actions: x_t -> x_t-1
            noisy_action = self.noise_scheduler_sample.step(model_output, t, noisy_action).prev_sample
            noisy_action = noisy_action.to(state_traj.dtype)

        # Finally apply the action mask to mask invalid action dimensions
        noisy_action = noisy_action * action_mask
        return noisy_action

    # ========= Train ============
    def compute_loss(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_gt, action_mask,
                     ctrl_freqs) -> torch.Tensor:
        '''
        Diffusion training objective (MSE against noise or clean actions).

        lang_tokens: (batch_size, lang_len, lang_token_dim)
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, 1, state_token_dim)
        action_gt: (batch_size, horizon, state_token_dim), ground-truth actions for supervision
        action_mask: (batch_size, 1, state_token_dim), a 0-1 **float** tensor.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: loss_value, a scalar tensor
        '''
        batch_size = lang_tokens.shape[0]
        device = lang_tokens.device

        # Sample noise that we'll add to the actions
        noise = torch.randn(action_gt.shape, dtype=action_gt.dtype, device=device)
        # Sample random diffusion timesteps
        timesteps = torch.randint(0, self.num_train_timesteps, (batch_size, ), device=device).long()
        # Add noise to the clean actions according to the noise magnitude at each timestep
        # (this is the forward diffusion process)
        noisy_action = self.noise_scheduler.add_noise(action_gt, noise, timesteps)

        # Concatenate the state and action tokens to form the input sequence
        state_action_traj = torch.cat([state_tokens, noisy_action], dim=1)
        # Append the action mask to the input sequence
        action_mask = action_mask.expand(-1, state_action_traj.shape[1], -1)
        state_action_traj = torch.cat([state_action_traj, action_mask], dim=2)
        # Align the dimension with the hidden size
        lang_cond, img_cond, state_action_traj = self.adapt_conditions(lang_tokens, img_tokens, state_action_traj)

        # Predict the denoised result
        pred = self.model(state_action_traj, ctrl_freqs, timesteps, lang_cond, img_cond, lang_mask=lang_attn_mask)

        # Regression target depends on the scheduler's prediction type.
        pred_type = self.prediction_type
        if pred_type == 'epsilon':
            target = noise
        elif pred_type == 'sample':
            target = action_gt
        else:
            raise ValueError(f"Unsupported prediction type {pred_type}")

        loss = F.mse_loss(pred, target)
        return loss

    # ========= Inference ============
    def predict_action(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_mask, ctrl_freqs):
        '''
        lang_tokens: (batch_size, lang_len, lang_token_dim)
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, 1, state_token_dim)
        action_mask: (batch_size, 1, action_dim),
            which should be a 0-1 **float** tensor.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: (batch_size, horizon, action_dim), predicted action sequence
        '''
        # Prepare the state and conditions
        state_tokens = torch.cat([state_tokens, action_mask], dim=2)
        lang_cond, img_cond, state_traj = self.adapt_conditions(lang_tokens, img_tokens, state_tokens)

        # Run sampling
        action_pred = self.conditional_sample(
            lang_cond,
            lang_attn_mask,
            img_cond,
            state_traj,
            action_mask,
            ctrl_freqs,
        )
        return action_pred

    def forward(self, *args, **kwargs) -> torch.Tensor:
        """Alias for `compute_loss`, so the module can be trained directly."""
        return self.compute_loss(*args, **kwargs)

View File

@ -0,0 +1,20 @@
import sys
import json
def read_json_value(file_path, key):
    """Print the value stored under *key* in the JSON file at *file_path*.

    Prints the value when the key exists, otherwise prints a diagnostic
    message.  Also returns the value (or None when missing) so the function
    is usable programmatically, not only as a CLI helper.
    """
    # Explicit encoding: JSON is UTF-8 by spec, independent of the locale.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    value = data.get(key)
    if value is not None:
        print(value)
    else:
        print(f"Key '{key}' not found in {file_path}")
    return value
if __name__ == "__main__":
    # CLI entry point: expects a JSON file path and a key to look up.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python read_json.py <file_path> <key>")
        sys.exit(1)
    read_json_value(cli_args[0], cli_args[1])

View File

@ -0,0 +1,24 @@
numpy<2.0
packaging==24.0
deepspeed==0.14.2
accelerate==0.30.1
diffusers==0.27.2
timm==1.0.3
transformers==4.41.0
sentencepiece==0.2.0
h5py==3.11.0
opencv-python==4.9.0.80
imgaug==0.4.0
pytz==2022.1
huggingface_hub==0.23.0
pandas==2.3.3
# requirements_data.txt
# tfds-nightly==4.9.4.dev202402070044
gsutil==5.27
tensorflow==2.15.0.post1
pillow==10.2.0
pyyaml==6.0.1
tensorflow-graphics==2021.12.3
imageio==2.34.0
imageio-ffmpeg==0.4.9

View File

@ -0,0 +1,2 @@
input/*
output/*

48
RDT/rdt-export/Dockerfile Normal file
View File

@ -0,0 +1,48 @@
# CUDA 11.8 + cuDNN 8 development image (Ubuntu 22.04 base).
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04

WORKDIR /app

# Non-interactive apt, unbuffered Python output, Shanghai timezone.
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai

# Switch apt to the Tsinghua mirror (faster downloads inside CN networks).
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
    sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list

# Python 3.10 from the deadsnakes PPA plus shared libs used by OpenCV/ffmpeg.
RUN apt-get update --allow-unauthenticated && apt-get install -y \
    software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y \
    python3.10 \
    python3.10-dev \
    python3-pip \
    python3.10-distutils \
    libgl1-mesa-glx \
    libglib2.0-0 \
    wget \
    ffmpeg \
    libsm6 \
    libxext6 \
    && rm -rf /var/lib/apt/lists/*

# Make `python3` resolve to the freshly-installed 3.10.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

COPY . /app/

RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
# RUN pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
# RUN pip install torch==2.1.0 torchvision==0.16.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install packaging==24.0
RUN pip install tfds-nightly==4.9.4.dev202402070044
# Pre-built flash-attention wheel shipped with the build context.
RUN pip install flash_attn-2.7.2.post1+cu12torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# RUN mkdir -p /app/dataset/input /app/dataset/output

ENTRYPOINT ["bash", "deploy.sh"]

View File

@ -0,0 +1,71 @@
common:
# The number of historical images
img_history_size: 2
# The number of future actions to predict
action_chunk_size: 64
# The number of cameras to be used in the model
num_cameras: 3
# Dimension for state/action, we use the same space for both state and action
# This MUST be equal to configs/state_vec.py
state_dim: 128
dataset:
# We will extract the data from raw dataset
# and store them in the disk buffer by producer
# When training, we will read the data
# randomly from the buffer by consumer
# The producer will replace the data which has been
# read by the consumer with new data
# The path to the buffer (at least 400GB)
buf_path: /path/to/buffer
# The number of chunks in the buffer
buf_num_chunks: 512
# The number of samples (step rather than episode) in each chunk
buf_chunk_size: 512
# We will filter the episodes with length less than `epsd_len_thresh_low`
epsd_len_thresh_low: 32
# For those more than `epsd_len_thresh_high`,
# we will randomly sample `epsd_len_thresh_high` steps each time we load the episode
# to better balance the training datasets
epsd_len_thresh_high: 2048
# How to fit the image size
image_aspect_ratio: pad
# Maximum number of language tokens
tokenizer_max_length: 1024
model:
# Config for condition adaptors
lang_adaptor: mlp2x_gelu
img_adaptor: mlp2x_gelu
state_adaptor: mlp3x_gelu
lang_token_dim: 4096
img_token_dim: 1152
# Dim of action or proprioception vector
# A `state` refers to an action or a proprioception vector
state_token_dim: 128
# Config for RDT structure
rdt:
# 1B: num_head 32 hidden_size 2048
hidden_size: 1024
depth: 14
num_heads: 32
cond_pos_embed_type: multimodal
# For noise scheduler
noise_scheduler:
type: ddpm
num_train_timesteps: 1000
num_inference_timesteps: 5
beta_schedule: squaredcos_cap_v2 # Critical choice
prediction_type: sample
clip_sample: False
# For EMA (params averaging)
# We do not use EMA currently
ema:
update_after_step: 0
inv_gamma: 1.0
power: 0.75
min_value: 0.0
max_value: 0.9999

View File

@ -0,0 +1,71 @@
common:
# The number of historical images
img_history_size: 2
# The number of future actions to predict
action_chunk_size: 64
# The number of cameras to be used in the model
num_cameras: 3
# Dimension for state/action, we use the same space for both state and action
# This MUST be equal to configs/state_vec.py
state_dim: 128
dataset:
# We will extract the data from raw dataset
# and store them in the disk buffer by producer
# When training, we will read the data
# randomly from the buffer by consumer
# The producer will replace the data which has been
# read by the consumer with new data
# The path to the buffer (at least 400GB)
buf_path: /path/to/buffer
# The number of chunks in the buffer
buf_num_chunks: 512
# The number of samples (step rather than episode) in each chunk
buf_chunk_size: 512
# We will filter the episodes with length less than `epsd_len_thresh_low`
epsd_len_thresh_low: 32
# For those more than `epsd_len_thresh_high`,
# we will randomly sample `epsd_len_thresh_high` steps each time we load the episode
# to better balance the training datasets
epsd_len_thresh_high: 2048
# How to fit the image size
image_aspect_ratio: pad
# Maximum number of language tokens
tokenizer_max_length: 1024
model:
# Config for condition adaptors
lang_adaptor: mlp2x_gelu
img_adaptor: mlp2x_gelu
state_adaptor: mlp3x_gelu
lang_token_dim: 4096
img_token_dim: 1152
# Dim of action or proprioception vector
# A `state` refers to an action or a proprioception vector
state_token_dim: 128
# Config for RDT structure
rdt:
# 1B: num_head 32 hidden_size 2048
hidden_size: 2048
depth: 28
num_heads: 32
cond_pos_embed_type: multimodal
# For noise scheduler
noise_scheduler:
type: ddpm
num_train_timesteps: 1000
num_inference_timesteps: 5
beta_schedule: squaredcos_cap_v2 # Critical choice
prediction_type: sample
clip_sample: False
# For EMA (params averaging)
# We do not use EMA currently
ema:
update_after_step: 0
inv_gamma: 1.0
power: 0.75
min_value: 0.0
max_value: 0.9999

View File

@ -0,0 +1,50 @@
{
"A": [
[
-0.2691913843154907,
-0.21995729207992554,
-0.182277649641037
],
[
0.35127854347229004,
0.2769763469696045,
0.17159393429756165
]
],
"B": [
[
-0.2576896846294403,
-0.22244493663311005,
-0.20557966828346252
],
[
0.32854634523391724,
0.2922680974006653,
0.17373555898666382
]
],
"C": [
[
-0.29205888509750366,
-0.24688798189163208,
-0.17577645182609558
],
[
0.25053921341896057,
0.3277084231376648,
0.16431939601898193
]
],
"D": [
[
-0.25131964683532715,
-0.15233077108860016,
-0.13294968008995056
],
[
0.19209328293800354,
0.19344553351402283,
0.1370421051979065
]
]
}

View File

@ -0,0 +1,65 @@
{
"fractal20220817_data": 3,
"taco_play": 15,
"jaco_play": 10,
"berkeley_cable_routing": 10,
"nyu_door_opening_surprising_effectiveness": 3,
"viola": 20,
"berkeley_autolab_ur5": 5,
"toto": 30,
"kuka": 10,
"language_table": 10,
"columbia_cairlab_pusht_real": 10,
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 20,
"nyu_rot_dataset_converted_externally_to_rlds":3,
"stanford_hydra_dataset_converted_externally_to_rlds": 10,
"austin_buds_dataset_converted_externally_to_rlds": 20,
"nyu_franka_play_dataset_converted_externally_to_rlds": 3,
"maniskill_dataset_converted_externally_to_rlds": 20,
"furniture_bench_dataset_converted_externally_to_rlds": 10,
"ucsd_kitchen_dataset_converted_externally_to_rlds": 2,
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 3,
"austin_sailor_dataset_converted_externally_to_rlds": 20,
"austin_sirius_dataset_converted_externally_to_rlds": 20,
"bc_z": 10,
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 10,
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 10,
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
"utokyo_xarm_bimanual_converted_externally_to_rlds": 10,
"berkeley_mvp_converted_externally_to_rlds": 5,
"berkeley_rpt_converted_externally_to_rlds": 30,
"kaist_nonprehensile_converted_externally_to_rlds": 10,
"stanford_mask_vit_converted_externally_to_rlds": 0,
"tokyo_u_lsmo_converted_externally_to_rlds": 10,
"dlr_sara_pour_converted_externally_to_rlds": 10,
"dlr_sara_grid_clamp_converted_externally_to_rlds": 10,
"dlr_edan_shared_control_converted_externally_to_rlds": 5,
"asu_table_top_converted_externally_to_rlds": 12.5,
"stanford_robocook_converted_externally_to_rlds": 5,
"eth_agent_affordances": 66.6,
"imperialcollege_sawyer_wrist_cam": 10,
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 20,
"uiuc_d3field": 1,
"utaustin_mutex": 20,
"berkeley_fanuc_manipulation": 10,
"cmu_play_fusion": 5,
"cmu_stretch": 10,
"berkeley_gnm_recon": 3,
"berkeley_gnm_cory_hall": 5,
"berkeley_gnm_sac_son": 10,
"robo_net": 1,
"roboturk_real_towercreation": 10,
"roboturk_real_laundrylayout": 10,
"roboturk_real_objectsearch": 10,
"aloha_mobile": 50,
"aloha_static": 50,
"roboset": 5,
"droid": 15,
"fmb": 10,
"dobbe": 30,
"qut_dexterous_manpulation": 30,
"agilex": 25,
"rh20t": 10,
"calvin": 30,
"bridgev2": 5
}

View File

@ -0,0 +1,575 @@
{
"fractal20220817_data": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[
1,0,0,0
]
},
"taco_play": {
"image_keys": [
"rgb_static",
"rgb_gripper",
"rgb_static",
"rgb_static"
],
"image_mask":[
1,1,0,0
]
},
"jaco_play": {
"image_keys": [
"image",
"image_wrist",
"image_wrist",
"image_wrist"
],
"image_mask":[
1,1,0,0
]
},
"berkeley_cable_routing": {
"image_keys": [
"image",
"wrist45_image",
"wrist225_image",
"top_image"
],
"image_mask":[1,1,0,1]
},
"nyu_door_opening_surprising_effectiveness": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"viola": {
"image_keys": [
"agentview_rgb",
"eye_in_hand_rgb",
"eye_in_hand_rgb",
"eye_in_hand_rgb"
],
"image_mask":[1,1,0,0]
},
"berkeley_autolab_ur5": {
"image_keys": [
"image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[1,1,0,0]
},
"toto": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"kuka": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"language_table": {
"image_keys": [
"rgb",
"rgb",
"rgb",
"rgb"
],
"image_mask":[1,0,0,0]
},
"columbia_cairlab_pusht_real": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"nyu_rot_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_hydra_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"austin_buds_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"nyu_franka_play_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image_additional_view",
"image_additional_view",
"image_additional_view"
],
"image_mask":[1,0,0,1]
},
"maniskill_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"furniture_bench_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"ucsd_kitchen_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"austin_sailor_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"austin_sirius_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"bc_z": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": {
"image_keys": [
"image",
"hand_image",
"hand_image",
"image2"
],
"image_mask":[1,1,0,1]
},
"utokyo_xarm_bimanual_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_mvp_converted_externally_to_rlds": {
"image_keys": [
"hand_image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[0,1,0,0]
},
"berkeley_rpt_converted_externally_to_rlds": {
"image_keys": [
"hand_image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[0,1,0,0]
},
"kaist_nonprehensile_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_mask_vit_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"tokyo_u_lsmo_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_sara_pour_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_sara_grid_clamp_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_edan_shared_control_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"asu_table_top_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_robocook_converted_externally_to_rlds": {
"image_keys": [
"image_2",
"image_1",
"image_3",
"image_4"
],
"image_mask":[1,0,0,1]
},
"eth_agent_affordances": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"imperialcollege_sawyer_wrist_cam": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[0,1,0,0]
},
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"uiuc_d3field": {
"image_keys": [
"image_1",
"image_2",
"image_3",
"image_4"
],
"image_mask":[1,0,0,1]
},
"utaustin_mutex": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"berkeley_fanuc_manipulation": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"cmu_play_fusion": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"cmu_stretch": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_recon": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_cory_hall": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_sac_son": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"robo_net": {
"image_keys": [
"image",
"image1",
"image2",
"image2"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_towercreation": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_laundrylayout": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_objectsearch": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"aloha_mobile": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_right_wrist"
],
"image_mask":[1,1,1,0]
},
"aloha_static": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_low"
],
"image_mask":[1,1,1,1]
},
"roboset": {
"image_keys": [
"rgb_top",
"rgb_right",
"rgb_left",
"rgb_right"
],
"image_mask":[1,1,1,0]
},
"droid": {
"image_keys": [
"exterior_image_1_left",
"wrist_image_left",
"wrist_image_left",
"exterior_image_2_left"
],
"image_mask":[1,1,0,1]
},
"fmb": {
"image_keys": [
"image_side_1",
"image_wrist_1",
"image_wrist_1",
"image_side_2"
],
"image_mask":[1,1,0,1]
},
"dobbe": {
"image_keys": [
"wrist_image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[0,1,0,0]
},
"qut_dexterous_manpulation": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"agilex": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_right_wrist"
],
"image_mask":[1,1,1,0]
},
"rh20t": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"calvin": {
"image_keys": [
"rgb_static",
"rgb_gripper",
"rgb_gripper",
"rgb_gripper"
],
"image_mask":[1,1,0,0]
},
"bridgev2": {
"image_keys": [
"images0",
"images0",
"images0",
"images0"
],
"image_mask":[1,0,0,0]
}
}

View File

@ -0,0 +1,525 @@
{
"agilex": {
"dataset_name": "agilex",
"state_mean": [
-0.0036545392947090432,
-0.2773659935760079,
0.3147616748061523,
0.3813313179910183,
0.04028575944090457,
0.034888520819083294,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_std": [
0.05763674563578847,
0.2580181064167735,
0.19785840483767897,
0.05020347749331385,
0.054529239104671424,
0.05020521339363586,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_min": [
-0.17447535196940103,
-0.5522612677680121,
-0.3340397516886393,
0.21861712137858072,
-0.09725829230414497,
0.003396739231215583,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_max": [
0.21961932712131077,
0.30613206227620443,
0.5444545321994357,
0.4866888682047526,
0.31486290825737845,
0.3355223337809245,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
]
}
}

View File

@ -0,0 +1,3 @@
[
"agilex"
]

View File

@ -0,0 +1,3 @@
{
"agilex": 100
}

View File

@ -0,0 +1,48 @@
[
"fractal20220817_data",
"jaco_play",
"taco_play",
"berkeley_cable_routing",
"viola",
"berkeley_autolab_ur5",
"toto",
"nyu_door_opening_surprising_effectiveness",
"columbia_cairlab_pusht_real",
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds",
"austin_buds_dataset_converted_externally_to_rlds",
"kuka",
"utokyo_xarm_bimanual_converted_externally_to_rlds",
"stanford_hydra_dataset_converted_externally_to_rlds",
"maniskill_dataset_converted_externally_to_rlds",
"ucsd_kitchen_dataset_converted_externally_to_rlds",
"ucsd_pick_and_place_dataset_converted_externally_to_rlds",
"austin_sailor_dataset_converted_externally_to_rlds",
"austin_sirius_dataset_converted_externally_to_rlds",
"bc_z",
"utokyo_pr2_opening_fridge_converted_externally_to_rlds",
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds",
"utokyo_xarm_pick_and_place_converted_externally_to_rlds",
"berkeley_mvp_converted_externally_to_rlds",
"berkeley_rpt_converted_externally_to_rlds",
"kaist_nonprehensile_converted_externally_to_rlds",
"tokyo_u_lsmo_converted_externally_to_rlds",
"dlr_sara_grid_clamp_converted_externally_to_rlds",
"stanford_robocook_converted_externally_to_rlds",
"imperialcollege_sawyer_wrist_cam",
"iamlab_cmu_pickup_insert_converted_externally_to_rlds",
"utaustin_mutex",
"berkeley_fanuc_manipulation",
"cmu_play_fusion",
"language_table",
"furniture_bench_dataset_converted_externally_to_rlds",
"droid",
"fmb",
"dobbe",
"qut_dexterous_manpulation",
"aloha_mobile",
"aloha_static",
"roboset",
"rh20t",
"calvin",
"bridgev2"
]

View File

@ -0,0 +1,48 @@
{
"fractal20220817_data": 271,
"taco_play": 60,
"jaco_play": 33,
"berkeley_cable_routing": 8,
"nyu_door_opening_surprising_effectiveness": 10,
"viola": 12,
"berkeley_autolab_ur5": 32,
"toto": 32,
"kuka": 50,
"language_table": 100,
"columbia_cairlab_pusht_real": 12,
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 55,
"stanford_hydra_dataset_converted_externally_to_rlds": 24,
"austin_buds_dataset_converted_externally_to_rlds": 7,
"maniskill_dataset_converted_externally_to_rlds": 174,
"furniture_bench_dataset_converted_externally_to_rlds": 71,
"ucsd_kitchen_dataset_converted_externally_to_rlds": 12,
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 37,
"austin_sailor_dataset_converted_externally_to_rlds": 15,
"austin_sirius_dataset_converted_externally_to_rlds": 24,
"bc_z": 208,
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 9,
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 15,
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
"utokyo_xarm_bimanual_converted_externally_to_rlds": 1,
"berkeley_mvp_converted_externally_to_rlds": 22,
"berkeley_rpt_converted_externally_to_rlds": 30,
"kaist_nonprehensile_converted_externally_to_rlds": 14,
"tokyo_u_lsmo_converted_externally_to_rlds": 7,
"dlr_sara_grid_clamp_converted_externally_to_rlds": 1,
"stanford_robocook_converted_externally_to_rlds": 50,
"imperialcollege_sawyer_wrist_cam": 13,
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 25,
"utaustin_mutex": 39,
"berkeley_fanuc_manipulation": 20,
"cmu_play_fusion": 24,
"droid": 303,
"fmb": 42,
"dobbe": 36,
"qut_dexterous_manpulation": 14,
"aloha_mobile": 150,
"aloha_static": 150,
"roboset": 135,
"rh20t": 331,
"calvin": 100,
"bridgev2": 224
}

View File

@ -0,0 +1,126 @@
def _build_state_vec_idx_mapping():
    """Build the name -> index mapping for the unified 128-dim state vector.

    Layout (right side occupies [0, 50), left side mirrors it at +50):
      [0, 10)   arm joint positions        [15, 25)  arm joint velocities
      [10, 15)  gripper joint positions    [25, 30)  gripper joint velocities
      [30, 33)  EEF position xyz           [39, 42)  EEF linear velocity xyz
      [33, 39)  EEF 6D rotation            [42, 45)  EEF angular velocity rpy
      [45, 50)  reserved
      [100, 103) base linear (x, y) and angular velocity
      [103, 128) reserved
    Right-side entries are also exposed without the "right_" prefix, and
    "gripper_open"/"gripper_open_vel" alias gripper joint 0.
    """
    mapping = {}
    # Right arm: every key exists both bare and with a "right_" prefix.
    for prefix in ("", "right_"):
        for i in range(10):
            mapping[f"{prefix}arm_joint_{i}_pos"] = i
            mapping[f"{prefix}arm_joint_{i}_vel"] = i + 15
        for i in range(5):
            mapping[f"{prefix}gripper_joint_{i}_pos"] = i + 10
            mapping[f"{prefix}gripper_joint_{i}_vel"] = i + 25
        # Aliases of gripper joint 0 position / velocity.
        mapping[f"{prefix}gripper_open"] = 10
        mapping[f"{prefix}gripper_open_vel"] = 25
        for offset, axis in enumerate(("x", "y", "z")):
            mapping[f"{prefix}eef_pos_{axis}"] = 30 + offset
            mapping[f"{prefix}eef_vel_{axis}"] = 39 + offset
        for i in range(6):
            mapping[f"{prefix}eef_angle_{i}"] = 33 + i
        for offset, axis in enumerate(("roll", "pitch", "yaw")):
            mapping[f"{prefix}eef_angular_vel_{axis}"] = 42 + offset
    # Left arm: same layout shifted by +50, "left_" prefix only.
    for i in range(10):
        mapping[f"left_arm_joint_{i}_pos"] = i + 50
        mapping[f"left_arm_joint_{i}_vel"] = i + 65
    for i in range(5):
        mapping[f"left_gripper_joint_{i}_pos"] = i + 60
        mapping[f"left_gripper_joint_{i}_vel"] = i + 75
    # Aliases of left gripper joint 0 position / velocity.
    mapping["left_gripper_open"] = 60
    mapping["left_gripper_open_vel"] = 75
    for offset, axis in enumerate(("x", "y", "z")):
        mapping[f"left_eef_pos_{axis}"] = 80 + offset
        mapping[f"left_eef_vel_{axis}"] = 89 + offset
    for i in range(6):
        mapping[f"left_eef_angle_{i}"] = 83 + i
    for offset, axis in enumerate(("roll", "pitch", "yaw")):
        mapping[f"left_eef_angular_vel_{axis}"] = 92 + offset
    # Mobile base velocities.
    mapping["base_vel_x"] = 100
    mapping["base_vel_y"] = 101
    mapping["base_angular_vel"] = 102
    return mapping


STATE_VEC_IDX_MAPPING = _build_state_vec_idx_mapping()

# Total length of the unified state vector (indices above 102 are reserved).
STATE_VEC_LEN = 128

View File

@ -0,0 +1,14 @@
{
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 2,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9
}
}

993
RDT/rdt-export/export.py Normal file
View File

@ -0,0 +1,993 @@
import argparse
import json
import logging
import os
import re
from collections import OrderedDict
from dataclasses import dataclass
from time import time
from typing import Optional

import cv2
import h5py
import numpy as np
import onnx
import torch
import torch.nn as nn
import torch.nn.functional as F
import yaml
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler
from PIL import Image as PImage
from torchvision import transforms

from configs.state_vec import STATE_VEC_IDX_MAPPING
from models.hub_mixin import CompatiblePyTorchModelHubMixin
from models.multimodal_encoder.siglip_encoder import SiglipVisionTower
from models.multimodal_encoder.t5_encoder import T5Embedder
from models.rdt.blocks import (FinalLayer, RDTBlock, TimestepEmbedder,
                               get_1d_sincos_pos_embed_from_grid,
                               get_multimodal_cond_pos_embed)
from scripts.agilex_model import create_model
# Verbose (DEBUG) logging with timestamped records for the export pipeline.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S')
# Module-level logger used throughout this script.
logger = logging.getLogger("RDT_EXPORT")
# Disable Weights & Biases reporting during export runs.
os.environ["WANDB_MODE"] = "disabled"
@dataclass
class ExportConfig:
    """Configuration for the RDT export / calibration-dump pipeline.

    Fields typed ``Optional[str]`` default to ``None`` and are expected to
    be filled in by the caller (e.g. from CLI arguments) before use; the
    previous annotations declared them as plain ``str`` while defaulting
    to ``None``, which was incorrect.
    """
    # Identifier of the task whose data is exported.
    task_id: Optional[str] = None
    # Directory where exported artifacts / calibration dumps are written.
    output_path: Optional[str] = None
    # Path to the pretrained RDT checkpoint to export.
    model_path: Optional[str] = None
    # Number of calibration samples to dump.
    calibration_num: int = 100
    # Number of language calibration samples to dump.
    lang_calibration_num: int = 1
    # Path to the dataset used for calibration.
    dataset_path: Optional[str] = None
    # CUDA device index, kept as a string (e.g. for CUDA_VISIBLE_DEVICES).
    gpu_id: str = "0"
    # Target architecture for the exported model — presumably a compiler
    # march flag; TODO confirm against the export toolchain.
    march: Optional[str] = None
    # Model variant identifier passed to model creation.
    model_type: Optional[str] = None
    # HF name or local path of the pretrained vision encoder.
    pretrained_vision_encoder_name_or_path: Optional[str] = None
    # Control frequency (Hz) fed to the model.
    ctrl_freq: int = 25
    # Device on which calibration tensors are produced.
    cal_data_device: str = "cuda"
# Indices into the unified state vector used for this export: the first six
# right-arm joint positions only (no gripper dimension here — presumably the
# deployed AgileX setup is driven as a 6-DoF arm; confirm against the caller).
AGILEX_STATE_INDICES = [
    STATE_VEC_IDX_MAPPING["right_arm_joint_{}_pos".format(joint)]
    for joint in range(6)
]
def dump_img_adaptor(img_tokens):
    """Dump raw (pre-adaptor) image tokens as a .npy calibration sample.

    The file name encodes the current dataset name and sample counter,
    both taken from module-level globals set by the dump driver.
    """
    global img_adaptor_cal_ws
    global dump_cnt, dump_dataset_name
    tokens_np = img_tokens.float().contiguous().cpu().detach().numpy()
    out_file = os.path.join(
        img_adaptor_cal_ws,
        f"img_adaptor_{dump_dataset_name}_{dump_cnt}.npy")
    np.save(out_file, tokens_np)
def dump_dit(state_action_traj, ctrl_freqs, t, lang_cond, img_cond, lang_attn_mask):
    """Dump one denoising step's DiT inputs as .npy calibration files.

    Each input tensor is moved to host memory and saved in its own
    calibration directory; file names encode the diffusion timestep, the
    current dataset name and the sample counter (module-level globals).

    NOTE(review): the language condition is padded to a fixed 64-token
    length and the boolean attention mask is converted to an additive
    float mask (0.0 = attend, -512.0 = masked) — presumably to match the
    deployed model's fixed input shapes; confirm 64 / -512.0 against the
    export target.
    """
    global dit_cal_path_x, dit_cal_path_freq, dit_cal_path_t
    global dit_cal_path_lang_c, dit_cal_path_img_c, dit_cal_path_lang_mask
    global dump_cnt, dump_dataset_name

    def to_host(tensor):
        # Detach from the autograd graph and move to CPU as float32.
        return tensor.float().contiguous().cpu().detach().numpy()

    step_tag = str(t)
    traj_np = to_host(state_action_traj)
    freq_np = to_host(ctrl_freqs).astype(np.int32).copy()
    t_np = np.expand_dims(to_host(t).astype(np.int32), axis=0).copy()
    lang_np = to_host(lang_cond)
    img_np = to_host(img_cond)
    mask_np = to_host(lang_attn_mask)
    # Pad the (variable-length) language dimension up to 64 tokens.
    pad_rows = 64 - mask_np.shape[1]
    padded_mask = np.pad(mask_np, ((0, 0), (0, pad_rows)), mode="constant")
    mask_float = np.where(padded_mask, 0.0, -512.0).astype(np.float32)
    lang_cond_padded = np.pad(
        lang_np, pad_width=((0, 0), (0, pad_rows), (0, 0)),
        mode="constant", constant_values=0)
    suffix = f"{step_tag}_{dump_dataset_name}_{dump_cnt}.npy"
    np.save(os.path.join(dit_cal_path_x, f"x_{suffix}"), traj_np)
    np.save(os.path.join(dit_cal_path_freq, f"freq_{suffix}"), freq_np)
    np.save(os.path.join(dit_cal_path_t, f"t_{suffix}"), t_np)
    np.save(os.path.join(dit_cal_path_lang_c, f"lang_c_{suffix}"), lang_cond_padded)
    np.save(os.path.join(dit_cal_path_img_c, f"img_{suffix}"), img_np)
    np.save(os.path.join(dit_cal_path_lang_mask, f"lang_mask_{suffix}"), mask_float)
def create_dump_model(args, **kwargs):
    """Instantiate the dump-enabled RDT model, optionally loading weights.

    Args:
        args: model configuration passed through to the constructor.
        **kwargs: forwarded to the constructor. If it contains a
            ``pretrained`` path that points at an existing file, the
            checkpoint is loaded into the freshly built model.

    Returns:
        A ``RoboticDiffusionTransformerModel_Dump`` instance.
    """
    model = RoboticDiffusionTransformerModel_Dump(args, **kwargs)
    checkpoint = kwargs.get("pretrained", None)
    if checkpoint is not None and os.path.isfile(checkpoint):
        model.load_pretrained_weights(checkpoint)
    return model
class RDT_Dump(nn.Module):
    """Replica of the RDT diffusion-transformer backbone used for export.

    Denoises a state-action token sequence conditioned on language and image
    tokens; successive blocks alternate which condition they cross-attend to
    (even blocks: language, odd blocks: image).
    """
    def __init__(self,
                 output_dim=128,
                 horizon=32,
                 hidden_size=1152,
                 depth=28,
                 num_heads=16,
                 max_lang_cond_len=1024,
                 img_cond_len=4096,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        """
        Args:
            output_dim: dimension of each predicted action vector.
            horizon: number of action tokens (prediction horizon).
            hidden_size: transformer embedding width.
            depth: number of RDT blocks.
            num_heads: attention heads per block.
            max_lang_cond_len: maximum language-condition sequence length.
            img_cond_len: fixed image-condition sequence length.
            lang_pos_embed_config: optional multimodal position-embedding
                layout for the language condition; a plain 1D sin-cos
                embedding is used when None.
            img_pos_embed_config: same, for the image condition.
            dtype: dtype all parameters are cast to at the end of init.
        """
        super().__init__()
        self.horizon = horizon
        self.hidden_size = hidden_size
        self.max_lang_cond_len = max_lang_cond_len
        self.img_cond_len = img_cond_len
        self.dtype = dtype
        self.lang_pos_embed_config = lang_pos_embed_config
        self.img_pos_embed_config = img_pos_embed_config
        # Embedders for the diffusion timestep and the control frequency.
        self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
        self.freq_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
        # We will use trainable sin-cos embeddings
        # [timestep; ctrl_freq; state; action] -> horizon + 3 tokens total
        self.x_pos_embed = nn.Parameter(torch.zeros(1, horizon + 3, hidden_size))
        # Language conditions
        self.lang_cond_pos_embed = nn.Parameter(torch.zeros(1, max_lang_cond_len, hidden_size))
        # Image conditions
        self.img_cond_pos_embed = nn.Parameter(torch.zeros(1, img_cond_len, hidden_size))
        self.blocks = nn.ModuleList([RDTBlock(hidden_size, num_heads) for _ in range(depth)])
        self.final_layer = FinalLayer(hidden_size, output_dim)
        self.initialize_weights()
    def initialize_weights(self):
        """Initialize linear layers, position embeddings, and the final layer."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)
        # Initialize pos_embed by sin-cos embedding
        x_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                    mm_cond_lens=OrderedDict([
                                                        ('timestep', 1),
                                                        ('ctrl_freq', 1),
                                                        ('state', 1),
                                                        ('action', self.horizon),
                                                    ]))
        self.x_pos_embed.data.copy_(torch.from_numpy(x_pos_embed).float().unsqueeze(0))
        # Language / image condition embeddings: 1D sin-cos by default,
        # multimodal layout when an explicit config is provided.
        if self.lang_pos_embed_config is None:
            lang_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.max_lang_cond_len))
        else:
            lang_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size, mm_cond_lens=OrderedDict(self.lang_pos_embed_config), embed_modality=False)
        self.lang_cond_pos_embed.data.copy_(torch.from_numpy(lang_cond_pos_embed).float().unsqueeze(0))
        if self.img_pos_embed_config is None:
            img_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.img_cond_len))
        else:
            img_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size, mm_cond_lens=OrderedDict(self.img_pos_embed_config), embed_modality=False)
        self.img_cond_pos_embed.data.copy_(torch.from_numpy(img_cond_pos_embed).float().unsqueeze(0))
        # Initialize timestep and control freq embedding MLP
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[2].weight, std=0.02)
        # Initialize the final layer: zero-out the final linear layer
        nn.init.constant_(self.final_layer.ffn_final.fc2.weight, 0)
        nn.init.constant_(self.final_layer.ffn_final.fc2.bias, 0)
        # Move all the params to given data type:
        self.to(self.dtype)
    def forward(self, x, freq, t, lang_c, img_c, lang_mask=None, img_mask=None):
        """Denoise one step.

        Args:
            x: state + action tokens, (B, horizon + 1, D).
            freq: control frequency per sample, (B,).
            t: diffusion timestep, (B,) or (1,) broadcast over the batch.
            lang_c: language condition tokens, variable length up to
                max_lang_cond_len.
            img_c: image condition tokens, length img_cond_len.
            lang_mask, img_mask: optional attention masks for the conditions.

        Returns:
            Predicted action tokens, (B, horizon, output_dim).
        """
        t = self.t_embedder(t).unsqueeze(1)  # (B, 1, D) or (1, 1, D)
        freq = self.freq_embedder(freq).unsqueeze(1)  # (B, 1, D)
        # Append timestep to the input tokens
        if t.shape[0] == 1:
            t = t.expand(x.shape[0], -1, -1)
        x = torch.cat([t, freq, x], dim=1)  # (B, horizon + 3, D)
        # Add multimodal position embeddings
        x = x + self.x_pos_embed
        # Note the lang is of variable length
        lang_c = lang_c + self.lang_cond_pos_embed[:, :lang_c.shape[1]]
        img_c = img_c + self.img_cond_pos_embed
        # Forward pass: alternate language / image cross-conditioning
        conds = [lang_c, img_c]
        masks = [lang_mask, img_mask]
        for i, block in enumerate(self.blocks):
            c, mask = conds[i % 2], masks[i % 2]
            x = block(x, c, mask)  # (B, horizon + 3, D)
        # Inject the language condition at the final layer
        x = self.final_layer(x)  # (B, horizon + 3, out_channels)
        # Only preserve the action tokens
        x = x[:, -self.horizon:]
        return x
class RDTRunner_Dump(nn.Module,
CompatiblePyTorchModelHubMixin,
repo_url="https://huggingface.co/robotics-diffusion-transformer/rdt-1b"):
    def __init__(self,
                 *,
                 action_dim,
                 pred_horizon,
                 config,
                 lang_token_dim,
                 img_token_dim,
                 state_token_dim,
                 max_lang_cond_len,
                 img_cond_len,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        """Build the diffusion model, condition adaptors, and noise schedulers.

        Args:
            action_dim: dimension of each action vector.
            pred_horizon: number of predicted action steps.
            config: dict with 'rdt', adaptor, and 'noise_scheduler' sections
                (see the model YAML config).
            lang_token_dim / img_token_dim / state_token_dim: raw token
                dims of the language, image, and state inputs.
            max_lang_cond_len: maximum language-condition sequence length.
            img_cond_len: fixed image-condition sequence length.
            lang_pos_embed_config / img_pos_embed_config: optional position
                embedding layouts forwarded to RDT_Dump.
            dtype: parameter dtype for the diffusion model.
        """
        super(RDTRunner_Dump, self).__init__()
        # Create diffusion model
        hidden_size = config['rdt']['hidden_size']
        self.model = RDT_Dump(
            output_dim=action_dim,
            horizon=pred_horizon,
            hidden_size=hidden_size,
            depth=config['rdt']['depth'],
            num_heads=config['rdt']['num_heads'],
            max_lang_cond_len=max_lang_cond_len,
            img_cond_len=img_cond_len,
            lang_pos_embed_config=lang_pos_embed_config,
            img_pos_embed_config=img_pos_embed_config,
            dtype=dtype,
        )
        # Create adpators for various conditional inputs
        self.lang_adaptor = self.build_condition_adapter(config['lang_adaptor'], in_features=lang_token_dim, out_features=hidden_size)
        self.img_adaptor = self.build_condition_adapter(config['img_adaptor'], in_features=img_token_dim, out_features=hidden_size)
        # A `state` refers to an action or a proprioception vector
        self.state_adaptor = self.build_condition_adapter(
            config['state_adaptor'],
            in_features=state_token_dim * 2,  # state + state mask (indicator)
            out_features=hidden_size)
        # Create the noise scheduler
        # DDPM is the training scheduler; DPM-Solver++ is used for sampling.
        noise_scheduler_config = config['noise_scheduler']
        self.noise_scheduler = DDPMScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
            clip_sample=noise_scheduler_config['clip_sample'],
        )
        self.noise_scheduler_sample = DPMSolverMultistepScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
        )
        self.num_train_timesteps = noise_scheduler_config['num_train_timesteps']
        self.num_inference_timesteps = noise_scheduler_config['num_inference_timesteps']
        self.prediction_type = noise_scheduler_config['prediction_type']
        self.pred_horizon = pred_horizon
        self.action_dim = action_dim
        # Report the total trainable parameter count of model + adaptors.
        print("Diffusion params: %e" %
              sum([p.numel() for p in self.model.parameters()] + [p.numel() for p in self.lang_adaptor.parameters()] +
                  [p.numel()
                   for p in self.img_adaptor.parameters()] + [p.numel() for p in self.state_adaptor.parameters()]))
def build_condition_adapter(self, projector_type, in_features, out_features):
projector = None
if projector_type == 'linear':
projector = nn.Linear(in_features, out_features)
else:
mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
if mlp_gelu_match:
mlp_depth = int(mlp_gelu_match.group(1))
modules = [nn.Linear(in_features, out_features)]
for _ in range(1, mlp_depth):
modules.append(nn.GELU(approximate="tanh"))
modules.append(nn.Linear(out_features, out_features))
projector = nn.Sequential(*modules)
if projector is None:
raise ValueError(f'Unknown projector type: {projector_type}')
return projector
def adapt_conditions(self, lang_tokens, img_tokens, state_tokens):
adpated_lang = self.lang_adaptor(lang_tokens)
dump_img_adaptor(img_tokens)
adpated_img = self.img_adaptor(img_tokens)
adpated_state = self.state_adaptor(state_tokens)
return adpated_lang, adpated_img, adpated_state
def conditional_sample(self, lang_cond, lang_attn_mask, img_cond, state_traj, action_mask, ctrl_freqs):
    """Sample an action trajectory by reverse diffusion with the DPM-Solver
    multistep scheduler, conditioned on language, image, and state features.

    Args:
        lang_cond: adapted language condition tokens.
        lang_attn_mask: boolean mask over valid language tokens.
        img_cond: adapted image condition tokens.
        state_traj: adapted state token trajectory.
        action_mask: 0-1 float availability mask, broadcast over the horizon.
        ctrl_freqs: per-sample control frequency conditioning.

    Returns:
        (batch, pred_horizon, action_dim) tensor of sampled actions with
        invalid action dimensions zeroed by `action_mask`.
    """
    device = state_traj.device
    dtype = state_traj.dtype
    # Start the reverse process from pure Gaussian noise.
    noisy_action = torch.randn(size=(state_traj.shape[0], self.pred_horizon, self.action_dim), dtype=dtype, device=device)
    action_mask = action_mask.expand(-1, self.pred_horizon, -1)
    # Set step values
    self.noise_scheduler_sample.set_timesteps(self.num_inference_timesteps)
    for t in self.noise_scheduler_sample.timesteps:
        # Prepare state-action trajectory: append the mask channel-wise,
        # project through the state adaptor, and prefix the state tokens.
        action_traj = torch.cat([noisy_action, action_mask], dim=2)
        action_traj = self.state_adaptor(action_traj)
        state_action_traj = torch.cat([state_traj, action_traj], dim=1)
        # dump: export hook recording the DiT inputs for quantization calibration
        dump_dit(state_action_traj, ctrl_freqs, t, lang_cond, img_cond, lang_attn_mask)
        # Predict the model output
        model_output = self.model(state_action_traj,
                                  ctrl_freqs,
                                  t.unsqueeze(-1).to(device),
                                  lang_cond,
                                  img_cond,
                                  lang_mask=lang_attn_mask)
        # Compute previous actions: x_t -> x_t-1
        noisy_action = self.noise_scheduler_sample.step(model_output, t, noisy_action).prev_sample
        # Keep the running sample in the conditioning dtype (scheduler may upcast).
        noisy_action = noisy_action.to(state_traj.dtype)
    # Finally apply the action mask to mask invalid action dimensions
    noisy_action = noisy_action * action_mask
    return noisy_action
def compute_loss(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_gt, action_mask,
                 ctrl_freqs) -> torch.Tensor:
    """Diffusion training loss (MSE against the noise or the clean actions).

    lang_tokens: (batch_size, lang_len, lang_token_dim)
    lang_attn_mask: (batch_size, lang_len), True-False bool mask of valid
        language tokens.
    img_tokens: (batch_size, img_len, img_token_dim)
    state_tokens: (batch_size, 1, state_token_dim)
    action_gt: (batch_size, horizon, state_token_dim), ground-truth actions
    action_mask: (batch_size, 1, state_token_dim), a 0-1 **float** tensor.
    ctrl_freqs: (batch_size,), control frequency for each sample.
    return: loss_value, a scalar tensor
    """
    batch_size = lang_tokens.shape[0]
    device = lang_tokens.device
    # Forward diffusion: corrupt the clean actions with Gaussian noise at a
    # random timestep per sample.
    eps = torch.randn(action_gt.shape, dtype=action_gt.dtype, device=device)
    timesteps = torch.randint(0, self.num_train_timesteps, (batch_size, ), device=device).long()
    noised_actions = self.noise_scheduler.add_noise(action_gt, eps, timesteps)
    # Build the [state; noisy actions] sequence, then append the
    # availability mask along the channel dimension.
    traj = torch.cat([state_tokens, noised_actions], dim=1)
    expanded_mask = action_mask.expand(-1, traj.shape[1], -1)
    traj = torch.cat([traj, expanded_mask], dim=2)
    # Project every condition into the model's hidden size.
    lang_cond, img_cond, traj = self.adapt_conditions(lang_tokens, img_tokens, traj)
    # Predict the denoised result.
    pred = self.model(traj, ctrl_freqs, timesteps, lang_cond, img_cond, lang_mask=lang_attn_mask)
    pred_type = self.prediction_type
    if pred_type == 'epsilon':
        target = eps
    elif pred_type == 'sample':
        target = action_gt
    else:
        raise ValueError(f"Unsupported prediction type {pred_type}")
    return F.mse_loss(pred, target)
def predict_action(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_mask, ctrl_freqs):
    """Sample an action chunk conditioned on language, images, and state.

    lang_tokens: (batch_size, lang_len, lang_token_dim)
    lang_attn_mask: (batch_size, lang_len), True-False bool mask of valid
        language tokens.
    img_tokens: (batch_size, img_len, img_token_dim)
    state_tokens: (batch_size, 1, state_token_dim)
    action_mask: (batch_size, 1, action_dim), a 0-1 **float** tensor.
    ctrl_freqs: (batch_size,), control frequency for each sample.
    return: (batch_size, horizon, action_dim), predicted action sequence
    """
    # Append the availability mask to the state token channel-wise, then
    # project every condition into the hidden size.
    masked_state = torch.cat([state_tokens, action_mask], dim=2)
    lang_cond, img_cond, state_traj = self.adapt_conditions(lang_tokens, img_tokens, masked_state)
    # Reverse-diffusion sampling.
    return self.conditional_sample(lang_cond, lang_attn_mask, img_cond, state_traj, action_mask, ctrl_freqs)
def forward(self, *args, **kwargs) -> torch.Tensor:
    """Training entry point: alias for `compute_loss` (see its docstring for arguments)."""
    return self.compute_loss(*args, **kwargs)
class RoboticDiffusionTransformerModel_Dump(object):
    """RDT policy wrapper used during BPU export to dump calibration data.

    Encodes camera images with a frozen SigLIP vision tower, packs the robot
    proprioception into the unified state vector, and runs the diffusion
    policy (`RDTRunner_Dump`) to predict an action chunk. The text encoder is
    deliberately not instantiated (see `__init__`); language conditioning is
    supplied as precomputed embeddings.
    """

    def __init__(
        self,
        args,
        device="cuda",
        dtype=torch.bfloat16,
        image_size=None,
        control_frequency=25,
        pretrained=None,
        pretrained_vision_encoder_name_or_path=None,
    ):
        """
        Args:
            args: parsed config dict (contents of base.yaml).
            device: device the policy and vision tower run on.
            dtype: weight dtype used for inference.
            image_size: optional resize target applied to each input image.
            control_frequency: control frequency fed to the policy.
            pretrained: checkpoint file or pretrained directory for the policy.
            pretrained_vision_encoder_name_or_path: SigLIP weights location.
        """
        self.args = args
        self.dtype = dtype
        self.image_size = image_size
        self.device = device
        self.control_frequency = control_frequency
        # We do not use the text encoder due to limited GPU memory
        # self.text_tokenizer, self.text_model = self.get_text_encoder(pretrained_text_encoder_name_or_path)
        self.image_processor, self.vision_model = self.get_vision_encoder(pretrained_vision_encoder_name_or_path)
        self.policy = self.get_policy(pretrained)
        self.reset()

    def get_policy(self, pretrained):
        """Build the RDT policy: fresh init when `pretrained` is None or a single
        checkpoint file, otherwise load from a pretrained directory."""
        # Initialize model with arguments
        if pretrained is None or os.path.isfile(pretrained):
            img_cond_len = (self.args["common"]["img_history_size"] * self.args["common"]["num_cameras"] *
                            self.vision_model.num_patches)
            _model = RDTRunner_Dump(
                action_dim=self.args["common"]["state_dim"],
                pred_horizon=self.args["common"]["action_chunk_size"],
                config=self.args["model"],
                lang_token_dim=self.args["model"]["lang_token_dim"],
                img_token_dim=self.args["model"]["img_token_dim"],
                state_token_dim=self.args["model"]["state_token_dim"],
                max_lang_cond_len=self.args["dataset"]["tokenizer_max_length"],
                img_cond_len=img_cond_len,
                img_pos_embed_config=[
                    # No initial pos embed in the last grid size
                    # since we've already done in ViT
                    (
                        "image",
                        (
                            self.args["common"]["img_history_size"],
                            self.args["common"]["num_cameras"],
                            # negative size marks "no initial pos embed" for this axis
                            -self.vision_model.num_patches,
                        ),
                    ),
                ],
                lang_pos_embed_config=[
                    # Similarly, no initial pos embed for language
                    ("lang", -self.args["dataset"]["tokenizer_max_length"]),
                ],
                dtype=self.dtype,
            )
        else:
            _model = RDTRunner_Dump.from_pretrained(pretrained)
        return _model

    def get_text_encoder(self, pretrained_text_encoder_name_or_path):
        """Build the T5 tokenizer + encoder (unused by default, see `__init__`)."""
        text_embedder = T5Embedder(
            from_pretrained=pretrained_text_encoder_name_or_path,
            model_max_length=self.args["dataset"]["tokenizer_max_length"],
            device=self.device,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
        return tokenizer, text_encoder

    def get_vision_encoder(self, pretrained_vision_encoder_name_or_path):
        """Build the SigLIP vision tower and its image processor."""
        vision_encoder = SiglipVisionTower(vision_tower=pretrained_vision_encoder_name_or_path, args=None)
        image_processor = vision_encoder.image_processor
        return image_processor, vision_encoder

    def reset(self):
        """Put all sub-models in eval mode and move them to the target device/dtype."""
        device = self.device
        weight_dtype = self.dtype
        self.policy.eval()
        # self.text_model.eval()
        self.vision_model.eval()
        self.policy = self.policy.to(device, dtype=weight_dtype)
        # self.text_model = self.text_model.to(device, dtype=weight_dtype)
        self.vision_model = self.vision_model.to(device, dtype=weight_dtype)

    def load_pretrained_weights(self, pretrained=None):
        """Load policy weights from a .pt (DeepSpeed-style 'module' dict) or
        .safetensors checkpoint. No-op when `pretrained` is None."""
        if pretrained is None:
            return
        print(f"Loading weights from {pretrained}")
        filename = os.path.basename(pretrained)
        if filename.endswith(".pt"):
            checkpoint = torch.load(pretrained)
            self.policy.load_state_dict(checkpoint["module"])
        elif filename.endswith(".safetensors"):
            from safetensors.torch import load_model
            load_model(self.policy, pretrained)
        else:
            raise NotImplementedError(f"Unknown checkpoint format: {pretrained}")

    def encode_instruction(self, instruction, device="cuda"):
        """Encode an instruction string into language embeddings.

        NOTE(review): `self.text_tokenizer`/`self.text_model` are never created
        in `__init__` (the text encoder is commented out), so this method only
        works if they are attached externally — confirm before calling.
        """
        tokens = self.text_tokenizer(instruction, return_tensors="pt", padding="longest",
                                     truncation=True)["input_ids"].to(device)
        tokens = tokens.view(1, -1)
        with torch.no_grad():
            pred = self.text_model(tokens).last_hidden_state.detach()
        return pred

    def _format_joint_to_state(self, joints):
        """Pack raw joint values into the unified state vector and build the
        per-dimension availability mask.

        Returns (state, state_elem_mask) of shapes
        (B, N, state_token_dim) and (B, state_token_dim).
        """
        # Rescale the gripper to the range of [0, 1]
        # NOTE(review): all 6 channels are divided by 180 here — confirm this
        # matches the robot's joint/gripper ranges.
        joints = joints / torch.tensor(
            [[[180, 180, 180, 180, 180, 180]]],
            device=joints.device,
            dtype=joints.dtype,
        )
        B, N, _ = joints.shape
        state = torch.zeros(
            (B, N, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        # Fill into the unified state vector
        state[:, :, AGILEX_STATE_INDICES] = joints
        # Assemble the mask indicating each dimension's availability
        state_elem_mask = torch.zeros(
            (B, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        state_elem_mask[:, AGILEX_STATE_INDICES] = 1
        return state, state_elem_mask

    def _unformat_action_to_joint(self, action):
        """Extract the robot's joint channels from the unified action vector and
        undo the [0, 1] rescaling applied in `_format_joint_to_state`."""
        action_indices = AGILEX_STATE_INDICES
        joints = action[:, :, action_indices]
        # Rescale the gripper back to the action range
        # Note that the action range and proprioception range are different
        # for Mobile ALOHA robot
        joints = joints * torch.tensor(
            [[[180, 180, 180, 180, 180, 180]]],
            device=joints.device,
            dtype=joints.dtype,
        )
        return joints

    @torch.no_grad()
    def step(self, proprio, images, text_embeds):
        """Predict one action chunk.

        Args:
            proprio: proprioception tensor for the current step (batched to
                (1, 1, J) internally).
            images: list of camera frames ordered (3 cameras x 2 history
                steps); entries may be None (replaced by a background image),
                numpy arrays, or PIL images.
            text_embeds: precomputed language embeddings.

        Returns:
            float32 joint-space action chunk, (1, horizon, J).
        """
        device = self.device
        dtype = self.dtype
        # The background image used for padding
        background_color = np.array([int(x * 255) for x in self.image_processor.image_mean], dtype=np.uint8).reshape(1, 1, 3)
        background_image = (np.ones(
            (
                self.image_processor.size["height"],
                self.image_processor.size["width"],
                3,
            ),
            dtype=np.uint8,
        ) * background_color)
        # Preprocess the images by order and encode them
        image_tensor_list = []
        for image in images:
            if image is None:
                # Replace it with the background image
                image = PImage.fromarray(background_image)
            else:
                # Convert numpy array to PIL Image if needed
                if isinstance(image, np.ndarray):
                    image = PImage.fromarray(image)
            if self.image_size is not None:
                # FIX: was `self.data_args.image_size`, but this class has no
                # `data_args` attribute (only `self.image_size` from __init__),
                # so any non-None image_size raised AttributeError.
                image = transforms.Resize(self.image_size)(image)
            if self.args["dataset"].get("auto_adjust_image_brightness", False):
                # Brighten very dark frames (mean brightness <= 0.15).
                pixel_values = list(image.getdata())
                average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
                if average_brightness <= 0.15:
                    image = transforms.ColorJitter(brightness=(1.75, 1.75))(image)
            if self.args["dataset"].get("image_aspect_ratio", "pad") == "pad":

                def expand2square(pil_img, background_color):
                    # Pad the shorter side with the processor's mean color so the
                    # image becomes square without distortion.
                    width, height = pil_img.size
                    if width == height:
                        return pil_img
                    elif width > height:
                        result = PImage.new(pil_img.mode, (width, width), background_color)
                        result.paste(pil_img, (0, (width - height) // 2))
                        return result
                    else:
                        result = PImage.new(pil_img.mode, (height, height), background_color)
                        result.paste(pil_img, ((height - width) // 2, 0))
                        return result

                image = expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean))
            image = self.image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
            image_tensor_list.append(image)
        image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype)
        image_embeds = self.vision_model(image_tensor).detach()
        # Flatten all cameras/history into one token sequence of width hidden_size.
        image_embeds = image_embeds.reshape(-1, self.vision_model.hidden_size).unsqueeze(0)
        # Prepare the proprioception states and the control frequency
        joints = proprio.to(device).unsqueeze(0)  # (1, 1, 14)
        states, state_elem_mask = self._format_joint_to_state(joints)  # (1, 1, 128), (1, 128)
        states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to(device, dtype=dtype)
        states = states[:, -1:, :]  # (1, 1, 128)
        ctrl_freqs = torch.tensor([self.control_frequency]).to(device)
        text_embeds = text_embeds.to(device, dtype=dtype)
        # Predict the next action chunk given the inputs
        trajectory = self.policy.predict_action(
            lang_tokens=text_embeds,
            lang_attn_mask=torch.ones(text_embeds.shape[:2], dtype=torch.bool, device=text_embeds.device),
            img_tokens=image_embeds,
            state_tokens=states,
            action_mask=state_elem_mask.unsqueeze(1),
            ctrl_freqs=ctrl_freqs,
        )
        trajectory = self._unformat_action_to_joint(trajectory).to(torch.float32)
        return trajectory
def get_training_samples(data_dirs, num_samples=5, instructions_per_episode=1):
    """Randomly draw calibration samples from one or more HDF5 episode datasets.

    Args:
        data_dirs: a single directory path (str) or a list of directory paths;
            each is walked recursively for ``*.hdf5`` episode files.
        num_samples: total number of samples to generate across all directories.
        instructions_per_episode: number of instruction variants per episode;
            one index in ``[0, instructions_per_episode)`` is drawn per sample.

    Returns:
        List of dicts with keys ``multi_cam_images`` (two history frames per
        camera), ``joints`` (action at the sampled step), ``lang_embed``,
        ``lang_str``, ``source`` and ``step``. May be shorter than
        ``num_samples`` if the datasets yield too few usable episodes.
    """
    training_samples = []
    # Handle both single directory and list of directories
    if isinstance(data_dirs, str):
        data_dirs = [data_dirs]
    logger.info(f"Get Training Data From: {len(data_dirs)} dataset(s).")
    # First, collect all available episode files from all directories
    episode_files = []
    for data_dir in data_dirs:
        if not os.path.isdir(data_dir):
            logger.warning(f"Directory not found: {data_dir}, skipping")
            continue
        for root, _dirs, files in os.walk(data_dir):
            for file in files:
                if file.endswith('.hdf5'):
                    episode_files.append(os.path.join(root, file))
    if not episode_files:
        logger.warning("No episode files found in the provided directories")
        return training_samples
    logger.info(f"Found {len(episode_files)} episode files across all datasets.")
    # FIX: bound the sampling loop. The original `while` retried forever when
    # every episode was unreadable or had fewer than 2 steps.
    max_attempts = max(num_samples, 1) * 20
    attempts = 0
    while len(training_samples) < num_samples and attempts < max_attempts:
        attempts += 1
        # Randomly select an episode file
        file_path = np.random.choice(episode_files)
        try:
            with h5py.File(file_path, 'r') as f:
                observations = f['observations']
                actions = f['action'][:]
                images = observations['images']
                qpos = observations['qpos'][:]
                num_steps = len(qpos)
                if num_steps <= 1:
                    # Need at least 2 steps to assemble the 2-frame image history.
                    continue
                # Language embedding / instruction text (both optional)
                lang_step_idx = int(np.random.randint(0, max(instructions_per_episode, 1)))
                instructions_dir = os.path.join(os.path.dirname(file_path), "instructions")
                lang_embed, lang_str = None, None
                lang_embed_path = os.path.join(instructions_dir, f"lang_embed_{lang_step_idx}.pt")
                if os.path.exists(lang_embed_path):
                    try:
                        lang_embed = torch.load(lang_embed_path, map_location="cpu")
                    except Exception as e:
                        logger.error(f"Error reading {lang_embed_path}: {e}")
                lang_str_path = os.path.join(instructions_dir, f"txt_lang_embed_{lang_step_idx}.txt")
                if os.path.exists(lang_str_path):
                    try:
                        with open(lang_str_path, "r", encoding="utf-8") as tf:
                            lang_str = tf.read().strip()
                    except Exception as e:
                        logger.error(f"Error reading {lang_str_path}: {e}")
                lang_str = lang_str or ""
                # Draw a random timestep and gather a 2-frame history per camera
                step_idx = np.random.randint(0, num_steps)
                multi_cam_images = {}
                # Use the first cam_high frame to determine the image resolution
                ref_img = cv2.imdecode(np.frombuffer(images['cam_high'][0], np.uint8), cv2.IMREAD_COLOR)
                IMG_HEIGHT, IMG_WIDTH = ref_img.shape[:2]
                ground_image = np.zeros((IMG_HEIGHT, IMG_WIDTH, 3), dtype=np.uint8)
                for cam_name in ['cam_high', 'cam_left_wrist', 'cam_right_wrist']:
                    if cam_name in images:
                        cam_images = []
                        # Decode the previous frame (if any) and the current frame
                        for i in range(max(step_idx - 1, 0), step_idx + 1):
                            img = cv2.imdecode(np.frombuffer(images[cam_name][i], np.uint8), cv2.IMREAD_COLOR)
                            cam_images.append(img)
                        if len(cam_images) < 2:
                            # step_idx == 0: duplicate the single available frame
                            cam_images = [cam_images[0]] * 2
                        multi_cam_images[cam_name] = cam_images
                    else:
                        # Missing camera: substitute an all-black frame pair
                        multi_cam_images[cam_name] = [ground_image] * 2
                training_samples.append({
                    'multi_cam_images': multi_cam_images,
                    'joints': actions[step_idx],
                    'lang_embed': lang_embed,
                    'lang_str': lang_str,
                    'source': file_path,
                    'step': step_idx
                })
                logger.debug(f"TimeStep: {step_idx}, Sample: {file_path}")
        except Exception as e:
            logger.error(f"Failed: {file_path} : {e}")
            continue
    if len(training_samples) < num_samples:
        logger.warning(f"Only collected {len(training_samples)}/{num_samples} samples after {attempts} attempts.")
    logger.info(f"Total Num: {len(training_samples)}.")
    return training_samples
def main(config_path):
    """Drive the full RDT export pipeline from a JSON config file.

    Steps: load config -> lay out output workspaces -> build a dump model and
    generate quantization calibration data from the datasets -> rebuild a CPU
    model and export the image/state/language adaptors and the DiT core to ONNX.
    """
    with open(config_path, "r") as f:
        cfg = json.load(f)
    export_info = cfg.get("export", {})
    opt = ExportConfig(
        task_id=cfg.get("task_id"),
        output_path=os.path.join(export_info.get("output_path", "."), cfg.get("task_id", "")),
        model_path=export_info.get("model_path"),
        calibration_num=export_info.get("calibration_num", 100),
        dataset_path=export_info.get("dataset_path"),
        gpu_id=cfg.get("gpu_id", "0"),
        march=export_info.get("march"),
        model_type=export_info.get("model_type"),
        # NOTE(review): hard-coded, machine-specific weights path — consider
        # moving into the JSON config.
        pretrained_vision_encoder_name_or_path="/home/qi.xiong/DualArm/Work_Docker/RDT/weights/siglip-so400m-patch14-384",
        ctrl_freq=export_info.get("ctrl_freq", 25),
        cal_data_device=cfg.get("cal_data_device", "cuda"),
        lang_calibration_num=export_info.get("lang_calibration_num", 1)
    )
    if opt.model_type not in ["170M", "1B"]:
        raise ValueError(f"RDT ONLY SUPPORT 170M AND 1B, BUT GOT {opt.model_type}")
    logger.info(f"Export config loaded: {opt}")
    os.makedirs(opt.output_path, exist_ok=True)
    # Prepare output workspace
    ## BPU_RDT_Policy: copy the size-matched base config next to the exported models
    bpu_rdt_name = "BPU_RDT_Policy_170M" if opt.model_type == "170M" else "BPU_RDT_Policy_1B"
    bpu_rdt_path = os.path.join(opt.output_path, bpu_rdt_name)
    os.makedirs(bpu_rdt_path, exist_ok=True)
    os.system(f"cp configs/base_{opt.model_type}.yaml {bpu_rdt_path}/base.yaml")
    rdt_config_path = os.path.join(bpu_rdt_path, "base.yaml")
    ## Test_Datas
    test_data_name = "test_data"
    test_data_path = os.path.join(opt.output_path, test_data_name)
    os.makedirs(test_data_path, exist_ok=True)
    ## instruction: one sub-directory per dataset
    instruction_ws_name = "instructions"
    instruction_ws_path = os.path.join(opt.output_path, instruction_ws_name)
    os.makedirs(instruction_ws_path, exist_ok=True)
    for name in os.listdir(opt.dataset_path):
        os.makedirs(os.path.join(instruction_ws_path, name), exist_ok=True)
    ## image adaptor
    # Globals are read by the dump hooks (dump_img_adaptor etc.) during the
    # calibration pass below.
    global img_adaptor_cal_ws
    img_adaptor_ws_name = "img_adaptor_WorkSpace"
    img_adaptor_cal_name = "rdt_image_adaptor_calibration"
    img_adaptor_name = "rdt_image_adaptor.onnx"
    img_adaptor_config_name = "config.yaml"
    img_adaptor_ws = os.path.join(opt.output_path, img_adaptor_ws_name)
    img_adaptor_path = os.path.join(img_adaptor_ws, img_adaptor_name)
    img_adaptor_cal_ws = os.path.join(img_adaptor_ws, img_adaptor_cal_name)
    os.makedirs(img_adaptor_ws, exist_ok=True)
    os.makedirs(img_adaptor_cal_ws, exist_ok=True)
    ## action adaptor: two fixed-shape exports (state token and action trajectory)
    state_adaptor_name1 = "rdt_state_adaptor_1x1x256.onnx"
    state_adaptor_path1 = os.path.join(opt.output_path, bpu_rdt_name, state_adaptor_name1)
    state_adaptor_name2 = "rdt_state_adaptor_1x64x256.onnx"
    state_adaptor_path2 = os.path.join(opt.output_path, bpu_rdt_name, state_adaptor_name2)
    ## lang adaptor
    lang_adaptor_name = "rdt_lang_adaptor.onnx"
    lang_adaptor_path = os.path.join(opt.output_path, bpu_rdt_name, lang_adaptor_name)
    ## DiT Policy
    dit_ws_name = "DiT_WorkSpace"
    dit_cal_name = "rdt_dit_calibration"
    dit_name = "rdt_dit.onnx"
    dit_config_name = "config.yaml"
    dit_json_name = "quant_config.json"
    dit_ws = os.path.join(opt.output_path, dit_ws_name)
    dit_path = os.path.join(dit_ws, dit_name)
    dit_cal_ws = os.path.join(dit_ws, dit_cal_name)
    os.makedirs(dit_ws, exist_ok=True)
    os.makedirs(dit_cal_ws, exist_ok=True)
    # One calibration sub-directory per DiT input tensor (read by dump_dit)
    global dit_cal_path_x, dit_cal_path_freq, dit_cal_path_t, dit_cal_path_lang_c, dit_cal_path_img_c, dit_cal_path_lang_mask
    dit_cal_path_x = os.path.join(dit_cal_ws, "x")
    os.makedirs(dit_cal_path_x, exist_ok=True)
    dit_cal_path_freq = os.path.join(dit_cal_ws, "freq")
    os.makedirs(dit_cal_path_freq, exist_ok=True)
    dit_cal_path_t = os.path.join(dit_cal_ws, "t")
    os.makedirs(dit_cal_path_t, exist_ok=True)
    dit_cal_path_lang_c = os.path.join(dit_cal_ws, "lang_c")
    os.makedirs(dit_cal_path_lang_c, exist_ok=True)
    dit_cal_path_img_c = os.path.join(dit_cal_ws, "img_c")
    os.makedirs(dit_cal_path_img_c, exist_ok=True)
    dit_cal_path_lang_mask = os.path.join(dit_cal_ws, "lang_mask")
    os.makedirs(dit_cal_path_lang_mask, exist_ok=True)
    # Prepare calibration data: build the dump-instrumented model
    with open(rdt_config_path, "r") as f:
        rdt_config = yaml.safe_load(f)
    dump_model = create_dump_model(
        args=rdt_config,
        dtype=torch.float32,
        pretrained=opt.model_path,
        pretrained_vision_encoder_name_or_path=opt.pretrained_vision_encoder_name_or_path,
        control_frequency=opt.ctrl_freq,
        device=opt.cal_data_device
    )
    # Load training data from all datasets
    global dump_cnt, dump_dataset_name
    test_data_cnt = 0
    # Collect all dataset paths
    all_dataset_paths = []
    for dump_dataset_name in os.listdir(opt.dataset_path):
        dump_dataset_path = os.path.join(opt.dataset_path, dump_dataset_name)
        if os.path.isdir(dump_dataset_path):
            all_dataset_paths.append(dump_dataset_path)
    # Get training samples from all datasets together
    training_samples = get_training_samples(all_dataset_paths, num_samples=opt.calibration_num, instructions_per_episode=opt.lang_calibration_num)
    if len(training_samples) == 0:
        logger.warning("No training samples found, skipping calibration data generation")
    else:
        # Only process up to the number of samples we actually have
        num_samples_to_process = min(len(training_samples), opt.calibration_num)
        for dump_cnt in range(num_samples_to_process):
            sample = training_samples[dump_cnt]
            # Extract dataset name from the sample's source path
            sample_source = sample['source']
            dump_dataset_name = os.path.basename(os.path.dirname(os.path.dirname(sample_source)))
            # Persist the instruction embedding + text for later deployment use
            instruction_emb = {
                "lang_cond": sample["lang_embed"].float().cpu(),
                "lang_str": sample["lang_str"]
            }
            ins_str_name = sample["lang_str"].replace(" ", "_") + "__"
            torch.save(instruction_emb, os.path.join(instruction_ws_path, dump_dataset_name, f"{ins_str_name}.pt"))
            # 3 cameras x 2 history frames, in the order the policy expects
            image_arrs = [
                sample['multi_cam_images']['cam_high'][0],
                sample['multi_cam_images']['cam_right_wrist'][0],
                sample['multi_cam_images']['cam_left_wrist'][0],
                sample['multi_cam_images']['cam_high'][1],
                sample['multi_cam_images']['cam_right_wrist'][1],
                sample['multi_cam_images']['cam_left_wrist'][1],
            ]
            test_data_cnt += 1
            # Save raw test inputs so the exported model can be validated on-device
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_high_0.npy"), sample['multi_cam_images']['cam_high'][0])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_right_wrist_0.npy"), sample['multi_cam_images']['cam_right_wrist'][0])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_left_wrist_0.npy"), sample['multi_cam_images']['cam_left_wrist'][0])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_high_1.npy"), sample['multi_cam_images']['cam_high'][1])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_right_wrist_1.npy"), sample['multi_cam_images']['cam_right_wrist'][1])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_left_wrist_1.npy"), sample['multi_cam_images']['cam_left_wrist'][1])
            images = [PImage.fromarray(arr) if arr is not None else None for arr in image_arrs]
            proprio = torch.from_numpy(sample['joints']).float().unsqueeze(0).to(opt.cal_data_device)
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_joints.npy"), sample['joints'])
            lang_embeddings = sample['lang_embed'].float().unsqueeze(0).to(opt.cal_data_device)
            torch.save(lang_embeddings, os.path.join(test_data_path, f"{test_data_cnt}_lang_embeddings.pt"))
            # Run one full inference; the dump hooks write the calibration tensors
            dump_model.reset()
            begin_time = time()
            actions = dump_model.step(proprio=proprio, images=images, text_embeds=lang_embeddings).squeeze(0).cpu().numpy()
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_actions.npy"), actions)
            logger.debug(f"Dump: Cost {(1000*(time() - begin_time)):.1f} ms, cnt: {dump_cnt}, name: {dump_dataset_name}")
    logger.info("End Generate Calibration Data.")
    del dump_model
    # Load RDT Policy: CPU model for ONNX export
    with open(rdt_config_path, "r") as f:
        rdt_config = yaml.safe_load(f)
    model = create_model(
        args=rdt_config,
        dtype=torch.float32,
        pretrained=opt.model_path,
        pretrained_vision_encoder_name_or_path=opt.pretrained_vision_encoder_name_or_path,
        control_frequency=opt.ctrl_freq,
        device="cpu"
    )
    # image adaptor: ONNX model
    m = model.policy.img_adaptor
    m.eval()
    input_data = torch.randn(1, 4374, rdt_config['model']['img_token_dim'])  # batch size 1
    output = m(input_data)
    torch.onnx.export(
        m,
        input_data,
        img_adaptor_path,
        opset_version=14,
        do_constant_folding=True,
        input_names=["img_tokens"],
        output_names=["adapted_img"],
        dynamic_axes=None,
        verbose=False
    )
    logger.info("Export RDT [img_adaptor] Model Success.")
    # DiT: export with fixed dummy shapes (65 state+action tokens, 64 lang, 4374 img)
    hidden_size = rdt_config['model']["rdt"]['hidden_size']
    m = model.policy.model
    m = m.eval().cpu()
    x = torch.randn(1, 65, hidden_size)
    freq = torch.tensor([1], dtype=torch.int32)
    t = torch.tensor([10], dtype=torch.int32)
    lang_c = torch.randn(1, 64, hidden_size)
    img_c = torch.randn(1, 4374, hidden_size)
    lang_mask = torch.ones(1, 64, dtype=torch.float32)
    dummy_inputs = (x, freq, t, lang_c, img_c, lang_mask)
    # outputs = m(x, freq, t, lang_c, img_c, lang_mask)
    torch.onnx.export(
        m,
        dummy_inputs,
        dit_path,
        # export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=["x", "freq", "t", "lang_c", "img_c", "lang_mask"],
        output_names=["actions"],
        verbose=False
    )
    logger.info("Export RDT [DiT] Model Success.")
    # state adaptor: two fixed-shape exports for the two call sites
    m = model.policy.state_adaptor
    m.eval()
    input_data = torch.randn(1, 1, 256)  # batch size 1
    output = m(input_data)
    torch.onnx.export(
        m,
        input_data,
        state_adaptor_path1,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=["state_tokens"],
        output_names=["state_traj"],
        dynamic_axes=None,
        verbose=False
    )
    # NOTE(review): uses the root `logging` module here but `logger` elsewhere
    logging.info("Export RDT [state 1x1x256] Model Success.")
    input_data = torch.randn(1, 64, 256)
    output = m(input_data)
    torch.onnx.export(
        m,
        input_data,
        state_adaptor_path2,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=['state_tokens'],
        output_names=['state_traj'],
        dynamic_axes=None,
        verbose=False
    )
    logging.info("Export RDT [state 1x64x256] Model Success.")
    # lang adaptor: dynamic along the token axis (variable instruction length)
    m = model.policy.lang_adaptor
    m.eval()
    input_data = torch.randn(1, 14, 4096)
    output = m(input_data)
    torch.onnx.export(
        m,
        input_data,
        lang_adaptor_path,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=["text_embeds"],
        output_names=["lang_cond"],
        dynamic_axes={
            "text_embeds": {1: "N"},
            "lang_cond": {1: "N"}
        },
        verbose=False
    )
    logger.info("Export RDT [lang adaptor] Model Success.")
######## Prepare Calibration Data
if __name__ == "__main__":
    # NOTE(review): config path is hard-coded for a specific machine — consider
    # reading it from sys.argv or an environment variable.
    main("/home/qi.xiong/DualArm/Work_Docker/RDT/rdt-export/input/config.json")
    logger.info("All Models Have Been Exported Success.")

View File

View File

@ -0,0 +1,82 @@
# Reference: DiffusionPolicy [https://github.com/real-stanford/diffusion_policy]
import torch
from torch.nn.modules.batchnorm import _BatchNorm
class EMAModel:
    """
    Exponential Moving Average of models weights
    """

    def __init__(self, model, update_after_step=0, inv_gamma=1.0, power=2 / 3, min_value=0.0, max_value=0.9999):
        """
        @crowsonkb's notes on EMA Warmup:
            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
            to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
            gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
            at 215.4k steps).

        Args:
            model: the module whose parameters hold the running average; it is
                put in eval mode and frozen here.
            update_after_step (int): steps to wait before EMA updates begin.
            inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
            power (float): Exponential factor of EMA warmup. Default: 2/3.
            min_value (float): The minimum EMA decay rate. Default: 0.
            max_value (float): The maximum EMA decay rate. Default: 0.9999.
        """
        self.update_after_step = update_after_step
        self.inv_gamma = inv_gamma
        self.power = power
        self.min_value = min_value
        self.max_value = max_value
        self.decay = 0.0
        self.optimization_step = 0
        self.averaged_model = model
        self.averaged_model.eval()
        self.averaged_model.requires_grad_(False)

    def get_decay(self, optimization_step):
        """
        Compute the decay factor for the exponential moving average.
        """
        step = max(0, optimization_step - self.update_after_step - 1)
        # Warmup curve: ramps from 0 toward 1 as training progresses.
        value = 1 - (1 + step / self.inv_gamma)**-self.power
        if step <= 0:
            return 0.0
        return max(self.min_value, min(value, self.max_value))

    @torch.no_grad()
    def step(self, new_model):
        """Blend `new_model`'s parameters into the running average in place."""
        self.decay = self.get_decay(self.optimization_step)
        # Walk the two module trees in lockstep and handle each module's own
        # (non-recursive) parameters exactly once.
        for src_module, ema_module in zip(new_model.modules(), self.averaged_model.modules()):
            for src_param, ema_param in zip(src_module.parameters(recurse=False),
                                            ema_module.parameters(recurse=False)):
                if isinstance(src_param, dict):
                    raise RuntimeError('Dict parameter not supported')
                if isinstance(src_module, _BatchNorm) or not src_param.requires_grad:
                    # BatchNorm parameters and frozen parameters are copied verbatim.
                    ema_param.copy_(src_param.to(dtype=ema_param.dtype).data)
                else:
                    # ema = decay * ema + (1 - decay) * new
                    ema_param.mul_(self.decay)
                    ema_param.add_(src_param.data.to(dtype=ema_param.dtype), alpha=1 - self.decay)
        self.optimization_step += 1

View File

@ -0,0 +1,75 @@
import os
from pathlib import Path
from typing import Dict, Optional, Union
from huggingface_hub import PyTorchModelHubMixin
from huggingface_hub.constants import (PYTORCH_WEIGHTS_NAME, SAFETENSORS_SINGLE_FILE)
from huggingface_hub.file_download import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError, is_torch_available
if is_torch_available():
import torch # type: ignore
class CompatiblePyTorchModelHubMixin(PyTorchModelHubMixin):
    """Mixin class to load Pytorch models from the Hub.

    Differs from the stock `PyTorchModelHubMixin` in two ways: saving always
    writes a pickle checkpoint (never safetensors), and loading tries the
    safetensors file first but falls back to the pickle checkpoint, both for
    local directories and Hub repos.
    """

    def _save_pretrained(self, save_directory: Path) -> None:
        """Save weights from a Pytorch model to a local directory."""
        # To bypass saving into safetensor by default
        model_to_save = self.module if hasattr(self, "module") else self  # type: ignore
        torch.save(model_to_save.state_dict(), save_directory / PYTORCH_WEIGHTS_NAME)

    @classmethod
    def _from_pretrained(
        cls,
        *,
        model_id: str,
        revision: Optional[str],
        cache_dir: Optional[Union[str, Path]],
        force_download: bool,
        proxies: Optional[Dict],
        resume_download: Optional[bool],
        local_files_only: bool,
        token: Union[str, bool, None],
        map_location: str = "cpu",
        strict: bool = False,
        **model_kwargs,
    ):
        """Load Pytorch pretrained weights and return the loaded model."""
        # Instantiate first; weights are loaded into this instance below.
        model = cls(**model_kwargs)
        if os.path.isdir(model_id):
            print("Loading weights from local directory")
            # Prefer the safetensors file; fall back to the pickle checkpoint.
            try:
                model_file = os.path.join(model_id, SAFETENSORS_SINGLE_FILE)
                return cls._load_as_safetensor(model, model_file, map_location, strict)
            except FileNotFoundError:
                model_file = os.path.join(model_id, PYTORCH_WEIGHTS_NAME)
                return cls._load_as_pickle(model, model_file, map_location, strict)
        else:
            # Hub repo: same preference order, keyed on which file exists remotely.
            try:
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=SAFETENSORS_SINGLE_FILE,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_safetensor(model, model_file, map_location, strict)
            except EntryNotFoundError:
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=PYTORCH_WEIGHTS_NAME,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_pickle(model, model_file, map_location, strict)

View File

@ -0,0 +1,159 @@
import torch
import torch.nn as nn
from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
class CLIPVisionTower(nn.Module):
    """A frozen CLIP vision backbone exposing features from a chosen hidden layer.

    `select_feature == 'patch'` drops the leading CLS token; `'cls_patch'`
    keeps it. With `delay_load` the weights are not fetched until
    `load_model()` is called; only the config is kept for shape queries.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()
        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
        # Load eagerly unless deferred; a tower that is going to be unfrozen
        # is always loaded right away.
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the image processor and the frozen vision model (idempotent)."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden layer and token subset from a forward pass."""
        feats = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            return feats[:, 1:]  # drop the CLS token
        if self.select_feature == 'cls_patch':
            return feats
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched image tensor, or a list of single images, into features."""
        if type(images) is list:
            image_features = []
            for image in images:
                forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
                                                output_hidden_states=True)
                image_features.append(self.feature_select(forward_out).to(image.dtype))
            return image_features
        forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                         output_hidden_states=True)
        return self.feature_select(forward_outs).to(images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Fall back to the standalone config when the weights are not loaded.
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        side = self.config.image_size // self.config.patch_size
        return side**2
class CLIPVisionTowerS2(CLIPVisionTower):
    """CLIP tower that extracts multi-scale (S2) features via `s2wrapper`."""

    def __init__(self, vision_tower, args, delay_load=False):
        # BUGFIX: parse the multi-scale configuration *before* calling the
        # parent constructor. The parent may call `load_model()` eagerly
        # (the default `delay_load=False` path), and our `load_model`
        # override below reads `self.s2_image_size` — calling super() first
        # raised AttributeError.
        self.s2_scales = getattr(args, 's2_scales', '336,672,1008')
        self.s2_scales = list(map(int, self.s2_scales.split(',')))
        self.s2_scales.sort()
        self.s2_split_size = self.s2_scales[0]   # smallest scale -> tile/split size
        self.s2_image_size = self.s2_scales[-1]  # largest scale -> model input size

        # Fail fast if the optional multi-scale dependency is missing.
        try:
            from s2wrapper import forward as multiscale_forward
        except ImportError:
            raise ImportError(
                'Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git'
            )
        self.multiscale_forward = multiscale_forward

        super().__init__(vision_tower, args, delay_load)

        # change resize/crop size in preprocessing to the largest image size in s2_scale
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.image_processor.size['shortest_edge'] = self.s2_image_size
            self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

    def load_model(self, device_map=None):
        """Load processor/model and retarget preprocessing to the largest scale."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        self.image_processor.size['shortest_edge'] = self.s2_image_size
        self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

        self.is_loaded = True

    @torch.no_grad()
    def forward_feature(self, images):
        """Single-scale encoding used as the callback for `multiscale_forward`."""
        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                               output_hidden_states=True)
        image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Multi-scale encoding of a batched tensor or a list of single images."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_feature = self.multiscale_forward(self.forward_feature,
                                                        image.unsqueeze(0),
                                                        img_sizes=self.s2_scales,
                                                        max_split_size=self.s2_split_size)
                image_features.append(image_feature)
        else:
            image_features = self.multiscale_forward(self.forward_feature,
                                                    images,
                                                    img_sizes=self.s2_scales,
                                                    max_split_size=self.s2_split_size)
        return image_features

    @property
    def hidden_size(self):
        # Features from all scales are concatenated channel-wise.
        return self.config.hidden_size * len(self.s2_scales)

View File

@ -0,0 +1,87 @@
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoImageProcessor, AutoModel, Dinov2Model
class DinoV2VisionTower(nn.Module):
    """Frozen DINOv2 encoder exposing last-hidden-state features."""

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        # Load eagerly unless deferred; an unfrozen tower always loads.
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the image processor and the (frozen) DINOv2 backbone."""
        if self.is_loaded:
            print(f'{self.vision_tower_name} is already loaded, `load_model` called again, skipping.')
            return

        self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)  # FIXME: kept frozen for now
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Select from the last hidden state; drop the CLS token for 'patch'."""
        hidden = image_forward_outs.last_hidden_state
        if self.select_feature == 'patch':
            return hidden[:, 1:]  # (B, 1369, 1536)
        if self.select_feature == 'cls_patch':
            return hidden
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor, or a list of single images, into features."""
        if type(images) is list:
            image_features = []
            for image in images:
                out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                image_features.append(self.feature_select(out).to(image.dtype))
            return image_features
        outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
        return self.feature_select(outs).to(images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Before the weights are loaded only the standalone config exists.
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return self.num_patches_per_side**2

View File

@ -0,0 +1,86 @@
import torch
import torch.nn as nn
from transformers import AutoConfig, SiglipImageProcessor, SiglipVisionModel
class SiglipVisionTower(nn.Module):
    """SigLIP vision encoder wrapper (kept in eval mode once loaded)."""

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        # Load eagerly unless deferred; an unfrozen tower always loads.
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the SigLIP processor and model (switched to eval mode)."""
        if self.is_loaded:
            print(f'{self.vision_tower_name} is already loaded, `load_model` called again, skipping.')
            return

        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.eval()
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Return patch tokens ('patch') or the pooled output ('cls_patch')."""
        if self.select_feature == 'patch':
            return image_forward_outs.last_hidden_state  # (B, 729, 1536)
        if self.select_feature == 'cls_patch':
            return image_forward_outs.pooler_output  # (B, 1, 1536)
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor, or a list of single images, into features."""
        if type(images) is list:
            image_features = []
            for image in images:
                out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                image_features.append(self.feature_select(out).to(image.dtype))
            return image_features
        outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
        return self.feature_select(outs).to(images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Before the weights are loaded only the standalone config exists.
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return self.num_patches_per_side**2

View File

@ -0,0 +1,111 @@
import torch
from transformers import AutoTokenizer, T5EncoderModel
class T5Embedder:
    """Thin wrapper around a T5 encoder that turns text into embeddings.

    Optionally offloads the second half of the encoder blocks to disk via
    the `use_offload_folder` argument (accelerate device map).
    """

    def __init__(
        self,
        device,
        from_pretrained=None,
        *,
        cache_dir=None,
        hf_token=None,
        use_text_preprocessing=True,
        t5_model_kwargs=None,
        torch_dtype=None,
        use_offload_folder=None,
        model_max_length=120,
        local_files_only=False,
    ):
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        self.cache_dir = cache_dir

        if t5_model_kwargs is None:
            t5_model_kwargs = {
                "low_cpu_mem_usage": True,
                "torch_dtype": self.torch_dtype,
            }

            if use_offload_folder is not None:
                t5_model_kwargs["offload_folder"] = use_offload_folder
                # Keep the embeddings and the first 12 encoder blocks on the
                # target device; offload everything after them to disk.
                device_map = {
                    "shared": self.device,
                    "encoder.embed_tokens": self.device,
                }
                for i in range(24):
                    device_map[f"encoder.block.{i}"] = self.device if i < 12 else "disk"
                device_map["encoder.final_layer_norm"] = "disk"
                device_map["encoder.dropout"] = "disk"
                t5_model_kwargs["device_map"] = device_map
            else:
                t5_model_kwargs["device_map"] = {
                    "shared": self.device,
                    "encoder": self.device,
                }

        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token

        self.tokenizer = AutoTokenizer.from_pretrained(
            from_pretrained,
            model_max_length=model_max_length,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.model = T5EncoderModel.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            **t5_model_kwargs,
        ).eval()
        self.model_max_length = model_max_length

    def get_text_embeddings(self, texts):
        """Tokenize `texts` and return (last_hidden_state, attention_mask)."""
        batch = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding="longest",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        input_ids = batch["input_ids"].to(self.device)
        attention_mask = batch["attention_mask"].to(self.device)
        with torch.no_grad():
            text_encoder_embs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )["last_hidden_state"].detach()
        return text_encoder_embs, attention_mask
if __name__ == "__main__":
    # Smoke test: loads the XXL T5 encoder onto CUDA device 7 (downloads
    # weights on first run). Adjust `device` for machines with fewer GPUs.
    T5Embedder(from_pretrained="google/t5-v1_1-xxl", device='cuda:7')

View File

@ -0,0 +1,304 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT: https://github.com/facebookresearch/DiT
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import math
from collections import OrderedDict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.jit import Final
from timm.models.vision_transformer import Attention, Mlp, RmsNorm, use_fused_attn
#################################################################################
# Embedding Layers for Timesteps and Condition Inptus #
#################################################################################
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=torch.bfloat16):
        super().__init__()
        # Two-layer MLP lifting the sinusoidal embedding to `hidden_size`.
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.dtype = dtype

    def timestep_embedding(self, t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.

        :param t: a 1-D Tensor of N indices, one per batch element. These may
            be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings (cast to self.dtype).
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        # Geometric frequency ladder from 1 down to 1/max_period.
        exponent = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
        freqs = torch.exp(exponent)
        args = t[:, None].float() * freqs[None]
        # Note: cosine half first, then sine half.
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            # Odd target dim: pad with one zero column.
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding.to(self.dtype)

    def forward(self, t):
        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
#################################################################################
# Cross Attention Layers #
#################################################################################
class CrossAttention(nn.Module):
    """
    A cross-attention layer with flash attention.

    Queries come from the token stream `x`; keys/values come from the
    condition stream `c`. Uses PyTorch's fused scaled-dot-product attention
    when available (per timm's `use_fused_attn`), otherwise an explicit
    implementation.
    """
    fused_attn: Final[bool]

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0,
        proj_drop: float = 0,
        norm_layer: nn.Module = nn.LayerNorm,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        # Whether to dispatch to F.scaled_dot_product_attention.
        self.fused_attn = use_fused_attn()

        # Separate projections: queries from x; keys+values (2*dim) from c.
        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        # Optional per-head Q/K normalization (identity when qk_norm=False).
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor, c: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
        """Attend from `x` (B, N, C) to condition `c` (B, L, C).

        mask: (B, L), True for valid condition tokens; None disables masking.
        Returns a tensor of shape (B, N, C).
        """
        B, N, C = x.shape
        _, L, _ = c.shape
        # q: (B, heads, N, head_dim); kv: 2 x (B, heads, L, head_dim).
        q = self.q(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        kv = self.kv(c).reshape(B, L, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        k, v = kv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        # Prepare attn mask (B, L) to mask the condition tokens:
        # broadcast to (B, 1, N, L) so all heads/query rows share it.
        if mask is not None:
            mask = mask.reshape(B, 1, 1, L)
            mask = mask.expand(-1, -1, N, -1)

        if self.fused_attn:
            # Fused kernel: masking and dropout handled internally.
            x = F.scaled_dot_product_attention(query=q,
                                               key=k,
                                               value=v,
                                               dropout_p=self.attn_drop.p if self.training else 0.,
                                               attn_mask=mask)
        else:
            # Manual path: scale, mask with -inf before softmax, then dropout.
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            if mask is not None:
                attn = attn.masked_fill_(mask.logical_not(), float('-inf'))
            attn = attn.softmax(dim=-1)
            if self.attn_drop.p > 0:
                attn = self.attn_drop(attn)
            x = attn @ v

        # Merge heads back and apply the output projection.
        x = x.permute(0, 2, 1, 3).reshape(B, N, C)
        x = self.proj(x)
        if self.proj_drop.p > 0:
            x = self.proj_drop(x)
        return x
#################################################################################
# RDT Block #
#################################################################################
class RDTBlock(nn.Module):
    """
    A RDT block with cross-attention conditioning.

    Three pre-norm residual sub-blocks: self-attention, cross-attention on
    the condition tokens, then a feed-forward network.
    """

    def __init__(self, hidden_size, num_heads, **block_kwargs):
        super().__init__()
        self.norm1 = RmsNorm(hidden_size, eps=1e-6)
        self.attn = Attention(dim=hidden_size,
                              num_heads=num_heads,
                              qkv_bias=True,
                              qk_norm=True,
                              norm_layer=RmsNorm,
                              **block_kwargs)
        self.cross_attn = CrossAttention(hidden_size,
                                         num_heads=num_heads,
                                         qkv_bias=True,
                                         qk_norm=True,
                                         norm_layer=RmsNorm,
                                         **block_kwargs)

        self.norm2 = RmsNorm(hidden_size, eps=1e-6)
        self.ffn = Mlp(in_features=hidden_size,
                       hidden_features=hidden_size,
                       act_layer=lambda: nn.GELU(approximate="tanh"),
                       drop=0)
        self.norm3 = RmsNorm(hidden_size, eps=1e-6)

    def forward(self, x, c, mask=None):
        # Self-attention sub-block (pre-norm + residual).
        x = x + self.attn(self.norm1(x))
        # Cross-attention sub-block conditioning on `c`.
        x = x + self.cross_attn(self.norm2(x), c, mask)
        # Feed-forward sub-block.
        x = x + self.ffn(self.norm3(x))
        return x
class FinalLayer(nn.Module):
    """
    The final layer of RDT: RMS-norm followed by an MLP projection to the
    output channel count.
    """

    def __init__(self, hidden_size, out_channels):
        super().__init__()
        self.norm_final = RmsNorm(hidden_size, eps=1e-6)
        self.ffn_final = Mlp(in_features=hidden_size,
                             hidden_features=hidden_size,
                             out_features=out_channels,
                             act_layer=lambda: nn.GELU(approximate="tanh"),
                             drop=0)

    def forward(self, x):
        return self.ffn_final(self.norm_final(x))
#################################################################################
# Sine/Cosine Positional Embedding Functions #
#################################################################################
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position (must be even)
    pos: a list/array/tensor of positions to be encoded: size (M,)
    out: (M, D) with the sine half first, then the cosine half
    """
    assert embed_dim % 2 == 0
    # Standard transformer frequency schedule: 1 / 10000^(2i/D).
    omega = 1. / 10000**(np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2.))  # (D/2,)

    pos = np.asarray(pos, dtype=np.float64).reshape(-1)  # (M,)
    angles = np.outer(pos, omega)  # (M, D/2), outer product

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
def get_nd_sincos_pos_embed_from_grid(embed_dim, grid_sizes):
    """
    Build an N-D sin-cos positional embedding.

    embed_dim: output dimension for each position
    grid_sizes: the grid sizes in each dimension (K,); any sequence accepted.
    out: (grid_sizes[0], ..., grid_sizes[K-1], D)

    The embedding dimension is divided uniformly among the axes whose size
    is > 1; axes of size <= 1 carry no positional information.
    """
    grid_sizes = tuple(grid_sizes)  # accept lists/iterables, not only tuples
    num_sizes = len(grid_sizes)
    # For grid size of 1, we do not need to add any positional embedding
    num_valid_sizes = len([x for x in grid_sizes if x > 1])
    emb = np.zeros(grid_sizes + (embed_dim, ))
    # BUGFIX: if every axis is trivial there is nothing to embed; the
    # original code divided by zero below. Return the all-zero embedding.
    if num_valid_sizes == 0:
        return emb
    # Uniformly divide the embedding dimension for each grid size
    dim_for_each_grid = embed_dim // num_valid_sizes
    # To make it even (sin and cos halves must match)
    if dim_for_each_grid % 2 != 0:
        dim_for_each_grid -= 1
    valid_size_idx = 0
    for size_idx in range(num_sizes):
        grid_size = grid_sizes[size_idx]
        if grid_size <= 1:
            continue
        pos = np.arange(grid_size)
        # Reshape the 1-D embedding so it broadcasts along all other axes.
        posemb_shape = [1] * len(grid_sizes) + [dim_for_each_grid]
        posemb_shape[size_idx] = -1
        emb[..., valid_size_idx * dim_for_each_grid:(valid_size_idx + 1) * dim_for_each_grid] += \
            get_1d_sincos_pos_embed_from_grid(dim_for_each_grid, pos).reshape(posemb_shape)
        valid_size_idx += 1
    return emb
def get_multimodal_cond_pos_embed(embed_dim, mm_cond_lens: OrderedDict, embed_modality=True):
    """
    Generate position embeddings for multimodal conditions.

    mm_cond_lens: an OrderedDict containing (modality name, token length)
        pairs. For the `"image"` modality, the value can be a
        multi-dimensional tuple. If the length < 0, it means there is no
        position embedding for the modality or grid.
    embed_modality: whether to embed the modality information. Default True.
    """
    num_modalities = len(mm_cond_lens)
    modality_pos_embed = np.zeros((num_modalities, embed_dim))
    if embed_modality:
        # First half of the channels identifies the modality...
        modality_pos_embed[:, :embed_dim // 2] = get_1d_sincos_pos_embed_from_grid(
            embed_dim // 2, torch.arange(num_modalities))
        # ...second half carries within-modality positions.
        pos_embed_dim = embed_dim // 2
    else:
        # The whole embedding is used for positions.
        pos_embed_dim = embed_dim

    per_modality_embeds = []
    for idx, (modality, cond_len) in enumerate(mm_cond_lens.items()):
        if modality == "image" and isinstance(cond_len, (tuple, list)):
            # Multi-dimensional image grid: N-D sin-cos embedding, with
            # negative axes collapsed to size 1 (no positional signal).
            all_grid_sizes = tuple(abs(x) for x in cond_len)
            embed_grid_sizes = tuple(x if x > 0 else 1 for x in cond_len)
            sincos = get_nd_sincos_pos_embed_from_grid(pos_embed_dim, embed_grid_sizes)
            cond_pos_embed = np.zeros(all_grid_sizes + (embed_dim, ))
            cond_pos_embed[..., -pos_embed_dim:] += sincos
            cond_pos_embed = cond_pos_embed.reshape((-1, embed_dim))
        else:
            sincos = get_1d_sincos_pos_embed_from_grid(pos_embed_dim, torch.arange(cond_len if cond_len > 0 else 1))
            cond_pos_embed = np.zeros((abs(cond_len), embed_dim))
            cond_pos_embed[:, -pos_embed_dim:] += sincos
        cond_pos_embed += modality_pos_embed[idx]
        per_modality_embeds.append(cond_pos_embed)

    if not per_modality_embeds:
        return np.zeros((0, embed_dim))
    return np.concatenate(per_modality_embeds, axis=0)

View File

@ -0,0 +1,156 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT: https://github.com/facebookresearch/DiT
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
from collections import OrderedDict
import torch
import torch.nn as nn
from pathlib import Path
import sys, os
# Make the package root (two directory levels up) importable so that the
# `rdt.*` modules resolve even when this file is run as a script.
current_file = Path(__file__)
sys.path.append(str(current_file.parent.parent))
from rdt.blocks import (FinalLayer, RDTBlock, TimestepEmbedder, get_1d_sincos_pos_embed_from_grid,
                        get_multimodal_cond_pos_embed)
class RDT(nn.Module):
    """
    Class for Robotics Diffusion Transformers.

    A conditional transformer that denoises an action-token sequence, with
    blocks alternating between language and image cross-attention
    conditioning.
    """

    def __init__(self,
                 output_dim=128,
                 horizon=32,
                 hidden_size=1152,
                 depth=28,
                 num_heads=16,
                 max_lang_cond_len=1024,
                 img_cond_len=4096,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        # output_dim: per-token output size of the final layer.
        # horizon: number of action tokens to predict.
        # lang/img_pos_embed_config: optional multimodal position-embedding
        #   layouts consumed by get_multimodal_cond_pos_embed.
        super().__init__()
        self.horizon = horizon
        self.hidden_size = hidden_size
        self.max_lang_cond_len = max_lang_cond_len
        self.img_cond_len = img_cond_len
        self.dtype = dtype
        self.lang_pos_embed_config = lang_pos_embed_config
        self.img_pos_embed_config = img_pos_embed_config

        self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
        self.freq_embedder = TimestepEmbedder(hidden_size, dtype=dtype)

        # We will use trainable sin-cos embeddings.
        # horizon + 3 input tokens: [timestep; ctrl_freq; state; action*horizon]
        # (matches the OrderedDict in initialize_weights).
        self.x_pos_embed = nn.Parameter(torch.zeros(1, horizon + 3, hidden_size))
        # Language conditions
        self.lang_cond_pos_embed = nn.Parameter(torch.zeros(1, max_lang_cond_len, hidden_size))
        # Image conditions
        self.img_cond_pos_embed = nn.Parameter(torch.zeros(1, img_cond_len, hidden_size))

        self.blocks = nn.ModuleList([RDTBlock(hidden_size, num_heads) for _ in range(depth)])
        self.final_layer = FinalLayer(hidden_size, output_dim)
        self.initialize_weights()

    def initialize_weights(self):
        """Xavier-init linear layers, then overwrite positional/timestep/final
        weights with their dedicated initializations."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize (trainable) pos_embed by a fixed sin-cos embedding.
        x_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                    mm_cond_lens=OrderedDict([
                                                        ('timestep', 1),
                                                        ('ctrl_freq', 1),
                                                        ('state', 1),
                                                        ('action', self.horizon),
                                                    ]))
        self.x_pos_embed.data.copy_(torch.from_numpy(x_pos_embed).float().unsqueeze(0))

        # Language positions: plain 1-D sin-cos unless a layout is configured.
        if self.lang_pos_embed_config is None:
            lang_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size,
                                                                    torch.arange(self.max_lang_cond_len))
        else:
            lang_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                                mm_cond_lens=OrderedDict(self.lang_pos_embed_config),
                                                                embed_modality=False)
        self.lang_cond_pos_embed.data.copy_(torch.from_numpy(lang_cond_pos_embed).float().unsqueeze(0))

        # Image positions: same scheme as language.
        if self.img_pos_embed_config is None:
            img_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.img_cond_len))
        else:
            img_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                               mm_cond_lens=OrderedDict(self.img_pos_embed_config),
                                                               embed_modality=False)
        self.img_cond_pos_embed.data.copy_(torch.from_numpy(img_cond_pos_embed).float().unsqueeze(0))

        # Initialize timestep and control freq embedding MLP
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[2].weight, std=0.02)

        # Initialize the final layer: zero-out the final linear layer
        # so the model starts by predicting zeros.
        nn.init.constant_(self.final_layer.ffn_final.fc2.weight, 0)
        nn.init.constant_(self.final_layer.ffn_final.fc2.bias, 0)

        # Move all the params to given data type:
        self.to(self.dtype)

    def forward(self, x, freq, t, lang_c, img_c, lang_mask=None, img_mask=None):
        """
        Forward pass of RDT.

        x: (B, T, D), state + action token sequence, T = horizon + 1,
            dimension D is assumed to be the same as the hidden size.
        freq: (B,), a scalar indicating control frequency.
        t: (B,) or (1,), diffusion timesteps.
        lang_c: (B, L_lang, D) or None, language condition tokens (variable length),
            dimension D is assumed to be the same as the hidden size.
        img_c: (B, L_img, D) or None, image condition tokens (fixed length),
            dimension D is assumed to be the same as the hidden size.
        lang_mask: (B, L_lang) or None, language condition mask (True for valid).
        img_mask: (B, L_img) or None, image condition mask (True for valid).
        """
        t = self.t_embedder(t).unsqueeze(1)  # (B, 1, D) or (1, 1, D)
        freq = self.freq_embedder(freq).unsqueeze(1)  # (B, 1, D)
        # Append timestep to the input tokens
        if t.shape[0] == 1:
            # A shared timestep embedding is broadcast across the batch.
            t = t.expand(x.shape[0], -1, -1)
        x = torch.cat([t, freq, x], dim=1)  # (B, T+2, D) = (B, horizon+3, D)

        # Add multimodal position embeddings
        x = x + self.x_pos_embed
        # Note the lang is of variable length
        lang_c = lang_c + self.lang_cond_pos_embed[:, :lang_c.shape[1]]
        img_c = img_c + self.img_cond_pos_embed

        # Forward pass: blocks alternate between the language condition
        # (even indices) and the image condition (odd indices).
        conds = [lang_c, img_c]
        masks = [lang_mask, img_mask]
        for i, block in enumerate(self.blocks):
            c, mask = conds[i % 2], masks[i % 2]
            x = block(x, c, mask)  # (B, T+2, D)
        # Inject the language condition at the final layer
        x = self.final_layer(x)  # (B, T+2, out_channels)

        # Only preserve the action tokens
        x = x[:, -self.horizon:]
        return x

View File

@ -0,0 +1,246 @@
import re, sys, os
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_dpmsolver_multistep import \
DPMSolverMultistepScheduler
from pathlib import Path
# Make this file's own directory importable so that sibling modules
# (`hub_mixin`, the `rdt/` package) resolve when run as a script.
current_file = Path(__file__)
sys.path.append(os.path.join(current_file.parent))
from hub_mixin import CompatiblePyTorchModelHubMixin
from rdt.model import RDT
class RDTRunner(nn.Module,
CompatiblePyTorchModelHubMixin,
repo_url="https://huggingface.co/robotics-diffusion-transformer/rdt-1b"):
    def __init__(self,
                 *,
                 action_dim,
                 pred_horizon,
                 config,
                 lang_token_dim,
                 img_token_dim,
                 state_token_dim,
                 max_lang_cond_len,
                 img_cond_len,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        """Build the RDT diffusion backbone, condition adaptors and schedulers.

        action_dim: per-step action dimension (also the model output size).
        pred_horizon: number of future action steps to predict.
        config: dict with 'rdt', '{lang,img,state}_adaptor' and
            'noise_scheduler' sections.
        lang/img/state_token_dim: raw feature sizes of the incoming tokens.
        max_lang_cond_len / img_cond_len: token counts of the two condition
            streams (language is variable-length up to the max).
        """
        super(RDTRunner, self).__init__()
        # Create diffusion model
        hidden_size = config['rdt']['hidden_size']
        self.model = RDT(
            output_dim=action_dim,
            horizon=pred_horizon,
            hidden_size=hidden_size,
            depth=config['rdt']['depth'],
            num_heads=config['rdt']['num_heads'],
            max_lang_cond_len=max_lang_cond_len,
            img_cond_len=img_cond_len,
            lang_pos_embed_config=lang_pos_embed_config,
            img_pos_embed_config=img_pos_embed_config,
            dtype=dtype,
        )

        # Create adaptors projecting each conditional input to hidden_size
        self.lang_adaptor = self.build_condition_adapter(config['lang_adaptor'],
                                                         in_features=lang_token_dim,
                                                         out_features=hidden_size)
        self.img_adaptor = self.build_condition_adapter(config['img_adaptor'],
                                                        in_features=img_token_dim,
                                                        out_features=hidden_size)
        # A `state` refers to an action or a proprioception vector
        self.state_adaptor = self.build_condition_adapter(
            config['state_adaptor'],
            in_features=state_token_dim * 2,  # state + state mask (indicator)
            out_features=hidden_size)

        # Create the noise scheduler:
        # DDPM for the training forward process, multistep DPM-Solver for
        # faster inference-time sampling.
        noise_scheduler_config = config['noise_scheduler']
        self.noise_scheduler = DDPMScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
            clip_sample=noise_scheduler_config['clip_sample'],
        )
        self.noise_scheduler_sample = DPMSolverMultistepScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
        )

        self.num_train_timesteps = noise_scheduler_config['num_train_timesteps']
        self.num_inference_timesteps = noise_scheduler_config['num_inference_timesteps']
        self.prediction_type = noise_scheduler_config['prediction_type']

        self.pred_horizon = pred_horizon
        self.action_dim = action_dim

        print("Diffusion params: %e" %
              sum([p.numel() for p in self.model.parameters()] + [p.numel() for p in self.lang_adaptor.parameters()] +
                  [p.numel()
                   for p in self.img_adaptor.parameters()] + [p.numel() for p in self.state_adaptor.parameters()]))
def build_condition_adapter(self, projector_type, in_features, out_features):
projector = None
if projector_type == 'linear':
projector = nn.Linear(in_features, out_features)
else:
mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
if mlp_gelu_match:
mlp_depth = int(mlp_gelu_match.group(1))
modules = [nn.Linear(in_features, out_features)]
for _ in range(1, mlp_depth):
modules.append(nn.GELU(approximate="tanh"))
modules.append(nn.Linear(out_features, out_features))
projector = nn.Sequential(*modules)
if projector is None:
raise ValueError(f'Unknown projector type: {projector_type}')
return projector
def adapt_conditions(self, lang_tokens, img_tokens, state_tokens):
'''
lang_tokens: (batch_size, lang_len, lang_token_dim)
img_tokens: (batch_size, img_len, img_token_dim)
state_tokens: (batch_size, state_len, state_token_dim)
return: adpated (..., hidden_size) for all input tokens
'''
adpated_lang = self.lang_adaptor(lang_tokens)
adpated_img = self.img_adaptor(img_tokens)
adpated_state = self.state_adaptor(state_tokens)
return adpated_lang, adpated_img, adpated_state
    def conditional_sample(self, lang_cond, lang_attn_mask, img_cond, state_traj, action_mask, ctrl_freqs):
        '''
        Run the reverse diffusion process to sample an action trajectory.

        lang_cond: language conditional data, (batch_size, lang_len, hidden_size).
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_cond: image conditional data, (batch_size, img_len, hidden_size).
        state_traj: (batch_size, 1, hidden_size), state trajectory.
        action_mask: (batch_size, 1, action_dim), a 0-1 **float** tensor
            indicating the valid action dimensions.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: (batch_size, horizon, action_dim)
        '''
        device = state_traj.device
        dtype = state_traj.dtype
        # Start the reverse process from pure Gaussian noise.
        noisy_action = torch.randn(size=(state_traj.shape[0], self.pred_horizon, self.action_dim),
                                   dtype=dtype,
                                   device=device)
        # Broadcast the per-dimension mask over the whole horizon.
        action_mask = action_mask.expand(-1, self.pred_horizon, -1)

        # Set step values
        self.noise_scheduler_sample.set_timesteps(self.num_inference_timesteps)

        for t in self.noise_scheduler_sample.timesteps:
            # Prepare state-action trajectory: the mask is concatenated
            # feature-wise before adaptation (matching the state adaptor's
            # state+indicator input layout).
            action_traj = torch.cat([noisy_action, action_mask], dim=2)
            action_traj = self.state_adaptor(action_traj)
            state_action_traj = torch.cat([state_traj, action_traj], dim=1)

            # Predict the model output
            model_output = self.model(state_action_traj,
                                      ctrl_freqs,
                                      t.unsqueeze(-1).to(device),
                                      lang_cond,
                                      img_cond,
                                      lang_mask=lang_attn_mask)

            # Compute previous actions: x_t -> x_t-1
            noisy_action = self.noise_scheduler_sample.step(model_output, t, noisy_action).prev_sample
            # Scheduler output may be float32; cast back to the working dtype.
            noisy_action = noisy_action.to(state_traj.dtype)

        # Finally apply the action mask to mask invalid action dimensions
        noisy_action = noisy_action * action_mask
        return noisy_action
    # ========= Train ============
    def compute_loss(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_gt, action_mask,
                     ctrl_freqs) -> torch.Tensor:
        '''
        Diffusion training objective: noise the ground-truth actions at a
        random timestep and regress the model's prediction with MSE.

        lang_tokens: (batch_size, lang_len, lang_token_dim)
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, 1, state_token_dim)
        action_gt: (batch_size, horizon, state_token_dim), ground-truth actions for supervision
        action_mask: (batch_size, 1, state_token_dim), a 0-1 **float** tensor.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: loss_value, a scalar tensor
        '''
        batch_size = lang_tokens.shape[0]
        device = lang_tokens.device
        # Sample noise that we'll add to the actions
        noise = torch.randn(action_gt.shape, dtype=action_gt.dtype, device=device)
        # Sample random diffusion timesteps
        timesteps = torch.randint(0, self.num_train_timesteps, (batch_size, ), device=device).long()
        # Add noise to the clean actions according to the noise magnitude at each timestep
        # (this is the forward diffusion process)
        noisy_action = self.noise_scheduler.add_noise(action_gt, noise, timesteps)
        # Concatenate the state and action tokens to form the input sequence
        state_action_traj = torch.cat([state_tokens, noisy_action], dim=1)
        # Append the action mask to the input sequence
        action_mask = action_mask.expand(-1, state_action_traj.shape[1], -1)
        state_action_traj = torch.cat([state_action_traj, action_mask], dim=2)
        # Align the dimension with the hidden size
        lang_cond, img_cond, state_action_traj = self.adapt_conditions(lang_tokens, img_tokens, state_action_traj)
        # Predict the denoised result
        pred = self.model(state_action_traj, ctrl_freqs, timesteps, lang_cond, img_cond, lang_mask=lang_attn_mask)
        # The regression target depends on the scheduler's parameterization.
        pred_type = self.prediction_type
        if pred_type == 'epsilon':
            target = noise
        elif pred_type == 'sample':
            target = action_gt
        else:
            raise ValueError(f"Unsupported prediction type {pred_type}")
        loss = F.mse_loss(pred, target)
        return loss
# ========= Inference ============
def predict_action(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_mask, ctrl_freqs):
'''
lang_tokens: (batch_size, lang_len, lang_token_dim)
lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
which should be True-False bool tensor.
img_tokens: (batch_size, img_len, img_token_dim)
state_tokens: (batch_size, 1, state_token_dim)
action_mask: (batch_size, 1, action_dim),
which should be a 0-1 **float** tensor.
ctrl_freqs: (batch_size,), control frequency for each sample.
return: (batch_size, horizon, action_dim), predicted action sequence
'''
# Prepare the state and conditions
state_tokens = torch.cat([state_tokens, action_mask], dim=2)
lang_cond, img_cond, state_traj = self.adapt_conditions(lang_tokens, img_tokens, state_tokens)
# Run sampling
action_pred = self.conditional_sample(
lang_cond,
lang_attn_mask,
img_cond,
state_traj,
action_mask,
ctrl_freqs,
)
return action_pred
    def forward(self, *args, **kwargs) -> torch.Tensor:
        """Alias for :meth:`compute_loss` so the module can be called directly during training."""
        return self.compute_loss(*args, **kwargs)

View File

@ -0,0 +1,35 @@
numpy<2.0
packaging==24.0
wandb==0.17.0
deepspeed==0.14.2
accelerate==0.30.1
diffusers==0.27.2
timm==1.0.3
transformers==4.41.0
sentencepiece==0.2.0
h5py==3.11.0
opencv-python==4.9.0.80
imgaug==0.4.0
pytz==2022.1
huggingface_hub==0.23.0
torch==2.1.0
torchvision==0.16.0
pyzmq
msgpack
msgpack_numpy
zstandard
onnx
onnxruntime
onnxsim
# requirements_data.txt
# tfds-nightly==4.9.4.dev202402070044
gsutil==5.27
tensorflow==2.15.0.post1
pillow==10.2.0
pyyaml==6.0.1
tensorflow-graphics==2021.12.3
imageio==2.34.0
imageio-ffmpeg==0.4.9

View File

@ -0,0 +1,941 @@
#!/home/lin/software/miniconda3/envs/aloha/bin/python
# -- coding: UTF-8
"""
#!/usr/bin/python3
"""
import argparse
import sys
import threading
import time
import yaml
from collections import deque
import numpy as np
import rospy
import torch
from cv_bridge import CvBridge
from geometry_msgs.msg import Twist
from nav_msgs.msg import Odometry
from PIL import Image as PImage
from sensor_msgs.msg import Image, JointState
from std_msgs.msg import Header
import cv2
from scripts.agilex_model import create_model
# sys.path.append("./")
# Camera key order used throughout: [front/high, right wrist, left wrist].
CAMERA_NAMES = ["cam_high", "cam_right_wrist", "cam_left_wrist"]
observation_window = None  # deque(maxlen=2) of the two latest observations; built in update_observation_window
lang_embeddings = None  # pre-encoded language instruction, loaded in model_inference
# debug
preload_images = None
# Build the RDT policy from the YAML config referenced by the CLI arguments.
def make_policy(args):
    """Load args.config_path, stash it on ``args.config``, and create the model."""
    with open(args.config_path, "r") as fp:
        args.config = yaml.safe_load(fp)
    # pretrained_text_encoder_name_or_path = "google/t5-v1_1-xxl"
    vision_encoder_path = "google/siglip-so400m-patch14-384"
    return create_model(
        args=args.config,
        dtype=torch.bfloat16,
        pretrained=args.pretrained_model_name_or_path,
        # pretrained_text_encoder_name_or_path=pretrained_text_encoder_name_or_path,
        pretrained_vision_encoder_name_or_path=vision_encoder_path,
        control_frequency=args.ctrl_freq,
    )
def set_seed(seed):
    """Seed the NumPy and PyTorch RNGs for reproducible rollouts."""
    np.random.seed(seed)
    torch.manual_seed(seed)
# Densify large jumps between consecutive actions so the robot moves smoothly.
def interpolate_action(args, prev_action, cur_action):
    """Linearly interpolate from prev_action to cur_action.

    The number of intermediate steps is chosen so that no joint moves more
    than its per-step limit (args.arm_steps_length, duplicated for both arms).
    Returns an array of shape (n_steps, dim) ending exactly at cur_action.
    """
    per_joint_limit = np.tile(np.array(args.arm_steps_length), 2)
    n_steps = int(np.max(np.ceil(np.abs(cur_action - prev_action) / per_joint_limit)))
    if n_steps <= 1:
        # Already within limits: execute the target directly.
        return cur_action[np.newaxis, :]
    # Drop the first point (== prev_action, already executed).
    return np.linspace(prev_action, cur_action, n_steps + 1)[1:]
def get_config(args):
    """Collect the runtime settings the inference loop needs into one dict."""
    return {
        "episode_len": args.max_publish_step,
        "state_dim": 14,
        "chunk_size": args.chunk_size,
        "camera_names": CAMERA_NAMES,
    }
# Get the observation from the ROS topic
def get_ros_observation(args, ros_operator):
    """Block until a time-synchronized frame is available.

    Polls ros_operator.get_frame() at args.publish_rate until it succeeds,
    then returns (img_front, img_left, img_right, puppet_arm_left,
    puppet_arm_right); depth images and base odometry are discarded here.
    """
    rate = rospy.Rate(args.publish_rate)
    # print_flag throttles the failure message to once per sync outage.
    print_flag = True
    while True and not rospy.is_shutdown():
        result = ros_operator.get_frame()
        if not result:
            if print_flag:
                print("syn fail when get_ros_observation")
                print_flag = False
            rate.sleep()
            continue
        print_flag = True
        (
            img_front,
            img_left,
            img_right,
            img_front_depth,
            img_left_depth,
            img_right_depth,
            puppet_arm_left,
            puppet_arm_right,
            robot_base,
        ) = result
        # print(f"sync success when get_ros_observation")
        return (img_front, img_left, img_right, puppet_arm_left, puppet_arm_right)
# Update the observation window buffer
def update_observation_window(args, config, ros_operator):
    """Fetch one synchronized frame and append it to the 2-slot window.

    The policy consumes the two most recent observations; on the first call a
    dummy (all-None) entry is inserted so the window is immediately full.
    """
    # JPEG transformation
    # Align with training
    def jpeg_mapping(img):
        # Round-trip through JPEG so inference images carry the same
        # compression artifacts the model saw during training.
        img = cv2.imencode(".jpg", img)[1].tobytes()
        img = cv2.imdecode(np.frombuffer(img, np.uint8), cv2.IMREAD_COLOR)
        return img
    global observation_window
    if observation_window is None:
        observation_window = deque(maxlen=2)
        # Append the first dummy image
        observation_window.append({
            "qpos": None,
            "images": {
                config["camera_names"][0]: None,
                config["camera_names"][1]: None,
                config["camera_names"][2]: None,
            },
        })
    img_front, img_left, img_right, puppet_arm_left, puppet_arm_right = (get_ros_observation(args, ros_operator))
    img_front = jpeg_mapping(img_front)
    img_left = jpeg_mapping(img_left)
    img_right = jpeg_mapping(img_right)
    # 14-dim proprioception: left-arm joints followed by right-arm joints.
    qpos = np.concatenate(
        (np.array(puppet_arm_left.position), np.array(puppet_arm_right.position)),
        axis=0,
    )
    qpos = torch.from_numpy(qpos).float().cuda()
    # Key order matches CAMERA_NAMES: high/front, right wrist, left wrist.
    observation_window.append({
        "qpos": qpos,
        "images": {
            config["camera_names"][0]: img_front,
            config["camera_names"][1]: img_right,
            config["camera_names"][2]: img_left,
        },
    })
# RDT inference
def inference_fn(args, config, policy, t):
    """Run one policy forward pass on the current observation window.

    Returns a (chunk_size, 14) numpy action chunk. Note the while-loop always
    returns on its first successful iteration; it only re-loops on shutdown.
    """
    global observation_window
    global lang_embeddings
    # print(f"Start inference_thread_fn: t={t}")
    while True and not rospy.is_shutdown():
        time1 = time.time()
        # fetch images in sequence [front, right, left], previous then current frame
        image_arrs = [
            observation_window[-2]["images"][config["camera_names"][0]],
            observation_window[-2]["images"][config["camera_names"][1]],
            observation_window[-2]["images"][config["camera_names"][2]],
            observation_window[-1]["images"][config["camera_names"][0]],
            observation_window[-1]["images"][config["camera_names"][1]],
            observation_window[-1]["images"][config["camera_names"][2]],
        ]
        # fetch debug images in sequence [front, right, left]
        # image_arrs = [
        #     preload_images[config['camera_names'][0]][max(t - 1, 0)],
        #     preload_images[config['camera_names'][2]][max(t - 1, 0)],
        #     preload_images[config['camera_names'][1]][max(t - 1, 0)],
        #     preload_images[config['camera_names'][0]][t],
        #     preload_images[config['camera_names'][2]][t],
        #     preload_images[config['camera_names'][1]][t]
        # ]
        # # encode the images
        # for i in range(len(image_arrs)):
        #     image_arrs[i] = cv2.imdecode(np.frombuffer(image_arrs[i], np.uint8), cv2.IMREAD_COLOR)
        # proprio = torch.from_numpy(preload_images['qpos'][t]).float().cuda()
        # None entries (first dummy frame) are passed through unchanged.
        images = [PImage.fromarray(arr) if arr is not None else None for arr in image_arrs]
        # for i, pos in enumerate(['f', 'r', 'l'] * 2):
        #     images[i].save(f'{t}-{i}-{pos}.png')
        # get last qpos in shape [14, ]
        proprio = observation_window[-1]["qpos"]
        # unsqueeze to [1, 14]
        proprio = proprio.unsqueeze(0)
        # actions shaped as [1, 64, 14] in format [left, right]
        actions = (policy.step(proprio=proprio, images=images, text_embeds=lang_embeddings).squeeze(0).cpu().numpy())
        # print(f"inference_actions: {actions.squeeze()}")
        # print(f"Model inference time: {time.time() - time1} s")
        # print(f"Finish inference_thread_fn: t={t}")
        return actions
# Main loop for the manipulation task
def model_inference(args, config, ros_operator):
    """Load the policy, home the arms, then run the sense-infer-act loop.

    One model inference every ``chunk_size`` steps yields an action chunk;
    each step's action is (optionally) interpolated against the previous one
    and published at args.publish_rate.
    """
    global lang_embeddings
    # Load rdt model
    policy = make_policy(args)
    lang_dict = torch.load(args.lang_embeddings_path)
    print(f"Running with instruction: \"{lang_dict['instruction']}\" from \"{lang_dict['name']}\"")
    lang_embeddings = lang_dict["embeddings"]
    max_publish_step = config["episode_len"]
    chunk_size = config["chunk_size"]
    # Initialize position of the puppet arm
    # Seven values per arm; the last entry presumably drives the gripper
    # (3.55 ~ open, -0.34 ~ closed) — TODO confirm against the robot spec.
    left0 = [
        -0.00133514404296875,
        0.00209808349609375,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        3.557830810546875,
    ]
    right0 = [
        -0.00133514404296875,
        0.00438690185546875,
        0.034523963928222656,
        -0.053597450256347656,
        -0.00476837158203125,
        -0.00209808349609375,
        3.557830810546875,
    ]
    left1 = [
        -0.00133514404296875,
        0.00209808349609375,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3393220901489258,
    ]
    right1 = [
        -0.00133514404296875,
        0.00247955322265625,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3397035598754883,
    ]
    ros_operator.puppet_arm_publish_continuous(left0, right0)
    input("Press enter to continue")
    ros_operator.puppet_arm_publish_continuous(left1, right1)
    # Initialize the previous action to be the initial robot state
    pre_action = np.zeros(config["state_dim"])
    pre_action[:14] = np.array([
        -0.00133514404296875,
        0.00209808349609375,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3393220901489258,
    ] + [
        -0.00133514404296875,
        0.00247955322265625,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3397035598754883,
    ])
    action = None
    # Inference loop
    with torch.inference_mode():
        while True and not rospy.is_shutdown():
            # The current time step
            t = 0
            rate = rospy.Rate(args.publish_rate)
            action_buffer = np.zeros([chunk_size, config["state_dim"]])
            while t < max_publish_step and not rospy.is_shutdown():
                # Update observation window
                update_observation_window(args, config, ros_operator)
                # When coming to the end of the action chunk
                if t % chunk_size == 0:
                    # Start inference
                    action_buffer = inference_fn(args, config, policy, t).copy()
                raw_action = action_buffer[t % chunk_size]
                action = raw_action
                # Interpolate the original action sequence
                if args.use_actions_interpolation:
                    # print(f"Time {t}, pre {pre_action}, act {action}")
                    interp_actions = interpolate_action(args, pre_action, action)
                else:
                    interp_actions = action[np.newaxis, :]
                # Execute the interpolated actions one by one
                for act in interp_actions:
                    left_action = act[:7]
                    right_action = act[7:14]
                    if not args.disable_puppet_arm:
                        ros_operator.puppet_arm_publish(left_action,
                                                        right_action)  # puppet_arm_publish_continuous_thread
                    if args.use_robot_base:
                        vel_action = act[14:16]
                        ros_operator.robot_base_publish(vel_action)
                    rate.sleep()
                    # print(f"doing action: {act}")
                t += 1
                print("Published Step", t)
                pre_action = action.copy()
# ROS operator class
class RosOperator:
    """Buffers ROS sensor topics into bounded deques, time-aligns them into
    observation frames, and publishes arm / mobile-base commands."""
    def __init__(self, args):
        # All deques, publishers, and locks are created in init()/init_ros().
        self.robot_base_deque = None
        self.puppet_arm_right_deque = None
        self.puppet_arm_left_deque = None
        self.img_front_deque = None
        self.img_right_deque = None
        self.img_left_deque = None
        self.img_front_depth_deque = None
        self.img_right_depth_deque = None
        self.img_left_depth_deque = None
        self.bridge = None
        self.puppet_arm_left_publisher = None
        self.puppet_arm_right_publisher = None
        self.robot_base_publisher = None
        self.puppet_arm_publish_thread = None
        self.puppet_arm_publish_lock = None
        self.args = args
        self.init()
        self.init_ros()
    def init(self):
        """Create the message buffers and the publish lock (held by default)."""
        self.bridge = CvBridge()
        self.img_left_deque = deque()
        self.img_right_deque = deque()
        self.img_front_deque = deque()
        self.img_left_depth_deque = deque()
        self.img_right_depth_deque = deque()
        self.img_front_depth_deque = deque()
        self.puppet_arm_left_deque = deque()
        self.puppet_arm_right_deque = deque()
        self.robot_base_deque = deque()
        self.puppet_arm_publish_lock = threading.Lock()
        # Acquire immediately: puppet_arm_publish_continuous() treats a
        # successful non-blocking acquire as a stop signal, so the lock must
        # stay held until puppet_arm_publish_continuous_thread() releases it.
        self.puppet_arm_publish_lock.acquire()
    def puppet_arm_publish(self, left, right):
        """Publish one joint-state command to each arm (7 joints per arm)."""
        joint_state_msg = JointState()
        joint_state_msg.header = Header()
        joint_state_msg.header.stamp = rospy.Time.now()  # Set timestep
        joint_state_msg.name = [
            "joint0",
            "joint1",
            "joint2",
            "joint3",
            "joint4",
            "joint5",
            "joint6",
        ]  # Set joint names
        # NOTE: the same message object is reused for both arms; only
        # `position` differs between the two publishes.
        joint_state_msg.position = left
        self.puppet_arm_left_publisher.publish(joint_state_msg)
        joint_state_msg.position = right
        self.puppet_arm_right_publisher.publish(joint_state_msg)
    def robot_base_publish(self, vel):
        """Publish a base velocity command; vel = (linear_x, angular_z)."""
        vel_msg = Twist()
        vel_msg.linear.x = vel[0]
        vel_msg.linear.y = 0
        vel_msg.linear.z = 0
        vel_msg.angular.x = 0
        vel_msg.angular.y = 0
        vel_msg.angular.z = vel[1]
        self.robot_base_publisher.publish(vel_msg)
    def puppet_arm_publish_continuous(self, left, right):
        """Ramp both arms from their current poses to the targets, moving at
        most args.arm_steps_length per joint per tick."""
        rate = rospy.Rate(self.args.publish_rate)
        left_arm = None
        right_arm = None
        # Wait until at least one feedback message per arm has arrived.
        while True and not rospy.is_shutdown():
            if len(self.puppet_arm_left_deque) != 0:
                left_arm = list(self.puppet_arm_left_deque[-1].position)
            if len(self.puppet_arm_right_deque) != 0:
                right_arm = list(self.puppet_arm_right_deque[-1].position)
            if left_arm is None or right_arm is None:
                rate.sleep()
                continue
            else:
                break
        # Per-joint direction of travel (+1 / -1).
        left_symbol = [1 if left[i] - left_arm[i] > 0 else -1 for i in range(len(left))]
        right_symbol = [1 if right[i] - right_arm[i] > 0 else -1 for i in range(len(right))]
        flag = True
        step = 0
        while flag and not rospy.is_shutdown():
            # A successful non-blocking acquire means another thread asked us to stop.
            if self.puppet_arm_publish_lock.acquire(False):
                return
            left_diff = [abs(left[i] - left_arm[i]) for i in range(len(left))]
            right_diff = [abs(right[i] - right_arm[i]) for i in range(len(right))]
            # flag stays False once every joint has reached its target.
            flag = False
            for i in range(len(left)):
                if left_diff[i] < self.args.arm_steps_length[i]:
                    left_arm[i] = left[i]
                else:
                    left_arm[i] += left_symbol[i] * self.args.arm_steps_length[i]
                    flag = True
            for i in range(len(right)):
                if right_diff[i] < self.args.arm_steps_length[i]:
                    right_arm[i] = right[i]
                else:
                    right_arm[i] += right_symbol[i] * self.args.arm_steps_length[i]
                    flag = True
            joint_state_msg = JointState()
            joint_state_msg.header = Header()
            joint_state_msg.header.stamp = rospy.Time.now()  # Set the timestep
            joint_state_msg.name = [
                "joint0",
                "joint1",
                "joint2",
                "joint3",
                "joint4",
                "joint5",
                "joint6",
            ]  # Set joint names
            joint_state_msg.position = left_arm
            self.puppet_arm_left_publisher.publish(joint_state_msg)
            joint_state_msg.position = right_arm
            self.puppet_arm_right_publisher.publish(joint_state_msg)
            step += 1
            print("puppet_arm_publish_continuous:", step)
            rate.sleep()
    def puppet_arm_publish_linear(self, left, right):
        """Drive both arms to the targets along a fixed 100-point linear
        trajectory published at 200 Hz."""
        num_step = 100
        rate = rospy.Rate(200)
        left_arm = None
        right_arm = None
        # Wait until at least one feedback message per arm has arrived.
        while True and not rospy.is_shutdown():
            if len(self.puppet_arm_left_deque) != 0:
                left_arm = list(self.puppet_arm_left_deque[-1].position)
            if len(self.puppet_arm_right_deque) != 0:
                right_arm = list(self.puppet_arm_right_deque[-1].position)
            if left_arm is None or right_arm is None:
                rate.sleep()
                continue
            else:
                break
        traj_left_list = np.linspace(left_arm, left, num_step)
        traj_right_list = np.linspace(right_arm, right, num_step)
        for i in range(len(traj_left_list)):
            traj_left = traj_left_list[i]
            traj_right = traj_right_list[i]
            # Hold the last joint at its final target for the whole trajectory.
            traj_left[-1] = left[-1]
            traj_right[-1] = right[-1]
            joint_state_msg = JointState()
            joint_state_msg.header = Header()
            joint_state_msg.header.stamp = rospy.Time.now()  # Set the timestamp
            joint_state_msg.name = [
                "joint0",
                "joint1",
                "joint2",
                "joint3",
                "joint4",
                "joint5",
                "joint6",
            ]  # Set joint names
            joint_state_msg.position = traj_left
            self.puppet_arm_left_publisher.publish(joint_state_msg)
            joint_state_msg.position = traj_right
            self.puppet_arm_right_publisher.publish(joint_state_msg)
            rate.sleep()
    def puppet_arm_publish_continuous_thread(self, left, right):
        """Restart the background ramping thread toward new targets."""
        if self.puppet_arm_publish_thread is not None:
            # Release the lock so the running ramp loop sees the stop signal,
            # then re-acquire it (non-blocking) for the next thread.
            self.puppet_arm_publish_lock.release()
            self.puppet_arm_publish_thread.join()
            self.puppet_arm_publish_lock.acquire(False)
            self.puppet_arm_publish_thread = None
        self.puppet_arm_publish_thread = threading.Thread(target=self.puppet_arm_publish_continuous, args=(left, right))
        self.puppet_arm_publish_thread.start()
    def get_frame(self):
        """Pop one time-aligned observation frame from all buffers.

        Aligns every stream to the earliest of the newest timestamps across
        the required topics, drops stale messages, and returns the 9-tuple
        (img_front, img_left, img_right, img_front_depth, img_left_depth,
        img_right_depth, puppet_arm_left, puppet_arm_right, robot_base) with
        None for unused entries. Returns False when alignment is impossible.
        """
        if (len(self.img_left_deque) == 0 or len(self.img_right_deque) == 0 or len(self.img_front_deque) == 0 or
            (self.args.use_depth_image and (len(self.img_left_depth_deque) == 0 or len(self.img_right_depth_deque) == 0
                                            or len(self.img_front_depth_deque) == 0))):
            return False
        # Common frame time = earliest "latest" timestamp among required topics.
        if self.args.use_depth_image:
            frame_time = min([
                self.img_left_deque[-1].header.stamp.to_sec(),
                self.img_right_deque[-1].header.stamp.to_sec(),
                self.img_front_deque[-1].header.stamp.to_sec(),
                self.img_left_depth_deque[-1].header.stamp.to_sec(),
                self.img_right_depth_deque[-1].header.stamp.to_sec(),
                self.img_front_depth_deque[-1].header.stamp.to_sec(),
            ])
        else:
            frame_time = min([
                self.img_left_deque[-1].header.stamp.to_sec(),
                self.img_right_deque[-1].header.stamp.to_sec(),
                self.img_front_deque[-1].header.stamp.to_sec(),
            ])
        # Every stream must have a message at/after frame_time; otherwise bail.
        if (len(self.img_left_deque) == 0 or self.img_left_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if (len(self.img_right_deque) == 0 or self.img_right_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if (len(self.img_front_deque) == 0 or self.img_front_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if (len(self.puppet_arm_left_deque) == 0 or self.puppet_arm_left_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if (len(self.puppet_arm_right_deque) == 0
                or self.puppet_arm_right_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if self.args.use_depth_image and (len(self.img_left_depth_deque) == 0
                                          or self.img_left_depth_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if self.args.use_depth_image and (len(self.img_right_depth_deque) == 0
                                          or self.img_right_depth_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if self.args.use_depth_image and (len(self.img_front_depth_deque) == 0
                                          or self.img_front_depth_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if self.args.use_robot_base and (len(self.robot_base_deque) == 0
                                         or self.robot_base_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        # Discard messages older than frame_time, then pop the aligned one.
        while self.img_left_deque[0].header.stamp.to_sec() < frame_time:
            self.img_left_deque.popleft()
        img_left = self.bridge.imgmsg_to_cv2(self.img_left_deque.popleft(), "passthrough")
        while self.img_right_deque[0].header.stamp.to_sec() < frame_time:
            self.img_right_deque.popleft()
        img_right = self.bridge.imgmsg_to_cv2(self.img_right_deque.popleft(), "passthrough")
        while self.img_front_deque[0].header.stamp.to_sec() < frame_time:
            self.img_front_deque.popleft()
        img_front = self.bridge.imgmsg_to_cv2(self.img_front_deque.popleft(), "passthrough")
        while self.puppet_arm_left_deque[0].header.stamp.to_sec() < frame_time:
            self.puppet_arm_left_deque.popleft()
        puppet_arm_left = self.puppet_arm_left_deque.popleft()
        while self.puppet_arm_right_deque[0].header.stamp.to_sec() < frame_time:
            self.puppet_arm_right_deque.popleft()
        puppet_arm_right = self.puppet_arm_right_deque.popleft()
        img_left_depth = None
        if self.args.use_depth_image:
            while self.img_left_depth_deque[0].header.stamp.to_sec() < frame_time:
                self.img_left_depth_deque.popleft()
            img_left_depth = self.bridge.imgmsg_to_cv2(self.img_left_depth_deque.popleft(), "passthrough")
        img_right_depth = None
        if self.args.use_depth_image:
            while self.img_right_depth_deque[0].header.stamp.to_sec() < frame_time:
                self.img_right_depth_deque.popleft()
            img_right_depth = self.bridge.imgmsg_to_cv2(self.img_right_depth_deque.popleft(), "passthrough")
        img_front_depth = None
        if self.args.use_depth_image:
            while self.img_front_depth_deque[0].header.stamp.to_sec() < frame_time:
                self.img_front_depth_deque.popleft()
            img_front_depth = self.bridge.imgmsg_to_cv2(self.img_front_depth_deque.popleft(), "passthrough")
        robot_base = None
        if self.args.use_robot_base:
            while self.robot_base_deque[0].header.stamp.to_sec() < frame_time:
                self.robot_base_deque.popleft()
            robot_base = self.robot_base_deque.popleft()
        return (
            img_front,
            img_left,
            img_right,
            img_front_depth,
            img_left_depth,
            img_right_depth,
            puppet_arm_left,
            puppet_arm_right,
            robot_base,
        )
    # Subscriber callbacks: buffer the latest 2000 messages per topic.
    def img_left_callback(self, msg):
        if len(self.img_left_deque) >= 2000:
            self.img_left_deque.popleft()
        self.img_left_deque.append(msg)
    def img_right_callback(self, msg):
        if len(self.img_right_deque) >= 2000:
            self.img_right_deque.popleft()
        self.img_right_deque.append(msg)
    def img_front_callback(self, msg):
        if len(self.img_front_deque) >= 2000:
            self.img_front_deque.popleft()
        self.img_front_deque.append(msg)
    def img_left_depth_callback(self, msg):
        if len(self.img_left_depth_deque) >= 2000:
            self.img_left_depth_deque.popleft()
        self.img_left_depth_deque.append(msg)
    def img_right_depth_callback(self, msg):
        if len(self.img_right_depth_deque) >= 2000:
            self.img_right_depth_deque.popleft()
        self.img_right_depth_deque.append(msg)
    def img_front_depth_callback(self, msg):
        if len(self.img_front_depth_deque) >= 2000:
            self.img_front_depth_deque.popleft()
        self.img_front_depth_deque.append(msg)
    def puppet_arm_left_callback(self, msg):
        if len(self.puppet_arm_left_deque) >= 2000:
            self.puppet_arm_left_deque.popleft()
        self.puppet_arm_left_deque.append(msg)
    def puppet_arm_right_callback(self, msg):
        if len(self.puppet_arm_right_deque) >= 2000:
            self.puppet_arm_right_deque.popleft()
        self.puppet_arm_right_deque.append(msg)
    def robot_base_callback(self, msg):
        if len(self.robot_base_deque) >= 2000:
            self.robot_base_deque.popleft()
        self.robot_base_deque.append(msg)
    def init_ros(self):
        """Register all topic subscribers and the command publishers."""
        rospy.init_node("joint_state_publisher", anonymous=True)
        rospy.Subscriber(
            self.args.img_left_topic,
            Image,
            self.img_left_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        rospy.Subscriber(
            self.args.img_right_topic,
            Image,
            self.img_right_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        rospy.Subscriber(
            self.args.img_front_topic,
            Image,
            self.img_front_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        if self.args.use_depth_image:
            rospy.Subscriber(
                self.args.img_left_depth_topic,
                Image,
                self.img_left_depth_callback,
                queue_size=1000,
                tcp_nodelay=True,
            )
            rospy.Subscriber(
                self.args.img_right_depth_topic,
                Image,
                self.img_right_depth_callback,
                queue_size=1000,
                tcp_nodelay=True,
            )
            rospy.Subscriber(
                self.args.img_front_depth_topic,
                Image,
                self.img_front_depth_callback,
                queue_size=1000,
                tcp_nodelay=True,
            )
        rospy.Subscriber(
            self.args.puppet_arm_left_topic,
            JointState,
            self.puppet_arm_left_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        rospy.Subscriber(
            self.args.puppet_arm_right_topic,
            JointState,
            self.puppet_arm_right_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        rospy.Subscriber(
            self.args.robot_base_topic,
            Odometry,
            self.robot_base_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        self.puppet_arm_left_publisher = rospy.Publisher(self.args.puppet_arm_left_cmd_topic, JointState, queue_size=10)
        self.puppet_arm_right_publisher = rospy.Publisher(self.args.puppet_arm_right_cmd_topic,
                                                          JointState,
                                                          queue_size=10)
        self.robot_base_publisher = rospy.Publisher(self.args.robot_base_cmd_topic, Twist, queue_size=10)
def get_arguments():
    """Parse the command-line options for the RDT-on-AgileX inference script.

    Returns:
        argparse.Namespace: publishing rates, ROS topic names, model and
        language-embedding paths, and behavior flags.
    """
    parser = argparse.ArgumentParser()
    # Stepping / reproducibility
    parser.add_argument("--max_publish_step", type=int, default=10000,
                        help="Maximum number of action publishing steps")
    parser.add_argument("--seed", type=int, default=None, help="Random seed")
    # Camera color topics
    parser.add_argument("--img_front_topic", type=str,
                        default="/camera_f/color/image_raw", help="img_front_topic")
    parser.add_argument("--img_left_topic", type=str,
                        default="/camera_l/color/image_raw", help="img_left_topic")
    parser.add_argument("--img_right_topic", type=str,
                        default="/camera_r/color/image_raw", help="img_right_topic")
    # Camera depth topics (only consumed with --use_depth_image)
    parser.add_argument("--img_front_depth_topic", type=str,
                        default="/camera_f/depth/image_raw", help="img_front_depth_topic")
    parser.add_argument("--img_left_depth_topic", type=str,
                        default="/camera_l/depth/image_raw", help="img_left_depth_topic")
    parser.add_argument("--img_right_depth_topic", type=str,
                        default="/camera_r/depth/image_raw", help="img_right_depth_topic")
    # Arm command (publish) and feedback (subscribe) topics
    parser.add_argument("--puppet_arm_left_cmd_topic", type=str,
                        default="/master/joint_left", help="puppet_arm_left_cmd_topic")
    parser.add_argument("--puppet_arm_right_cmd_topic", type=str,
                        default="/master/joint_right", help="puppet_arm_right_cmd_topic")
    parser.add_argument("--puppet_arm_left_topic", type=str,
                        default="/puppet/joint_left", help="puppet_arm_left_topic")
    parser.add_argument("--puppet_arm_right_topic", type=str,
                        default="/puppet/joint_right", help="puppet_arm_right_topic")
    # Mobile-base topics
    parser.add_argument("--robot_base_topic", type=str,
                        default="/odom_raw", help="robot_base_topic")
    # Fixed: help text previously read "robot_base_topic" (copy-paste error).
    parser.add_argument("--robot_base_cmd_topic", type=str,
                        default="/cmd_vel", help="robot_base_cmd_topic")
    parser.add_argument("--use_robot_base", action="store_true", default=False,
                        help="Whether to use the robot base to move around")
    # Rates and chunking
    parser.add_argument("--publish_rate", type=int, default=30,
                        help="The rate at which to publish the actions")
    parser.add_argument("--ctrl_freq", type=int, default=25,
                        help="The control frequency of the robot")
    parser.add_argument("--chunk_size", type=int, default=64,
                        help="Action chunk size")
    # nargs="+" added: previously type=float with a list default could not be
    # overridden from the CLI in a way consistent with the default's shape.
    parser.add_argument("--arm_steps_length", type=float, nargs="+",
                        default=[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.2],
                        help="The maximum change allowed for each joint per timestep")
    parser.add_argument("--use_actions_interpolation", action="store_true", default=False,
                        help="Whether to interpolate the actions if the difference is too large")
    parser.add_argument("--use_depth_image", action="store_true", default=False,
                        help="Whether to use depth images")
    parser.add_argument("--disable_puppet_arm", action="store_true", default=False,
                        help="Whether to disable the puppet arm. This is useful for safely debugging")
    parser.add_argument("--config_path", type=str, default="configs/base.yaml",
                        help="Path to the config file")
    # parser.add_argument('--cfg_scale', type=float, default=2.0,
    #                     help='the scaling factor used to modify the magnitude of the control features during denoising')
    parser.add_argument("--pretrained_model_name_or_path", type=str, required=True,
                        help="Name or path to the pretrained model")
    parser.add_argument("--lang_embeddings_path", type=str, required=True,
                        help="Path to the pre-encoded language instruction embeddings")
    args = parser.parse_args()
    return args
def main():
    """Script entry point: parse args, bring up ROS, and run the inference loop."""
    args = get_arguments()
    operator = RosOperator(args)
    if args.seed is not None:
        set_seed(args.seed)
    model_inference(args, get_config(args), operator)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,315 @@
import os, sys
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
from configs.state_vec import STATE_VEC_IDX_MAPPING
from pathlib import Path
# Make the repo's ``models`` package importable when this file runs as a script.
# (Previously this append was duplicated, polluting sys.path with the same entry twice.)
current_file = Path(__file__)
sys.path.append(os.path.join(current_file.parent.parent, "models"))
from multimodal_encoder.siglip_encoder import SiglipVisionTower
from multimodal_encoder.t5_encoder import T5Embedder
from rdt_runner import RDTRunner
# Indices in the unified state vector that hold the right-arm joint positions
# (6 DoF) for the AgileX setup; all other dimensions stay zero / masked out.
AGILEX_STATE_INDICES = [
    STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"] for i in range(6)
]
# Create the RDT model
def create_model(args, **kwargs):
    """Instantiate the RDT wrapper and, when given a local checkpoint file,
    load its weights into the policy."""
    model = RoboticDiffusionTransformerModel(args, **kwargs)
    ckpt_path = kwargs.get("pretrained", None)
    if ckpt_path is not None and os.path.isfile(ckpt_path):
        model.load_pretrained_weights(ckpt_path)
    return model
class RoboticDiffusionTransformerModel(object):
"""A wrapper for the RDT model, which handles
1. Model initialization
2. Encodings of instructions
3. Model inference
"""
    def __init__(
        self,
        args,
        device="cuda",
        dtype=torch.bfloat16,
        image_size=None,
        control_frequency=25,
        pretrained=None,
        pretrained_vision_encoder_name_or_path=None,
    ):
        """Store the config, build the vision encoder and policy, and move them to the device.

        Args:
            args: config dict with "common", "model", and "dataset" sections (see get_policy).
            device: torch device string used by reset().
            dtype: weight/activation dtype.
            image_size: stored but not used in this class — TODO confirm consumer.
            control_frequency: robot control frequency, stored for inference.
            pretrained: checkpoint path or model id, forwarded to get_policy.
            pretrained_vision_encoder_name_or_path: SigLIP model id or path.
        """
        self.args = args
        self.dtype = dtype
        self.image_size = image_size
        self.device = device
        self.control_frequency = control_frequency
        # We do not use the text encoder due to limited GPU memory
        # self.text_tokenizer, self.text_model = self.get_text_encoder(pretrained_text_encoder_name_or_path)
        self.image_processor, self.vision_model = self.get_vision_encoder(pretrained_vision_encoder_name_or_path)
        self.policy = self.get_policy(pretrained)
        self.reset()
    def get_policy(self, pretrained):
        """Initialize the model.

        Builds an RDTRunner from scratch when ``pretrained`` is None or a
        local checkpoint file (weights are loaded separately); otherwise
        ``pretrained`` is treated as an id/directory for from_pretrained.
        """
        # Initialize model with arguments
        if pretrained is None or os.path.isfile(pretrained):
            # Image-condition length = history x cameras x ViT patches.
            img_cond_len = (self.args["common"]["img_history_size"] * self.args["common"]["num_cameras"] *
                            self.vision_model.num_patches)
            _model = RDTRunner(
                action_dim=self.args["common"]["state_dim"],
                pred_horizon=self.args["common"]["action_chunk_size"],
                config=self.args["model"],
                lang_token_dim=self.args["model"]["lang_token_dim"],
                img_token_dim=self.args["model"]["img_token_dim"],
                state_token_dim=self.args["model"]["state_token_dim"],
                max_lang_cond_len=self.args["dataset"]["tokenizer_max_length"],
                img_cond_len=img_cond_len,
                img_pos_embed_config=[
                    # No initial pos embed in the last grid size
                    # since we've already done in ViT
                    (
                        "image",
                        (
                            self.args["common"]["img_history_size"],
                            self.args["common"]["num_cameras"],
                            # Negative size presumably signals "skip pos-embed
                            # init" for this axis — TODO confirm in RDTRunner.
                            -self.vision_model.num_patches,
                        ),
                    ),
                ],
                lang_pos_embed_config=[
                    # Similarly, no initial pos embed for language
                    ("lang", -self.args["dataset"]["tokenizer_max_length"]),
                ],
                dtype=self.dtype,
            )
        else:
            _model = RDTRunner.from_pretrained(pretrained)
        return _model
    def get_text_encoder(self, pretrained_text_encoder_name_or_path):
        """Build the T5 embedder and return its (tokenizer, encoder) pair.

        Unused by default — the call site in __init__ is commented out to save
        GPU memory; instructions are pre-encoded offline instead.
        """
        text_embedder = T5Embedder(
            from_pretrained=pretrained_text_encoder_name_or_path,
            model_max_length=self.args["dataset"]["tokenizer_max_length"],
            device=self.device,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
        return tokenizer, text_encoder
    def get_vision_encoder(self, pretrained_vision_encoder_name_or_path):
        """Build the SigLIP vision tower; returns (image_processor, vision_encoder)."""
        vision_encoder = SiglipVisionTower(vision_tower=pretrained_vision_encoder_name_or_path, args=None)
        image_processor = vision_encoder.image_processor
        return image_processor, vision_encoder
    def reset(self):
        """Set model to evaluation mode and move weights to the target device/dtype."""
        device = self.device
        weight_dtype = self.dtype
        self.policy.eval()
        # self.text_model.eval()
        self.vision_model.eval()
        self.policy = self.policy.to(device, dtype=weight_dtype)
        # self.text_model = self.text_model.to(device, dtype=weight_dtype)
        self.vision_model = self.vision_model.to(device, dtype=weight_dtype)
def load_pretrained_weights(self, pretrained=None):
if pretrained is None:
return
print(f"Loading weights from {pretrained}")
filename = os.path.basename(pretrained)
if filename.endswith(".pt"):
checkpoint = torch.load(pretrained)
self.policy.load_state_dict(checkpoint["module"])
elif filename.endswith(".safetensors"):
from safetensors.torch import load_model
load_model(self.policy, pretrained)
else:
raise NotImplementedError(f"Unknown checkpoint format: {pretrained}")
def encode_instruction(self, instruction, device="cuda"):
"""Encode string instruction to latent embeddings.
Args:
instruction: a string of instruction
device: a string of device
Returns:
pred: a tensor of latent embeddings of shape (text_max_length, 512)
"""
tokens = self.text_tokenizer(instruction, return_tensors="pt", padding="longest",
truncation=True)["input_ids"].to(device)
tokens = tokens.view(1, -1)
with torch.no_grad():
pred = self.text_model(tokens).last_hidden_state.detach()
return pred
def _format_joint_to_state(self, joints):
"""
Format the joint proprioception into the unified action vector.
Args:
joints (torch.Tensor): The joint proprioception to be formatted.
qpos ([B, N, 14]).
Returns:
state (torch.Tensor): The formatted vector for RDT ([B, N, 128]).
"""
# Rescale the gripper to the range of [0, 1]
joints = joints / torch.tensor(
[[[180, 180, 180, 180, 180, 180]]],
device=joints.device,
dtype=joints.dtype,
)
B, N, _ = joints.shape
state = torch.zeros(
(B, N, self.args["model"]["state_token_dim"]),
device=joints.device,
dtype=joints.dtype,
)
# Fill into the unified state vector
state[:, :, AGILEX_STATE_INDICES] = joints
# Assemble the mask indicating each dimension's availability
state_elem_mask = torch.zeros(
(B, self.args["model"]["state_token_dim"]),
device=joints.device,
dtype=joints.dtype,
)
state_elem_mask[:, AGILEX_STATE_INDICES] = 1
return state, state_elem_mask
def _unformat_action_to_joint(self, action):
"""
Unformat the unified action vector into the joint action to be executed.
Args:
action (torch.Tensor): The unified action vector to be unformatted.
([B, N, 128])
Returns:
joints (torch.Tensor): The unformatted robot joint action.
qpos ([B, N, 14]).
"""
action_indices = AGILEX_STATE_INDICES
joints = action[:, :, action_indices]
# Rescale the gripper back to the action range
# Note that the action range and proprioception range are different
# for Mobile ALOHA robot
joints = joints * torch.tensor(
[[[180, 180, 180, 180, 180, 180]]],
device=joints.device,
dtype=joints.dtype,
)
return joints
@torch.no_grad()
def step(self, proprio, images, text_embeds):
"""
Predict the next action chunk given the
proprioceptive states, images, and instruction embeddings.
Args:
proprio: proprioceptive states
images: RGB images, the order should be
[ext_{t-1}, right_wrist_{t-1}, left_wrist_{t-1},
ext_{t}, right_wrist_{t}, left_wrist_{t}]
text_embeds: instruction embeddings
Returns:
action: predicted action
"""
device = self.device
dtype = self.dtype
# The background image used for padding
background_color = np.array([int(x * 255) for x in self.image_processor.image_mean],
dtype=np.uint8).reshape(1, 1, 3)
background_image = (np.ones(
(
self.image_processor.size["height"],
self.image_processor.size["width"],
3,
),
dtype=np.uint8,
) * background_color)
# Preprocess the images by order and encode them
image_tensor_list = []
for image in images:
if image is None:
# Replace it with the background image
image = Image.fromarray(background_image)
if self.image_size is not None:
image = transforms.Resize(self.data_args.image_size)(image)
if self.args["dataset"].get("auto_adjust_image_brightness", False):
pixel_values = list(image.getdata())
average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
if average_brightness <= 0.15:
image = transforms.ColorJitter(brightness=(1.75, 1.75))(image)
if self.args["dataset"].get("image_aspect_ratio", "pad") == "pad":
def expand2square(pil_img, background_color):
width, height = pil_img.size
if width == height:
return pil_img
elif width > height:
result = Image.new(pil_img.mode, (width, width), background_color)
result.paste(pil_img, (0, (width - height) // 2))
return result
else:
result = Image.new(pil_img.mode, (height, height), background_color)
result.paste(pil_img, ((height - width) // 2, 0))
return result
image = expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean))
image = self.image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
image_tensor_list.append(image)
image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype)
image_embeds = self.vision_model(image_tensor).detach()
image_embeds = image_embeds.reshape(-1, self.vision_model.hidden_size).unsqueeze(0)
# Prepare the proprioception states and the control frequency
joints = proprio.to(device).unsqueeze(0) # (1, 1, 14)
states, state_elem_mask = self._format_joint_to_state(joints) # (1, 1, 128), (1, 128)
states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to(device, dtype=dtype)
states = states[:, -1:, :] # (1, 1, 128)
ctrl_freqs = torch.tensor([self.control_frequency]).to(device)
text_embeds = text_embeds.to(device, dtype=dtype)
# Predict the next action chunk given the inputs
trajectory = self.policy.predict_action(
lang_tokens=text_embeds,
lang_attn_mask=torch.ones(text_embeds.shape[:2], dtype=torch.bool, device=text_embeds.device),
img_tokens=image_embeds,
state_tokens=states,
action_mask=state_elem_mask.unsqueeze(1),
ctrl_freqs=ctrl_freqs,
)
trajectory = self._unformat_action_to_joint(trajectory).to(torch.float32)
return trajectory

View File

@ -0,0 +1,53 @@
import os
import torch
import yaml
from models.multimodal_encoder.t5_encoder import T5Embedder
# Target GPU index and model/config locations.
GPU = 0
MODEL_PATH = "google/t5-v1_1-xxl"
CONFIG_PATH = "configs/base.yaml"
SAVE_DIR = "outs/"

# Modify this to your task name and instruction
TASK_NAME = "handover_pan"
# NOTE(review): the instruction text does not obviously match the task name
# above — confirm before encoding.
INSTRUCTION = "Pick up the black marker on the right and put it into the packaging box on the left."

# Note: if your GPU VRAM is less than 24GB,
# it is recommended to enable offloading by specifying an offload directory.
OFFLOAD_DIR = (
    None  # Specify your offload directory here, ensuring the directory exists.
)
def main():
    """Encode INSTRUCTION with the T5 text encoder and save the embedding to SAVE_DIR."""
    with open(CONFIG_PATH, "r") as fp:
        config = yaml.safe_load(fp)

    device = torch.device(f"cuda:{GPU}")
    text_embedder = T5Embedder(
        from_pretrained=MODEL_PATH,
        model_max_length=config["dataset"]["tokenizer_max_length"],
        device=device,
        use_offload_folder=OFFLOAD_DIR,
    )
    tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model

    tokens = tokenizer(INSTRUCTION, return_tensors="pt", padding="longest", truncation=True)["input_ids"].to(device)
    tokens = tokens.view(1, -1)
    with torch.no_grad():
        pred = text_encoder(tokens).last_hidden_state.detach().cpu()

    # Fix: torch.save raises FileNotFoundError when SAVE_DIR does not exist yet.
    os.makedirs(SAVE_DIR, exist_ok=True)
    save_path = os.path.join(SAVE_DIR, f"{TASK_NAME}.pt")
    # We save the embeddings in a dictionary format
    torch.save({"name": TASK_NAME, "instruction": INSTRUCTION, "embeddings": pred}, save_path)
    print(
        f'"{INSTRUCTION}" from "{TASK_NAME}" is encoded by "{MODEL_PATH}" into shape {pred.shape} and saved to "{save_path}"'
    )
# Script entry point.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,57 @@
import os
import json
import argparse
import torch
import yaml
from tqdm import tqdm
from models.multimodal_encoder.t5_encoder import T5Embedder
def encode_lang(
    DATA_FILE_PATH,
    TARGET_DIR,
    GPU,
    desc_type="seen",
    tokenizer=None,
    text_encoder=None,
):
    """Encode the instructions in DATA_FILE_PATH with T5 and save one embedding
    file per instruction under TARGET_DIR/instructions.

    Args:
        DATA_FILE_PATH: JSON file mapping description types to instruction lists.
        TARGET_DIR: output directory; embeddings are written to TARGET_DIR/instructions.
        GPU: CUDA device index.
        desc_type: which instruction list to encode (e.g. "seen").
        tokenizer, text_encoder: optional pre-built encoder pair; built lazily
            when either is None so callers can reuse them across episodes.

    Returns:
        (tokenizer, text_encoder) so the caller can pass them back on the next call.
    """
    current_dir = os.path.dirname(__file__)
    with open(os.path.join(current_dir, "../configs/base.yaml"), "r") as fp:
        config = yaml.safe_load(fp)
    device = torch.device(f"cuda:{GPU}")
    if tokenizer is None or text_encoder is None:
        text_embedder = T5Embedder(
            from_pretrained=os.path.join(current_dir, "../../weights/RDT/t5-v1_1-xxl"),
            model_max_length=config["dataset"]["tokenizer_max_length"],
            device=device,
            use_offload_folder=None,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model

    with open(DATA_FILE_PATH, "r") as f_instr:
        instruction_dict = json.load(f_instr)
    instructions = instruction_dict[desc_type]

    # Encode the instructions
    tokenized_res = tokenizer(instructions, return_tensors="pt", padding="longest", truncation=True)
    tokens = tokenized_res["input_ids"].to(device)
    attn_mask = tokenized_res["attention_mask"].to(device)

    with torch.no_grad():
        text_embeds = (text_encoder(input_ids=tokens, attention_mask=attn_mask)["last_hidden_state"].detach().cpu())
    attn_mask = attn_mask.cpu().bool()

    # Fix: race-free directory creation (the exists-check + makedirs pair was racy).
    os.makedirs(os.path.join(TARGET_DIR, "instructions"), exist_ok=True)

    # Save the embeddings for training use; padding tokens are dropped via the mask.
    for i in range(len(instructions)):
        text_embed = text_embeds[i][attn_mask[i]]
        save_path = os.path.join(TARGET_DIR, f"instructions/lang_embed_{i}.pt")
        torch.save(text_embed, save_path)

    return tokenizer, text_encoder

View File

@ -0,0 +1,84 @@
import json
import os
import sys
import re
def extract_metrics_from_log(log_file_path):
    """Scan a training log for metric dicts and return the per-metric minima.

    Returns a 4-tuple (agilex_sample_mse, agilex_sample_l2err,
    overall_avg_sample_mse, overall_avg_sample_l2err), each being the best
    (smallest) value seen, or (None, None, None, None) when the file cannot be
    read or contains no metrics.
    """
    metric_re = re.compile(
        r"\{'agilex_sample_mse':\s*([0-9.eE+-]+),\s*'agilex_sample_l2err':\s*([0-9.eE+-]+),\s*'overall_avg_sample_mse':\s*([0-9.eE+-]+),\s*'overall_avg_sample_l2err':\s*([0-9.eE+-]+)\}"
    )
    found = []
    try:
        with open(log_file_path, 'r', encoding='utf-8') as fh:
            for raw_line in fh:
                match = metric_re.search(raw_line)
                if match is None:
                    continue
                values = tuple(float(group) for group in match.groups())
                found.append(values)
                print(f"Find Metrics: agilex_sample_mse={values[0]}, agilex_sample_l2err={values[1]}, "
                      f"overall_avg_sample_mse={values[2]}, overall_avg_sample_l2err={values[3]}")
    except Exception as e:
        print(f"Failed to read log: {e}")
        return (None, None, None, None)

    if not found:
        print("No metrics found in the log file")
        return (None, None, None, None)

    print(f"\nTotal {len(found)} metrics found in the log file")
    # Best value of each metric, taken independently column-by-column.
    best = tuple(min(column) for column in zip(*found))
    print(f"\nBest metrics:")
    print(f"  agilex_sample_mse: {best[0]}")
    print(f"  agilex_sample_l2err: {best[1]}")
    print(f"  overall_avg_sample_mse: {best[2]}")
    print(f"  overall_avg_sample_l2err: {best[3]}")
    return best
def generate_output_json(input_config_file, output_dir, runtime):
    """Assemble <output_dir>/output.json summarizing a run: config identity,
    runtime, output paths, and the best metrics parsed from output.log."""
    with open(input_config_file, 'r') as cfg_file:
        config = json.load(cfg_file)

    log_file = os.path.join(output_dir, 'output.log')
    best = extract_metrics_from_log(log_file)
    agilex_sample_mse, agilex_sample_l2err, overall_avg_sample_mse, overall_avg_sample_l2err = best
    if any(value is None for value in best):
        print("Warning: Some metrics are missing in the log file.")

    # Model name comes from the top-level key when present, else train.model.
    if "model_name" in config:
        model_name = config.get("model_name")
    else:
        model_name = config.get("train", {}).get("model")

    output_json = {
        "task_id": config.get("task_id"),
        "model_type": "RDT-170M",
        "model_name": model_name,
        "gpu_id": config.get("gpu_id"),
        "runtime": runtime,
        "log_path": log_file,
        "output_dir": output_dir,
        "model_path": os.path.join(output_dir, 'pytorch_model.bin'),
        "metrics": {
            "agilex_sample_mse": agilex_sample_mse,
            "agilex_sample_l2err": agilex_sample_l2err,
            "overall_avg_sample_mse": overall_avg_sample_mse,
            "overall_avg_sample_l2err": overall_avg_sample_l2err
        }
    }

    # Write output.json (pretty-printed, nulls preserved, standard JSON).
    output_json_path = os.path.join(output_dir, 'output.json')
    with open(output_json_path, 'w') as out_file:
        json.dump(output_json, out_file, indent=4, ensure_ascii=False)
# CLI: python generate_output_json.py <input_config_file> <output_dir> <runtime>
if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python generate_output_json.py <input_config_file> <output_dir> <runtime>")
        sys.exit(1)
    generate_output_json(sys.argv[1], sys.argv[2], sys.argv[3])

View File

@ -0,0 +1,325 @@
import os
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
from configs.state_vec import STATE_VEC_IDX_MAPPING
from models.multimodal_encoder.siglip_encoder import SiglipVisionTower
from models.multimodal_encoder.t5_encoder import T5Embedder
from models.rdt_runner import RDTRunner
# Indices of the single-arm ManiSkill observation inside the 128-dim unified
# state vector: 7 right-arm joint positions followed by the gripper opening.
MANISKILL_INDICES = [STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"]
                     for i in range(7)] + [STATE_VEC_IDX_MAPPING[f"right_gripper_open"]]
def create_model(args, pretrained, **kwargs):
    """Instantiate the RDT wrapper and optionally load pretrained policy weights."""
    wrapper = RoboticDiffusionTransformerModel(args, **kwargs)
    if pretrained is not None:
        wrapper.load_pretrained_weights(pretrained)
    return wrapper
# Per-dimension dataset statistics used to normalize states/actions to [-1, 1].
# Each list has 8 entries matching MANISKILL_INDICES: 7 arm joints + gripper.
DATA_STAT = {
    "state_min": [
        -0.7463043928146362,
        -0.0801204964518547,
        -0.4976441562175751,
        -2.657780647277832,
        -0.5742632150650024,
        1.8309762477874756,
        -2.2423808574676514,
        0.0,
    ],
    "state_max": [
        0.7645499110221863,
        1.4967026710510254,
        0.4650936424732208,
        -0.3866899907588959,
        0.5505855679512024,
        3.2900545597076416,
        2.5737812519073486,
        0.03999999910593033,
    ],
    "action_min": [
        -0.7472005486488342,
        -0.08631071448326111,
        -0.4995281398296356,
        -2.658363103866577,
        -0.5751323103904724,
        1.8290787935256958,
        -2.245187997817993,
        -1.0,
    ],
    "action_max": [
        0.7654682397842407,
        1.4984270334243774,
        0.46786263585090637,
        -0.38181185722351074,
        0.5517147779464722,
        3.291581630706787,
        2.575840711593628,
        1.0,
    ],
}
class RoboticDiffusionTransformerModel(object):
    """A wrapper for the RDT model, which handles
    1. Model initialization
    2. Encodings of instructions
    3. Model inference
    """

    def __init__(
        self,
        args,
        device="cuda",
        dtype=torch.bfloat16,
        image_size=None,
        control_frequency=25,
        pretrained_text_encoder_name_or_path=None,
        pretrained_vision_encoder_name_or_path=None,
    ):
        self.args = args
        self.dtype = dtype
        self.image_size = image_size
        self.device = device
        self.control_frequency = control_frequency
        self.text_tokenizer, self.text_model = self.get_text_encoder(pretrained_text_encoder_name_or_path)
        self.image_processor, self.vision_model = self.get_vision_encoder(pretrained_vision_encoder_name_or_path)
        self.policy = self.get_policy()
        # Per-dimension normalization bounds for the ManiSkill state/action spaces.
        self.state_min = torch.tensor(DATA_STAT["state_min"]).to(device)
        self.state_max = torch.tensor(DATA_STAT["state_max"]).to(device)
        self.action_min = torch.tensor(DATA_STAT["action_min"]).to(device)
        self.action_max = torch.tensor(DATA_STAT["action_max"]).to(device)
        self.reset()

    def get_policy(self):
        """Initialize the model."""
        # Initialize model with arguments
        img_cond_len = (self.args["common"]["img_history_size"] * self.args["common"]["num_cameras"] *
                        self.vision_model.num_patches)
        _model = RDTRunner(
            action_dim=self.args["common"]["state_dim"],
            pred_horizon=self.args["common"]["action_chunk_size"],
            config=self.args["model"],
            lang_token_dim=self.args["model"]["lang_token_dim"],
            img_token_dim=self.args["model"]["img_token_dim"],
            state_token_dim=self.args["model"]["state_token_dim"],
            max_lang_cond_len=self.args["dataset"]["tokenizer_max_length"],
            img_cond_len=img_cond_len,
            img_pos_embed_config=[
                # No initial pos embed in the last grid size
                # since we've already done in ViT
                (
                    "image",
                    (
                        self.args["common"]["img_history_size"],
                        self.args["common"]["num_cameras"],
                        -self.vision_model.num_patches,
                    ),
                ),
            ],
            lang_pos_embed_config=[
                # Similarly, no initial pos embed for language
                ("lang", -self.args["dataset"]["tokenizer_max_length"]),
            ],
            dtype=self.dtype,
        )
        return _model

    def get_text_encoder(self, pretrained_text_encoder_name_or_path):
        """Build and return the (tokenizer, T5 encoder) pair."""
        text_embedder = T5Embedder(
            from_pretrained=pretrained_text_encoder_name_or_path,
            model_max_length=self.args["dataset"]["tokenizer_max_length"],
            device=self.device,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
        return tokenizer, text_encoder

    def get_vision_encoder(self, pretrained_vision_encoder_name_or_path):
        """Build and return the (image processor, SigLIP vision tower) pair."""
        vision_encoder = SiglipVisionTower(vision_tower=pretrained_vision_encoder_name_or_path, args=None)
        image_processor = vision_encoder.image_processor
        return image_processor, vision_encoder

    def reset(self):
        """Set model to evaluation mode."""
        device = self.device
        weight_dtype = self.dtype
        self.policy.eval()
        self.text_model.eval()
        self.vision_model.eval()
        self.policy = self.policy.to(device, dtype=weight_dtype)
        self.text_model = self.text_model.to(device, dtype=weight_dtype)
        self.vision_model = self.vision_model.to(device, dtype=weight_dtype)

    def load_pretrained_weights(self, pretrained=None):
        """Load policy weights from a .pt or .safetensors checkpoint; no-op when None."""
        if pretrained is None:
            return
        print(f"Loading weights from {pretrained}")
        filename = os.path.basename(pretrained)
        if filename.endswith(".pt"):
            # Fix: map to CPU so GPU-saved checkpoints also load on CPU-only
            # hosts; reset() moves the policy to the target device afterwards.
            checkpoint = torch.load(pretrained, map_location="cpu")
            self.policy.load_state_dict(checkpoint["module"])
        elif filename.endswith(".safetensors"):
            from safetensors.torch import load_model
            load_model(self.policy, pretrained)
        else:
            raise NotImplementedError(f"Unknown checkpoint format: {pretrained}")

    def encode_instruction(self, instruction, device="cuda"):
        """Encode string instruction to latent embeddings.

        Args:
            instruction: a string of instruction
            device: a string of device

        Returns:
            pred: a tensor of latent embeddings of shape (text_max_length, 512)
        """
        tokens = self.text_tokenizer(instruction, return_tensors="pt", padding="longest",
                                     truncation=True)["input_ids"].to(device)
        tokens = tokens.view(1, -1)
        with torch.no_grad():
            pred = self.text_model(tokens).last_hidden_state.detach()
        return pred

    def _format_joint_to_state(self, joints):
        """
        Format the robot joint state into the unified state vector.

        Args:
            joints (torch.Tensor): The joint state to be formatted; the last
                dimension matches MANISKILL_INDICES (7 joints + gripper).

        Returns:
            state (torch.Tensor): The formatted state for RDT ([B, N, 128]),
            and the (B, 128) mask of populated dimensions.
        """
        # Normalize each dimension to [-1, 1] using the dataset statistics.
        joints = (joints - self.state_min) / (self.state_max - self.state_min) * 2 - 1
        B, N, _ = joints.shape
        state = torch.zeros(
            (B, N, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        # Assemble the unified state vector
        state[:, :, MANISKILL_INDICES] = joints
        state_elem_mask = torch.zeros(
            (B, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        state_elem_mask[:, MANISKILL_INDICES] = 1
        return state, state_elem_mask

    def _unformat_action_to_joint(self, action):
        """Extract the ManiSkill dims from the unified action and denormalize them."""
        action_indices = MANISKILL_INDICES
        joints = action[:, :, action_indices]
        # Denormalize from [-1, 1] back to the action space.
        joints = (joints + 1) / 2 * (self.action_max - self.action_min) + self.action_min
        return joints

    @torch.no_grad()
    def step(self, proprio, images, text_embeds):
        """
        Args:
            proprio: proprioceptive states
            images: RGB images
            text_embeds: instruction embeddings

        Returns:
            action: predicted action
        """
        device = self.device
        dtype = self.dtype

        # Background used when a camera image is missing.
        background_color = np.array([int(x * 255) for x in self.image_processor.image_mean],
                                    dtype=np.uint8).reshape(1, 1, 3)
        background_image = (np.ones(
            (
                self.image_processor.size["height"],
                self.image_processor.size["width"],
                3,
            ),
            dtype=np.uint8,
        ) * background_color)

        def expand2square(pil_img, background_color):
            # Pad the shorter side with the background color to make a square.
            width, height = pil_img.size
            if width == height:
                return pil_img
            elif width > height:
                result = Image.new(pil_img.mode, (width, width), background_color)
                result.paste(pil_img, (0, (width - height) // 2))
                return result
            else:
                result = Image.new(pil_img.mode, (height, height), background_color)
                result.paste(pil_img, ((height - width) // 2, 0))
                return result

        image_tensor_list = []
        for image in images:
            if image is None:
                # Replace it with the background image
                image = Image.fromarray(background_image)
            if self.image_size is not None:
                # Fix: the original referenced self.data_args.image_size, but no
                # data_args attribute is ever set on this class; __init__ stores
                # the resize target as self.image_size.
                image = transforms.Resize(self.image_size)(image)
            if self.args["dataset"].get("auto_adjust_image_brightness", False):
                pixel_values = list(image.getdata())
                average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
                if average_brightness <= 0.15:
                    image = transforms.ColorJitter(brightness=(1.75, 1.75))(image)
            if self.args["dataset"].get("image_aspect_ratio", "pad") == "pad":
                image = expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean))
            image = self.image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
            image_tensor_list.append(image)
        image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype)
        image_embeds = self.vision_model(image_tensor).detach()
        image_embeds = image_embeds.reshape(-1, self.vision_model.hidden_size).unsqueeze(0)

        # history of actions
        joints = proprio.to(device).unsqueeze(0)  # (1, 1, 14)
        states, state_elem_mask = self._format_joint_to_state(joints)  # (1, 1, 128), (1, 128)
        states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to(device, dtype=dtype)
        states = states[:, -1:, :]  # only the latest state is conditioned on  (1, 1, 128)
        ctrl_freqs = torch.tensor([self.control_frequency]).to(device)
        text_embeds = text_embeds.to(device, dtype=dtype)

        trajectory = self.policy.predict_action(
            lang_tokens=text_embeds,
            lang_attn_mask=torch.ones(text_embeds.shape[:2], dtype=torch.bool, device=text_embeds.device),
            img_tokens=image_embeds,
            state_tokens=states,
            action_mask=state_elem_mask.unsqueeze(1),
            ctrl_freqs=ctrl_freqs,
        )
        trajectory = self._unformat_action_to_joint(trajectory).to(torch.float32)
        return trajectory

View File

@ -0,0 +1,169 @@
import sys
sys.path.append("./")
import os
import h5py
import numpy as np
import pickle
import cv2
import argparse
import yaml
from scripts.encode_lang_batch_once import encode_lang
def load_hdf5(dataset_path):
    """Read one raw episode file and return the gripper/arm trajectories plus
    the per-camera RGB streams."""
    if not os.path.isfile(dataset_path):
        print(f"Dataset does not exist at \n{dataset_path}\n")
        exit()
    with h5py.File(dataset_path, "r") as root:
        left_gripper = root["/joint_action/left_gripper"][()]
        left_arm = root["/joint_action/left_arm"][()]
        right_gripper = root["/joint_action/right_gripper"][()]
        right_arm = root["/joint_action/right_arm"][()]
        image_dict = {
            cam_name: root[f"/observation/{cam_name}/rgb"][()]
            for cam_name in root["/observation/"].keys()
        }
    return left_gripper, left_arm, right_gripper, right_arm, image_dict
def images_encoding(imgs):
    """JPEG-encode a list of images and right-pad each byte string to a common length.

    Args:
        imgs: iterable of HxWx3 uint8 arrays (BGR, as produced by cv2).

    Returns:
        (padded_data, max_len): JPEG byte strings, each null-padded to max_len
        bytes, and that common length (used for the h5py "S{n}" dtype).

    Raises:
        ValueError: when cv2 fails to encode an image.
    """
    encoded = []
    max_len = 0
    for img in imgs:
        success, encoded_image = cv2.imencode(".jpg", img)
        if not success:
            # Fix: the original ignored the success flag and would crash later
            # with a confusing error on a failed encode.
            raise ValueError("cv2.imencode failed to encode an image as JPEG")
        jpeg_data = encoded_image.tobytes()
        encoded.append(jpeg_data)
        max_len = max(max_len, len(jpeg_data))
    # Fix: the original computed the padded list but returned the unpadded one,
    # leaving the padding loop as dead code. h5py null-pads "S{n}" data anyway,
    # so the stored bytes are identical; returning the padded list matches the
    # evident intent.
    padded_data = [data.ljust(max_len, b"\0") for data in encoded]
    return padded_data, max_len
def get_task_config(task_name):
    """Load ./task_config/<task_name>.yml and return the parsed dict."""
    config_path = f"./task_config/{task_name}.yml"
    with open(config_path, "r", encoding="utf-8") as cfg:
        return yaml.load(cfg.read(), Loader=yaml.FullLoader)
def data_transform(path, episode_num, save_path):
    """Convert the first `episode_num` raw episodes under `path` into the
    training layout: one HDF5 per episode containing qpos/action arrays and
    JPEG-encoded camera streams under observations/images.

    Returns the number of episodes processed.
    """
    begin = 0
    floders = os.listdir(path)  # NOTE(review): typo for "folders"; kept as-is
    assert episode_num <= len(floders), "data num not enough"
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    for i in range(episode_num):
        left_gripper_all, left_arm_all, right_gripper_all, right_arm_all, image_dict = (load_hdf5(
            os.path.join(path, f"episode{i}.hdf5")))
        qpos = []
        actions = []
        cam_high = []
        cam_right_wrist = []
        cam_left_wrist = []
        left_arm_dim = []
        right_arm_dim = []
        last_state = None  # unused; presumably left over from an earlier revision
        for j in range(0, left_gripper_all.shape[0]):
            left_gripper, left_arm, right_gripper, right_arm = (
                left_gripper_all[j],
                left_arm_all[j],
                right_gripper_all[j],
                right_arm_all[j],
            )
            # 14-dim joint state: left arm + left gripper + right arm + right gripper.
            state = np.concatenate((left_arm, [left_gripper], right_arm, [right_gripper]), axis=0)  # joint
            state = state.astype(np.float32)
            if j != left_gripper_all.shape[0] - 1:
                # Every frame except the last contributes an observation:
                # the joint state plus the three decoded, resized camera views.
                qpos.append(state)

                camera_high_bits = image_dict["head_camera"][j]
                camera_high = cv2.imdecode(np.frombuffer(camera_high_bits, np.uint8), cv2.IMREAD_COLOR)
                camera_high_resized = cv2.resize(camera_high, (640, 480))
                cam_high.append(camera_high_resized)

                camera_right_wrist_bits = image_dict["right_camera"][j]
                camera_right_wrist = cv2.imdecode(np.frombuffer(camera_right_wrist_bits, np.uint8), cv2.IMREAD_COLOR)
                camera_right_wrist_resized = cv2.resize(camera_right_wrist, (640, 480))
                cam_right_wrist.append(camera_right_wrist_resized)

                camera_left_wrist_bits = image_dict["left_camera"][j]
                camera_left_wrist = cv2.imdecode(np.frombuffer(camera_left_wrist_bits, np.uint8), cv2.IMREAD_COLOR)
                camera_left_wrist_resized = cv2.resize(camera_left_wrist, (640, 480))
                cam_left_wrist.append(camera_left_wrist_resized)
            if j != 0:
                # Every frame except the first contributes an action: the action
                # at step j-1 is the state reached at step j.
                action = state
                actions.append(action)
                left_arm_dim.append(left_arm.shape[0])
                right_arm_dim.append(right_arm.shape[0])

        if not os.path.exists(os.path.join(save_path, f"episode_{i}")):
            os.makedirs(os.path.join(save_path, f"episode_{i}"))
        hdf5path = os.path.join(save_path, f"episode_{i}/episode_{i}.hdf5")
        with h5py.File(hdf5path, "w") as f:
            f.create_dataset("action", data=np.array(actions))
            obs = f.create_group("observations")
            obs.create_dataset("qpos", data=np.array(qpos))
            obs.create_dataset("left_arm_dim", data=np.array(left_arm_dim))
            obs.create_dataset("right_arm_dim", data=np.array(right_arm_dim))
            image = obs.create_group("images")
            # Camera streams are stored as fixed-width JPEG byte strings.
            cam_high_enc, len_high = images_encoding(cam_high)
            cam_right_wrist_enc, len_right = images_encoding(cam_right_wrist)
            cam_left_wrist_enc, len_left = images_encoding(cam_left_wrist)
            image.create_dataset("cam_high", data=cam_high_enc, dtype=f"S{len_high}")
            image.create_dataset("cam_right_wrist", data=cam_right_wrist_enc, dtype=f"S{len_right}")
            image.create_dataset("cam_left_wrist", data=cam_left_wrist_enc, dtype=f"S{len_left}")
        begin += 1
        print(f"proccess {i} success!")
    return begin
# CLI: python process_data.py <task_name> <task_config> <expert_data_num>
# Converts raw episodes into the processed layout, then encodes the per-episode
# language instructions (reusing one T5 encoder across all episodes).
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process some episodes.")
    parser.add_argument("task_name", type=str)
    parser.add_argument("task_config", type=str)
    parser.add_argument("expert_data_num", type=int)
    args = parser.parse_args()

    task_name = args.task_name
    task_config = args.task_config
    expert_data_num = args.expert_data_num

    load_dir = os.path.join("../../data", str(task_name), str(task_config), "data")
    print(f"read data from path: {load_dir}")
    begin = data_transform(
        load_dir,
        expert_data_num,
        f"./processed_data/{task_name}-{task_config}-{expert_data_num}",
    )

    # tokenizer/text_encoder start as None and are built once by the first
    # encode_lang call, then reused for every subsequent episode.
    tokenizer, text_encoder = None, None
    for idx in range(expert_data_num):
        print(f"Processing Language: {idx}", end="\r")
        data_file_path = (f"../../data/{task_name}/{task_config}/instructions/episode{idx}.json")
        target_dir = (f"processed_data/{task_name}-{task_config}-{expert_data_num}/episode_{idx}")
        tokenizer, text_encoder = encode_lang(
            DATA_FILE_PATH=data_file_path,
            TARGET_DIR=target_dir,
            GPU=0,
            desc_type="seen",
            tokenizer=tokenizer,
            text_encoder=text_encoder,
        )

View File

@ -0,0 +1,42 @@
import json
import sys
def read_config(config_file, key_path):
    """
    Read a value from JSON config file.

    Args:
        config_file: Path to JSON config file
        key_path: Dot-separated path to the key (e.g., "evaluation.checkpoint_path")

    Returns:
        The value at the specified key path, or None when any segment is missing
        or a non-dict is reached before the path is exhausted.
    """
    with open(config_file, 'r') as fh:
        node = json.load(fh)

    for part in key_path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(part)
    return node
# CLI: print the value at <key_path> in <config_file>; exit 1 (with an empty
# stderr line) when the key is absent, so shell callers can test the status.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: python read_config.py <config_file> <key_path>", file=sys.stderr)
        sys.exit(1)
    config_file = sys.argv[1]
    key_path = sys.argv[2]
    value = read_config(config_file, key_path)
    if value is not None:
        print(value)
    else:
        print("", file=sys.stderr)
        sys.exit(1)

View File

@ -0,0 +1,22 @@
import sys
import yaml
def read_yaml_value(file_path, key):
    """Print the value stored under a top-level key of a YAML file, or a
    not-found message when the key is absent."""
    with open(file_path, "r") as fh:
        content = yaml.safe_load(fh)
    result = content.get(key)
    if result is None:
        print(f"Key '{key}' not found in {file_path}")
    else:
        print(result)
# CLI: python read_yaml.py <file_path> <key>
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python read_yaml.py <file_path> <key>")
        sys.exit(1)
    file_path = sys.argv[1]
    key = sys.argv[2]
    read_yaml_value(file_path, key)

View File

@ -0,0 +1,2 @@
input/
output/

14
RDT/rdt-quant/Dockerfile Normal file
View File

@ -0,0 +1,14 @@
# Quantization image built on the vendor AI-toolchain base (provides hb_compile).
FROM ai_toolchain_ubuntu_22_s100_gpu:v3.2.0

WORKDIR /app

# Non-interactive apt, unbuffered Python logs, Shanghai timezone.
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai

# Switch apt to the Tsinghua mirror for faster access from CN networks.
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
    sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list

COPY . /app/

# convert.sh drives the whole PTQ compile flow (see rdt-quant/convert.sh).
ENTRYPOINT ["bash", "convert.sh"]

30
RDT/rdt-quant/convert.sh Normal file
View File

@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Compile the RDT image adaptor and DiT policy with hb_compile, timing each stage.
# Fix: fail fast — without set -e a failed hb_compile stage was silently ignored
# and the script continued; unquoted expansions broke on paths with spaces.
set -euo pipefail

CONFIG=input/config.json
OUTPUT=/app/output/$(python3 read_json.py "$CONFIG" task_id)

# Materialize the per-model PTQ YAMLs from the config.
python3 load_config.py "$CONFIG"
echo "Convert PTQ YAML Haved been Prepared"

######### Img Adaptor
cd "$OUTPUT/Img_Adaptor"
BEGIN_IMG_ADAPTOR_TIME=$(date +%s)
echo -e "\033[44;37m===== Start Compiling Img Adaptor =====\033[0m"
hb_compile --config "$OUTPUT/img_adaptor.yaml"
echo -e "\033[44;37m===== End Compiling Img Adaptor =====\033[0m"
END_IMG_ADAPTOR_TIME=$(date +%s)
IMG_ADAPTOR_TIME=$((END_IMG_ADAPTOR_TIME - BEGIN_IMG_ADAPTOR_TIME))
echo -e "\033[44;37m===== Img Adaptor Time =====\033[0m"
echo -e "\033[44;37m===== $IMG_ADAPTOR_TIME seconds =====\033[0m"

########## DiT
cd "$OUTPUT/DiT_Policy"
BEGIN_DIT_TIME=$(date +%s)
echo -e "\033[44;37m===== Start Compiling DiT =====\033[0m"
hb_compile --config "$OUTPUT/dit.yaml"
echo -e "\033[44;37m===== End Compiling DiT =====\033[0m"
END_DIT_TIME=$(date +%s)
DIT_TIME=$((END_DIT_TIME - BEGIN_DIT_TIME))
echo -e "\033[44;37m===== DiT Time =====\033[0m"
echo -e "\033[44;37m===== $DIT_TIME seconds =====\033[0m"

View File

@ -0,0 +1,88 @@
import json
import yaml
import sys
import os
from dataclasses import dataclass
DIT = "DiT_Policy"
IMG_ADAPTOR = "Img_Adaptor"
@dataclass
class QuantConfig:
    """Flattened view of the "quant" section of the input config JSON."""

    # NOTE(review): every field defaults to None, so the `str` annotations are
    # effectively Optional[str]; kept as-is to avoid changing runtime behavior.
    task_id: str = None  # unique run id; also used as the output subdirectory name
    gpu_id: str = None
    march: str = None  # target BPU micro-architecture passed to hb_compile
    model_type: str = None  # selects the ptq_yaml/<model_type>/ template set
    output_path: str = None  # <quant.output_path>/<task_id>
    DiT_Policy_ONNX: str = None  # path to the DiT policy ONNX model
    DiT_Policy_CALIBRATION: str = None  # DiT calibration data directory
    Img_Adaptor_ONNX: str = None  # path to the image adaptor ONNX model
    Img_Adaptor_CALIBRATION: str = None  # image adaptor calibration data directory
def load_config(config_path):
    """Read the quantization config JSON and materialize per-model hb_compile YAMLs.

    Writes <output_path>/img_adaptor.yaml and <output_path>/dit.yaml based on
    the ptq_yaml/<model_type>/ templates, patched with paths from the config.
    Does nothing when the config has no "quant" section.
    """
    with open(config_path, "r") as file:
        config = json.load(file)

    if "quant" not in config:
        return

    quant_info = config["quant"]
    # Fix: the original only bound these names inside `if ... in quant_info`
    # checks, so a config missing either key raised NameError below.
    dit_policy = quant_info.get("DiT_Policy", {})
    img_adaptor = quant_info.get("Img_Adaptor", {})

    opt = QuantConfig(
        task_id=config.get("task_id"),
        gpu_id=config.get("gpu_id"),
        march=quant_info.get("march"),
        model_type=quant_info.get("model_type"),
        output_path=os.path.join(quant_info.get("output_path"), config.get("task_id")),
        DiT_Policy_ONNX=dit_policy.get("onnx_model"),
        DiT_Policy_CALIBRATION=dit_policy.get("calibration_data"),
        Img_Adaptor_ONNX=img_adaptor.get("onnx_model"),
        Img_Adaptor_CALIBRATION=img_adaptor.get("calibration_data"),
    )
    os.makedirs(opt.output_path, exist_ok=True)

    # Prepare the Img Adaptor convert YAML.
    with open(f"ptq_yaml/{opt.model_type}/img_adaptor.yaml", "r") as file:
        img_adaptor_yaml = yaml.safe_load(file)
    img_adaptor_yaml["model_parameters"]["onnx_model"] = opt.Img_Adaptor_ONNX
    img_adaptor_yaml["model_parameters"]["march"] = opt.march
    img_adaptor_yaml["model_parameters"]["output_model_file_prefix"] = "rdt_img_adaptor"
    img_adaptor_yaml["calibration_parameters"]["cal_data_dir"] = opt.Img_Adaptor_CALIBRATION
    img_adaptor_yaml["model_parameters"]["working_dir"] = IMG_ADAPTOR
    img_adaptor_yaml_path = os.path.join(opt.output_path, "img_adaptor.yaml")
    with open(img_adaptor_yaml_path, 'w') as f:
        yaml.safe_dump(img_adaptor_yaml, f, default_flow_style=False, allow_unicode=True)

    # Prepare the DiT convert YAML; substitute the calibration-dir placeholder.
    with open(f"ptq_yaml/{opt.model_type}/dit.yaml", "r") as file:
        dit_yaml = yaml.safe_load(file)
    for k, v in dit_yaml.get("calibration_parameters", {}).items():
        if isinstance(v, str) and "{dit_cal_name}" in v:
            if opt.DiT_Policy_CALIBRATION is not None:
                dit_yaml["calibration_parameters"][k] = v.replace("{dit_cal_name}", opt.DiT_Policy_CALIBRATION)
            else:
                raise ValueError(f"DiT_Policy_CALIBRATION is None, cannot replace {{dit_cal_name}} in {k}")
    dit_yaml["model_parameters"]["onnx_model"] = opt.DiT_Policy_ONNX
    dit_yaml["model_parameters"]["march"] = opt.march
    dit_yaml["model_parameters"]["working_dir"] = DIT
    # Embed the per-op quantization config JSON directly into the YAML.
    with open(f"ptq_yaml/{opt.model_type}/dit_op_config.json", "r") as file:
        dit_json = json.load(file)
    dit_yaml["calibration_parameters"]["quant_config"] = dit_json
    dit_yaml_path = os.path.join(opt.output_path, "dit.yaml")
    with open(dit_yaml_path, 'w') as f:
        yaml.safe_dump(dit_yaml, f, default_flow_style=False, allow_unicode=True)
# CLI: python load_config.py <config.json>
if __name__ == "__main__":
    config_path = sys.argv[1]
    # NOTE(review): load_config returns None; the assignment is kept as-is.
    config = load_config(config_path)

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

View File

@ -0,0 +1,29 @@
calibration_parameters:
cal_data_dir: '{dit_cal_name}/x/;{dit_cal_name}/freq/;{dit_cal_name}/t/;{dit_cal_name}/lang_c/;{dit_cal_name}/img_c/;{dit_cal_name}/lang_mask/;'
quant_config: dit_json_name
run_on_cpu: '/t_embedder/Cos;/t_embedder/Sin;/freq_embedder/Cos;/freq_embedder/Sin'
compiler_parameters:
compile_mode: latency
core_num: 1
debug: true
jobs: 8
max_time_per_fc: 0
optimize_level: O2
advice: 1
input_parameters:
input_layout_rt: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
input_layout_train: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
input_name: x;freq;t;lang_c;img_c;lang_mask;
input_shape: 1x65x1024;1;1;1x64x1024;1x4374x1024;1x64
input_space_and_range: ''
input_type_rt: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
input_type_train: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
norm_type: no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess
model_parameters:
layer_out_dump: false
debug_mode: "dump_calibration_data"
enable_vpu: True
march: {opt.march}
onnx_model: {dit_name}
output_model_file_prefix: rdt_dit
working_dir: bpu_output

View File

@ -0,0 +1,251 @@
{
"model_config": {
"all_node_type": "int16",
"model_output_type": "float32",
"activation": {
"calibration_type": ["max"],
"num_bin": [1024, 2048, 4096],
"max_num_bin": 16384,
"max_percentile": 1.0,
"per_channel": true,
"asymmetric": [true]
},
"weight": {
"bias_correction": {
"metric": "mae"
}
},
"modelwise_search": {
"metric": "mae"
}
},
"op_config": {
"ReduceMean": {"qtype": "int16"},
"Sub": {"qtype": "int16"},
"Softmax": {"qtype": "int16"}
},
"node_config": {
"/t_embedder/Mul": {"qtype": "float32"},
"/t_embedder/Cos": {"qtype": "float32"},
"/t_embedder/Sin": {"qtype": "float32"},
"/t_embedder/Concat": {"qtype": "float32"},
"/freq_embedder/Mul": {"qtype": "float32"},
"/freq_embedder/Cos": {"qtype": "float32"},
"/freq_embedder/Sin": {"qtype": "float32"},
"/freq_embedder/Concat": {"qtype": "float32"},
"/blocks.0/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.0/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.0/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.0/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.0/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.0/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.0/ffn/act/Mul": {"qtype": "int16"},
"/blocks.0/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.0/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.0/ffn/act/Add": {"qtype": "int16"},
"/blocks.0/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.0/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.0/norm1/Mul_2": {"qtype": "int16"},
"/blocks.0/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.0/Add": {"qtype": "int16"},
"/blocks.1/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.1/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.1/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.1/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.1/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.1/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.1/ffn/act/Mul": {"qtype": "int16"},
"/blocks.1/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.1/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.1/ffn/act/Add": {"qtype": "int16"},
"/blocks.1/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.1/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.1/norm1/Mul_2": {"qtype": "int16"},
"/blocks.1/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.1/Add": {"qtype": "int16"},
"/blocks.2/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.2/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.2/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.2/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.2/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.2/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.2/ffn/act/Mul": {"qtype": "int16"},
"/blocks.2/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.2/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.2/ffn/act/Add": {"qtype": "int16"},
"/blocks.2/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.2/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.2/norm1/Mul_2": {"qtype": "int16"},
"/blocks.2/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.2/Add": {"qtype": "int16"},
"/blocks.3/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.3/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.3/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.3/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.3/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.3/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.3/ffn/act/Mul": {"qtype": "int16"},
"/blocks.3/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.3/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.3/ffn/act/Add": {"qtype": "int16"},
"/blocks.3/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.3/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.3/norm1/Mul_2": {"qtype": "int16"},
"/blocks.3/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.3/Add": {"qtype": "int16"},
"/blocks.4/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.4/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.4/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.4/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.4/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.4/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.4/ffn/act/Mul": {"qtype": "int16"},
"/blocks.4/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.4/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.4/ffn/act/Add": {"qtype": "int16"},
"/blocks.4/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.4/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.4/norm1/Mul_2": {"qtype": "int16"},
"/blocks.4/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.4/Add": {"qtype": "int16"},
"/blocks.5/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.5/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.5/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.5/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.5/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.5/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.5/ffn/act/Mul": {"qtype": "int16"},
"/blocks.5/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.5/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.5/ffn/act/Add": {"qtype": "int16"},
"/blocks.5/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.5/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.5/norm1/Mul_2": {"qtype": "int16"},
"/blocks.5/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.5/Add": {"qtype": "int16"},
"/blocks.6/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.6/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.6/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.6/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.6/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.6/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.6/ffn/act/Mul": {"qtype": "int16"},
"/blocks.6/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.6/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.6/ffn/act/Add": {"qtype": "int16"},
"/blocks.6/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.6/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.6/norm1/Mul_2": {"qtype": "int16"},
"/blocks.6/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.6/Add": {"qtype": "int16"},
"/blocks.7/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.7/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.7/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.7/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.7/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.7/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.7/ffn/act/Mul": {"qtype": "int16"},
"/blocks.7/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.7/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.7/ffn/act/Add": {"qtype": "int16"},
"/blocks.7/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.7/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.7/norm1/Mul_2": {"qtype": "int16"},
"/blocks.7/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.7/Add": {"qtype": "int16"},
"/blocks.8/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.8/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.8/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.8/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.8/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.8/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.8/ffn/act/Mul": {"qtype": "int16"},
"/blocks.8/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.8/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.8/ffn/act/Add": {"qtype": "int16"},
"/blocks.8/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.8/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.8/norm1/Mul_2": {"qtype": "int16"},
"/blocks.8/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.8/Add": {"qtype": "int16"},
"/blocks.9/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.9/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.9/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.9/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.9/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.9/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.9/ffn/act/Mul": {"qtype": "int16"},
"/blocks.9/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.9/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.9/ffn/act/Add": {"qtype": "int16"},
"/blocks.9/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.9/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.9/norm1/Mul_2": {"qtype": "int16"},
"/blocks.9/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.9/Add": {"qtype": "int16"},
"/blocks.10/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.10/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.10/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.10/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.10/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.10/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.10/ffn/act/Mul": {"qtype": "int16"},
"/blocks.10/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.10/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.10/ffn/act/Add": {"qtype": "int16"},
"/blocks.10/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.10/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.10/norm1/Mul_2": {"qtype": "int16"},
"/blocks.10/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.10/Add": {"qtype": "int16"},
"/blocks.11/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.11/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.11/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.11/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.11/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.11/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.11/ffn/act/Mul": {"qtype": "int16"},
"/blocks.11/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.11/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.11/ffn/act/Add": {"qtype": "int16"},
"/blocks.11/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.11/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.11/norm1/Mul_2": {"qtype": "int16"},
"/blocks.11/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.11/Add": {"qtype": "int16"},
"/blocks.12/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.12/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.12/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.12/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.12/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.12/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.12/ffn/act/Mul": {"qtype": "int16"},
"/blocks.12/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.12/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.12/ffn/act/Add": {"qtype": "int16"},
"/blocks.12/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.12/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.12/norm1/Mul_2": {"qtype": "int16"},
"/blocks.12/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.12/Add": {"qtype": "int16"},
"/blocks.13/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.13/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.13/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.13/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.13/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.13/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.13/ffn/act/Mul": {"qtype": "int16"},
"/blocks.13/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.13/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.13/ffn/act/Add": {"qtype": "int16"},
"/blocks.13/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.13/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.13/norm1/Mul_2": {"qtype": "int16"},
"/blocks.13/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.13/Add": {"qtype": "int16"},
"/blocks.13/norm3/Div_1_reciprocal": {"qtype": "int16"},
"/final_layer/ffn_final/act/Mul_1": {"qtype": "int16"},
"/final_layer/ffn_final/act/Mul_2": {"qtype": "int16"},
"/final_layer/norm_final/Div_1_reciprocal": {"qtype": "float32"}
}
}

View File

@ -0,0 +1,33 @@
model_parameters:
onnx_model: '{img_adaptor_name}'
march: {opt.march}
layer_out_dump: False
working_dir: bpu_output
output_model_file_prefix: rdt_img_adaptor
enable_vpu: True
input_parameters:
input_name: ''
input_type_rt: 'featuremap;'
input_layout_rt: 'NCHW;'
input_type_train: 'featuremap;'
input_layout_train: 'NCHW;'
norm_type: 'no_preprocess;'
calibration_parameters:
cal_data_dir: '{img_adaptor_cal_name}'
cal_data_type: 'float32'
calibration_type: 'default'
quant_config:
model_config:
all_node_type: int16
model_output_type: int16
compiler_parameters:
extra_params:
input_no_padding: true
output_no_padding: true
jobs: 8
compile_mode: 'latency'
debug: True
advice: 1
optimize_level: 'O2'
core_num: 2

View File

@ -0,0 +1,29 @@
calibration_parameters:
cal_data_dir: '{dit_cal_name}/x/;{dit_cal_name}/freq/;{dit_cal_name}/t/;{dit_cal_name}/lang_c/;{dit_cal_name}/img_c/;{dit_cal_name}/lang_mask/;'
quant_config: dit_json_name
run_on_cpu: '/t_embedder/Cos;/t_embedder/Sin;/freq_embedder/Cos;/freq_embedder/Sin'
compiler_parameters:
compile_mode: latency
core_num: 1
debug: true
jobs: 8
max_time_per_fc: 0
optimize_level: O2
advice: 1
input_parameters:
input_layout_rt: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
input_layout_train: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
input_name: x;freq;t;lang_c;img_c;lang_mask;
input_shape: 1x65x2048;1;1;1x64x2048;1x4374x2048;1x64
input_space_and_range: ''
input_type_rt: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
input_type_train: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
norm_type: no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess
model_parameters:
layer_out_dump: false
debug_mode: "dump_calibration_data"
enable_vpu: True
march: {opt.march}
onnx_model: {dit_name}
output_model_file_prefix: rdt_dit
working_dir: bpu_output

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,32 @@
model_parameters:
onnx_model: '{img_adaptor_name}'
march: {opt.march}
layer_out_dump: False
working_dir: bpu_output
output_model_file_prefix: rdt_img_adaptor
enable_vpu: True
input_parameters:
input_name: ''
input_type_rt: 'featuremap;'
input_layout_rt: 'NCHW;'
input_type_train: 'featuremap;'
input_layout_train: 'NCHW;'
norm_type: 'no_preprocess;'
calibration_parameters:
cal_data_dir: '{img_adaptor_cal_name}'
cal_data_type: 'float32'
calibration_type: 'default'
quant_config:
model_config:
all_node_type: int16
model_output_type: int16
compiler_parameters:
extra_params:
input_no_padding: true
output_no_padding: true
jobs: 8
compile_mode: 'latency'
debug: True
advice: 1
optimize_level: 'O2'
core_num: 2

View File

@ -0,0 +1,42 @@
import json
import sys
def read_config(config_file, key_path):
    """Look up a dotted key path inside a JSON configuration file.

    Args:
        config_file: Path to the JSON config file.
        key_path: Dot-separated path to the key
            (e.g., "evaluation.checkpoint_path").

    Returns:
        The value stored at the key path, or None when any path segment is
        missing or a non-dict value is reached before the path is exhausted.
    """
    with open(config_file, 'r') as f:
        node = json.load(f)
    # Walk the nested dicts one path segment at a time.
    for segment in key_path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(segment)
    return node
if __name__ == "__main__":
    # CLI entry point: print the value at <key_path> from <config_file>.
    # Exits 1 (with usage or an empty stderr line) on bad args or a missing key.
    if len(sys.argv) < 3:
        print("Usage: python read_config.py <config_file> <key_path>", file=sys.stderr)
        sys.exit(1)
    result = read_config(sys.argv[1], sys.argv[2])
    if result is None:
        print("", file=sys.stderr)
        sys.exit(1)
    print(result)

View File

@ -0,0 +1,2 @@
input/*
output/*

7
RDT/rdt170m-run/.gitignore vendored Normal file
View File

@ -0,0 +1,7 @@
processed_data/
training_data/
checkpoints/
model_config/*.yml
wandb/*
!models/
!data/

View File

@ -0,0 +1,48 @@
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
RUN apt-get update --allow-unauthenticated && apt-get install -y \
software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update \
&& apt-get install -y \
python3.10 \
python3.10-dev \
python3-pip \
python3.10-distutils \
libgl1-mesa-glx \
libglib2.0-0 \
wget \
ffmpeg \
libsm6 \
libxext6 \
&& rm -rf /var/lib/apt/lists/*
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
COPY . /app/
RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
# RUN pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
# RUN pip install torch==2.1.0 torchvision==0.16.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install packaging==24.0
RUN pip install tfds-nightly==4.9.4.dev202402070044
RUN pip install flash_attn-2.7.2.post1+cu12torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# RUN mkdir -p /app/dataset/input /app/dataset/output
ENTRYPOINT ["bash", "deploy.sh"]

View File

@ -0,0 +1 @@
from .deploy_policy import *

Binary file not shown.

After

Width:  |  Height:  |  Size: 726 KiB

300
RDT/rdt170m-run/client.py Normal file
View File

@ -0,0 +1,300 @@
#!/usr/bin/env python3
"""
RDT 推理服务器测试客户端
使用模拟数据测试 get_actions 接口
"""
import numpy as np
import logging
import argparse
import time
from cloud_helper import Client
# 配置日志
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
def create_mock_observation(
    state_dim=6,
    img_history_size=2,
    img_height=480,
    img_width=640,
    num_cameras=3
):
    """Build a synthetic observation dict for exercising the inference server.

    Args:
        state_dim: Length of the proprioceptive state vector (joint count).
        img_history_size: Number of stacked frames per camera.
        img_height: Frame height in pixels.
        img_width: Frame width in pixels.
        num_cameras: How many of the predefined cameras to include.

    Returns:
        Dict with a random "state" vector (float32, uniform in [-180, 180])
        and, per camera, an "images.<cam_name>" array of shape
        (img_history_size, img_height, img_width, 3), dtype uint8.
        msgpack_numpy handles serialization of these arrays transparently.
    """
    obs = {
        "state": np.random.uniform(-180, 180, size=(state_dim,)).astype(np.float32)
    }
    cam_names = ["cam_high", "cam_left_wrist", "cam_right_wrist"][:num_cameras]
    for cam_idx, cam_name in enumerate(cam_names):
        frames = []
        for step in range(img_history_size):
            frame = np.zeros((img_height, img_width, 3), dtype=np.uint8)
            # Per-frame / per-camera colour shift gives every image distinct content.
            shift = (step * 50 + cam_idx * 100) % 255
            frame[:, :, 0] = np.linspace(shift, 255, img_width, dtype=np.uint8)         # R gradient
            frame[:, :, 1] = np.linspace(0, 255 - shift, img_height, dtype=np.uint8)[:, None]  # G gradient
            frame[:, :, 2] = 128                                                        # constant B
            frames.append(frame)
        # Stack to (IMG_HISTORY_SIZE, H, W, 3).
        obs[f"images.{cam_name}"] = np.stack(frames, axis=0)
    return obs
def create_test_batch(
    observation,
    instruction="pick up the bottle and place it in the box",
    use_instruction_index=False
):
    """Assemble a request payload for the `get_actions` endpoint.

    Args:
        observation: Observation dict (state + camera images).
        instruction: Natural-language command; ignored when
            use_instruction_index is True.
        use_instruction_index: When True, send instruction index 0 instead
            of the instruction string.

    Returns:
        Dict with "observation" and "instruction" keys, ready to send.
    """
    chosen = 0 if use_instruction_index else instruction
    return {"observation": observation, "instruction": chosen}
def test_single_request(client, args):
    """Run one end-to-end `get_actions` request with mock data and log the result.

    Returns True on success, False if the RPC raised.
    """
    logger.info("=" * 60)
    logger.info("开始单次请求测试")
    logger.info("=" * 60)
    # Build a mock observation matching the server's expected layout.
    observation = create_mock_observation(
        state_dim=args.state_dim,
        img_history_size=args.img_history_size,
        img_height=args.img_height,
        img_width=args.img_width,
        num_cameras=args.num_cameras
    )
    logger.info(f"模拟观测数据:")
    logger.info(f" - state shape: {observation['state'].shape}")
    for key in observation.keys():
        if key.startswith("images."):
            logger.info(f" - {key} shape: {observation[key].shape}")
    # Wrap observation + instruction into a request batch.
    batch = create_test_batch(
        observation,
        instruction=args.instruction,
        use_instruction_index=args.use_index
    )
    # Send the request and time the full round trip.
    logger.info(f"发送指令: {batch['instruction']}")
    start_time = time.time()
    try:
        action = client.call_endpoint("get_actions", batch)
        elapsed_time = time.time() - start_time
        logger.info(f"✓ 请求成功! 耗时: {elapsed_time*1000:.2f} ms")
        logger.info(f" - action shape: {action.shape}")
        logger.info(f" - action dtype: {action.dtype}")
        logger.info(f" - action range: [{action.min():.3f}, {action.max():.3f}]")
        logger.info(f" - action preview (前3个时间步的前3个维度):")
        # Preview at most the first 3 timesteps x 3 dims of the action chunk.
        preview_steps = min(3, action.shape[0])
        preview_dims = min(3, action.shape[1])
        for t in range(preview_steps):
            logger.info(f" t={t}: {action[t, :preview_dims]}")
        return True
    except Exception as e:
        logger.error(f"✗ 请求失败: {e}")
        return False
def test_multiple_requests(client, args):
    """Fire `args.num_requests` identical requests and report latency statistics."""
    logger.info("=" * 60)
    logger.info(f"开始连续请求测试 (共 {args.num_requests} 次)")
    logger.info("=" * 60)
    # Build one observation/batch up front so the loop measures only the RPC.
    observation = create_mock_observation(
        state_dim=args.state_dim,
        img_history_size=args.img_history_size,
        img_height=args.img_height,
        img_width=args.img_width,
        num_cameras=args.num_cameras
    )
    batch = create_test_batch(
        observation,
        instruction=args.instruction,
        use_instruction_index=args.use_index
    )
    success_count = 0
    total_time = 0
    latencies = []
    for i in range(args.num_requests):
        try:
            start_time = time.time()
            action = client.call_endpoint("get_actions", batch)
            elapsed_time = time.time() - start_time
            success_count += 1
            total_time += elapsed_time
            latencies.append(elapsed_time)
            # Progress log every 10 completed requests.
            if (i + 1) % 10 == 0:
                logger.info(f"已完成 {i + 1}/{args.num_requests} 次请求")
        except Exception as e:
            logger.error(f"第 {i+1} 次请求失败: {e}")
    # Summarize success rate and latency distribution.
    logger.info("=" * 60)
    logger.info("性能统计:")
    logger.info(f" - 总请求数: {args.num_requests}")
    logger.info(f" - 成功数: {success_count}")
    logger.info(f" - 失败数: {args.num_requests - success_count}")
    logger.info(f" - 成功率: {success_count/args.num_requests*100:.1f}%")
    if latencies:
        latencies = np.array(latencies)
        logger.info(f" - 平均延迟: {np.mean(latencies)*1000:.2f} ms")
        logger.info(f" - 中位数延迟: {np.median(latencies)*1000:.2f} ms")
        logger.info(f" - 最小延迟: {np.min(latencies)*1000:.2f} ms")
        logger.info(f" - 最大延迟: {np.max(latencies)*1000:.2f} ms")
        logger.info(f" - 吞吐量: {success_count/total_time:.2f} requests/s")
def test_different_instructions(client, args):
    """Send the same mock observation with several different language instructions."""
    logger.info("=" * 60)
    logger.info("测试不同指令")
    logger.info("=" * 60)
    instructions = [
        "pick up the red cube",
        "place the bottle on the table",
        "move to the left",
        "grasp the bottle",
        "open the drawer"
    ]
    # One shared observation; only the instruction varies between requests.
    observation = create_mock_observation(
        state_dim=args.state_dim,
        img_history_size=args.img_history_size,
        img_height=args.img_height,
        img_width=args.img_width,
        num_cameras=args.num_cameras
    )
    for i, instruction in enumerate(instructions):
        logger.info(f"\n测试指令 {i+1}/{len(instructions)}: '{instruction}'")
        batch = create_test_batch(observation, instruction=instruction)
        try:
            start_time = time.time()
            action = client.call_endpoint("get_actions", batch)
            elapsed_time = time.time() - start_time
            logger.info(f" ✓ 成功 | 耗时: {elapsed_time*1000:.2f} ms | action shape: {action.shape}")
        except Exception as e:
            logger.error(f" ✗ 失败: {e}")
def main():
    """Parse CLI arguments, connect to the RDT server, and run the chosen test mode."""
    parser = argparse.ArgumentParser(description="RDT 推理服务器测试客户端")
    # Connection options.
    parser.add_argument("--host", type=str, default="localhost", help="服务器地址")
    parser.add_argument("--port", type=int, default=8005, help="服务器端口")
    # Test mode selection.
    parser.add_argument("--mode", type=str, default="single",
                        choices=["single", "multiple", "instructions"],
                        help="测试模式: single(单次), multiple(多次), instructions(不同指令)")
    parser.add_argument("--num-requests", type=int, default=50,
                        help="多次测试的请求数量")
    # Mock-data dimensions (must match the server's configuration).
    parser.add_argument("--state-dim", type=int, default=6, help="状态向量维度")
    parser.add_argument("--img-history-size", type=int, default=2, help="图像历史长度")
    parser.add_argument("--img-height", type=int, default=480, help="图像高度")
    parser.add_argument("--img-width", type=int, default=640, help="图像宽度")
    parser.add_argument("--num-cameras", type=int, default=3, help="相机数量 (与服务器配置一致)")
    # Instruction options.
    parser.add_argument("--instruction", type=str,
                        default="pick up the bottle and place it in the box",
                        help="测试指令")
    parser.add_argument("--use-index", action="store_true",
                        help="使用指令索引而非字符串")
    args = parser.parse_args()
    # Connect to the inference server; bail out early if unreachable.
    logger.info(f"正在连接到 {args.host}:{args.port} ...")
    try:
        client = Client(host=args.host, port=args.port)
        logger.info("✓ 连接成功!")
    except Exception as e:
        logger.error(f"✗ 连接失败: {e}")
        return
    # Dispatch to the selected test routine.
    try:
        if args.mode == "single":
            test_single_request(client, args)
        elif args.mode == "multiple":
            test_multiple_requests(client, args)
        elif args.mode == "instructions":
            test_different_instructions(client, args)
    except KeyboardInterrupt:
        logger.info("\n测试被用户中断")
    except Exception as e:
        logger.error(f"测试过程中发生错误: {e}")
        import traceback
        traceback.print_exc()

View File

@ -0,0 +1,162 @@
import zmq
import msgpack
import msgpack_numpy as m
import logging
import time
from typing import Any, Callable
import zstandard as zstd
logger = logging.getLogger(__name__)
# Shared zstd (de)compressors; level 12 trades CPU for smaller payloads on the wire.
compresser = zstd.ZstdCompressor(level=12)
decompresser = zstd.ZstdDecompressor()
def _pack(data: Any) -> bytes:
    """Serialize `data` with msgpack (numpy-aware via msgpack_numpy) and zstd-compress it."""
    return compresser.compress(msgpack.packb(data, default=m.encode, use_bin_type=True))
def _unpack(data: bytes) -> Any:
    """Inverse of `_pack`: zstd-decompress then msgpack-decode (restoring numpy arrays)."""
    return msgpack.unpackb(
        decompresser.decompress(data), object_hook=m.decode, raw=False
    )
class Server:
    """Minimal ZeroMQ REP server that dispatches named commands to handlers.

    Wire protocol (compressed msgpack, see `_pack`/`_unpack`):
      request:  {"command": str, "data": Any}
      response: {"status": "ok" | "error", "data": result or error message}
    """
    def __init__(self, host: str = "*", port: int = 5555):
        self.host = host
        self.port = port
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REP)
        self.socket.bind(f"tcp://{self.host}:{self.port}")
        logger.info(f"Server started at tcp://{self.host}:{self.port}")
        # Maps command name -> handler callable.
        self.endpoints: dict[str, Callable[[Any], Any]] = {}
    def register_endpoint(self, command: str, func: Callable[[Any], Any]):
        """Register `func` as the handler for `command` (overwrites any existing one)."""
        self.endpoints[command] = func
        logger.info(f"Registered endpoint: {command} -> {func}")
    def return_error(self, message: str) -> None:
        # A REP socket must answer every request, even failures.
        self.socket.send(_pack({"status": "error", "data": message}))
    def return_ok(self, data: Any) -> None:
        self.socket.send(_pack({"status": "ok", "data": data}))
    def handle_once(self) -> None:
        """Receive one request, dispatch it, and send exactly one reply."""
        message = self.socket.recv()
        message = _unpack(message)
        cmd = message.get("command")
        data = message.get("data")
        logger.info("Received Command: %s", cmd)
        handler = self.endpoints.get(cmd)
        if handler is not None:
            try:
                # Handlers may be zero-arg; only pass data when it was provided.
                if data is None:
                    response = handler()
                else:
                    response = handler(data)
                self.return_ok(response)
            except Exception as e:
                # Handler failures are reported to the client, not fatal to the server.
                logger.error(f"Error handling command {cmd}: {e}")
                self.return_error(str(e))
        else:
            logger.warning(f"Unknown command: {cmd}")
            self.return_error(f"Unknown command: {cmd}")
    def loop_forever(self):
        """Serve requests until KeyboardInterrupt, then close the socket and context."""
        try:
            while True:
                self.handle_once()
        except KeyboardInterrupt:
            logger.info("Server shutting down...")
        finally:
            self.socket.close()
            self.context.term()
class Client:
    """Blocking ZeroMQ REQ client matching the Server's request/response protocol."""
    def __init__(self, host: str = "localhost", port: int = 5555):
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REQ)
        self.socket.connect(f"tcp://{host}:{port}")
        logger.info(f"Client connected to tcp://{host}:{port}")
    def call_endpoint(self, command: str, data=None):
        """Send `command` with an optional payload and return the server's data.

        Raises:
            Exception: when the server replies with a non-"ok" status; the
                exception message carries the server-side error text.
        """
        self.socket.send(_pack({"command": command, "data": data}))
        message = self.socket.recv()
        message = _unpack(message)
        if message.get("status") == "ok":
            return message.get("data")
        else:
            logger.error(f"Error from server: {message.get('data')}")
            raise Exception(f"Error from server: {message.get('data')}")
def freq_control(freq: int = 25):
    """Decorator factory that throttles calls to at most `freq` Hz.

    After the wrapped function returns, sleeps for whatever remains of the
    1/freq period, so back-to-back calls run at a steady rate. The wrapped
    function's return value is passed through unchanged.

    Args:
        freq: Target call frequency in Hz.

    Returns:
        A decorator that preserves the wrapped function's metadata.
    """
    import functools

    def decorator(func):
        # functools.wraps keeps func's __name__/__doc__ intact on the wrapper
        # (the original decorator lost them, which breaks logging/introspection).
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            result = func(*args, **kwargs)
            elapsed_time = time.time() - start_time
            # logger.info(f"'{func.__name__}' tooks {elapsed_time * 1000:.2f} ms")
            # Sleep off the remainder of the period; never a negative sleep.
            time.sleep(max(0, (1.0 / freq) - elapsed_time))
            return result
        return wrapper
    return decorator
if __name__ == "__main__":
    import sys
    from time import sleep
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )
    # Walrus assignment binds `mode` while validating argv in one expression.
    assert (len(sys.argv) == 2) and ((mode := sys.argv[1]) in ("server", "client")), (
        "Usage: python service.py [server|client]"
    )
    ## Protocol:
    # Request: { "command": str, "data": Any }
    # Response: { "status": "ok" | "error", "data": Any if status=="ok" else str (ErrorMsg) }
    if mode == "server":
        # Demo server exposing three trivial endpoints.
        server = Server()
        server.register_endpoint("ping", lambda: "pong")
        server.register_endpoint("echo", lambda x: x)
        server.register_endpoint("add", lambda data: data["a"] + data["b"])
        server.loop_forever()
    elif mode == "client":
        # Demo client: exercises each endpoint in a loop until an error occurs.
        client = Client()
        while True:
            try:
                response = client.call_endpoint("ping")
                print(f"Response from server: {response}")
                response = client.call_endpoint("echo", "Hello, World!")
                print(f"Response from server: {response}")
                response = client.call_endpoint("add", {"a": 5, "b": 10})
                print(f"Response from server: {response}")
                sleep(0.2)
            except Exception as e:
                print(f"Error: {e}")
                break

View File

@ -0,0 +1,71 @@
common:
# The number of historical images
img_history_size: 2
# The number of future actions to predict
action_chunk_size: 64
# The number of cameras to be used in the model
num_cameras: 3
# Dimension for state/action, we use the same space for both state and action
# This MUST be equal to configs/state_vec.py
state_dim: 128
dataset:
# We will extract the data from raw dataset
# and store them in the disk buffer by producer
# When training, we will read the data
# randomly from the buffer by consumer
# The producer will replace the data which has been
# read by the consumer with new data
# The path to the buffer (at least 400GB)
buf_path: /path/to/buffer
# The number of chunks in the buffer
buf_num_chunks: 512
# The number of samples (step rather than episode) in each chunk
buf_chunk_size: 512
# We will filter the episodes with length less than `epsd_len_thresh_low`
epsd_len_thresh_low: 32
# For those more than `epsd_len_thresh_high`,
# we will randomly sample `epsd_len_thresh_high` steps each time we load the episode
# to better balance the training datasets
epsd_len_thresh_high: 2048
# How to fit the image size
image_aspect_ratio: pad
# Maximum number of language tokens
tokenizer_max_length: 1024
model:
# Config for condition adaptors
lang_adaptor: mlp2x_gelu
img_adaptor: mlp2x_gelu
state_adaptor: mlp3x_gelu
lang_token_dim: 4096
img_token_dim: 1152
# Dim of action or proprioception vector
# A `state` refers to an action or a proprioception vector
state_token_dim: 128
# Config for RDT structure
rdt:
# 1B: num_head 32 hidden_size 2048
hidden_size: 2048
depth: 28
num_heads: 32
cond_pos_embed_type: multimodal
# For noise scheduler
noise_scheduler:
type: ddpm
num_train_timesteps: 1000
num_inference_timesteps: 5
beta_schedule: squaredcos_cap_v2 # Critical choice
prediction_type: sample
clip_sample: False
# For EMA (params averaging)
# We do not use EMA currently
ema:
update_after_step: 0
inv_gamma: 1.0
power: 0.75
min_value: 0.0
max_value: 0.9999

View File

@ -0,0 +1,50 @@
{
"A": [
[
-0.2691913843154907,
-0.21995729207992554,
-0.182277649641037
],
[
0.35127854347229004,
0.2769763469696045,
0.17159393429756165
]
],
"B": [
[
-0.2576896846294403,
-0.22244493663311005,
-0.20557966828346252
],
[
0.32854634523391724,
0.2922680974006653,
0.17373555898666382
]
],
"C": [
[
-0.29205888509750366,
-0.24688798189163208,
-0.17577645182609558
],
[
0.25053921341896057,
0.3277084231376648,
0.16431939601898193
]
],
"D": [
[
-0.25131964683532715,
-0.15233077108860016,
-0.13294968008995056
],
[
0.19209328293800354,
0.19344553351402283,
0.1370421051979065
]
]
}

View File

@ -0,0 +1,65 @@
{
"fractal20220817_data": 3,
"taco_play": 15,
"jaco_play": 10,
"berkeley_cable_routing": 10,
"nyu_door_opening_surprising_effectiveness": 3,
"viola": 20,
"berkeley_autolab_ur5": 5,
"toto": 30,
"kuka": 10,
"language_table": 10,
"columbia_cairlab_pusht_real": 10,
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 20,
"nyu_rot_dataset_converted_externally_to_rlds":3,
"stanford_hydra_dataset_converted_externally_to_rlds": 10,
"austin_buds_dataset_converted_externally_to_rlds": 20,
"nyu_franka_play_dataset_converted_externally_to_rlds": 3,
"maniskill_dataset_converted_externally_to_rlds": 20,
"furniture_bench_dataset_converted_externally_to_rlds": 10,
"ucsd_kitchen_dataset_converted_externally_to_rlds": 2,
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 3,
"austin_sailor_dataset_converted_externally_to_rlds": 20,
"austin_sirius_dataset_converted_externally_to_rlds": 20,
"bc_z": 10,
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 10,
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 10,
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
"utokyo_xarm_bimanual_converted_externally_to_rlds": 10,
"berkeley_mvp_converted_externally_to_rlds": 5,
"berkeley_rpt_converted_externally_to_rlds": 30,
"kaist_nonprehensile_converted_externally_to_rlds": 10,
"stanford_mask_vit_converted_externally_to_rlds": 0,
"tokyo_u_lsmo_converted_externally_to_rlds": 10,
"dlr_sara_pour_converted_externally_to_rlds": 10,
"dlr_sara_grid_clamp_converted_externally_to_rlds": 10,
"dlr_edan_shared_control_converted_externally_to_rlds": 5,
"asu_table_top_converted_externally_to_rlds": 12.5,
"stanford_robocook_converted_externally_to_rlds": 5,
"eth_agent_affordances": 66.6,
"imperialcollege_sawyer_wrist_cam": 10,
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 20,
"uiuc_d3field": 1,
"utaustin_mutex": 20,
"berkeley_fanuc_manipulation": 10,
"cmu_play_fusion": 5,
"cmu_stretch": 10,
"berkeley_gnm_recon": 3,
"berkeley_gnm_cory_hall": 5,
"berkeley_gnm_sac_son": 10,
"robo_net": 1,
"roboturk_real_towercreation": 10,
"roboturk_real_laundrylayout": 10,
"roboturk_real_objectsearch": 10,
"aloha_mobile": 50,
"aloha_static": 50,
"roboset": 5,
"droid": 15,
"fmb": 10,
"dobbe": 30,
"qut_dexterous_manpulation": 30,
"agilex": 25,
"rh20t": 10,
"calvin": 30,
"bridgev2": 5
}

View File

@ -0,0 +1,575 @@
{
"fractal20220817_data": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[
1,0,0,0
]
},
"taco_play": {
"image_keys": [
"rgb_static",
"rgb_gripper",
"rgb_static",
"rgb_static"
],
"image_mask":[
1,1,0,0
]
},
"jaco_play": {
"image_keys": [
"image",
"image_wrist",
"image_wrist",
"image_wrist"
],
"image_mask":[
1,1,0,0
]
},
"berkeley_cable_routing": {
"image_keys": [
"image",
"wrist45_image",
"wrist225_image",
"top_image"
],
"image_mask":[1,1,0,1]
},
"nyu_door_opening_surprising_effectiveness": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"viola": {
"image_keys": [
"agentview_rgb",
"eye_in_hand_rgb",
"eye_in_hand_rgb",
"eye_in_hand_rgb"
],
"image_mask":[1,1,0,0]
},
"berkeley_autolab_ur5": {
"image_keys": [
"image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[1,1,0,0]
},
"toto": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"kuka": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"language_table": {
"image_keys": [
"rgb",
"rgb",
"rgb",
"rgb"
],
"image_mask":[1,0,0,0]
},
"columbia_cairlab_pusht_real": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"nyu_rot_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_hydra_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"austin_buds_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"nyu_franka_play_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image_additional_view",
"image_additional_view",
"image_additional_view"
],
"image_mask":[1,0,0,1]
},
"maniskill_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"furniture_bench_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"ucsd_kitchen_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"austin_sailor_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"austin_sirius_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"bc_z": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": {
"image_keys": [
"image",
"hand_image",
"hand_image",
"image2"
],
"image_mask":[1,1,0,1]
},
"utokyo_xarm_bimanual_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_mvp_converted_externally_to_rlds": {
"image_keys": [
"hand_image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[0,1,0,0]
},
"berkeley_rpt_converted_externally_to_rlds": {
"image_keys": [
"hand_image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[0,1,0,0]
},
"kaist_nonprehensile_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_mask_vit_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"tokyo_u_lsmo_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_sara_pour_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_sara_grid_clamp_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_edan_shared_control_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"asu_table_top_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_robocook_converted_externally_to_rlds": {
"image_keys": [
"image_2",
"image_1",
"image_3",
"image_4"
],
"image_mask":[1,0,0,1]
},
"eth_agent_affordances": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"imperialcollege_sawyer_wrist_cam": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[0,1,0,0]
},
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"uiuc_d3field": {
"image_keys": [
"image_1",
"image_2",
"image_3",
"image_4"
],
"image_mask":[1,0,0,1]
},
"utaustin_mutex": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"berkeley_fanuc_manipulation": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"cmu_play_fusion": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"cmu_stretch": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_recon": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_cory_hall": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_sac_son": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"robo_net": {
"image_keys": [
"image",
"image1",
"image2",
"image2"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_towercreation": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_laundrylayout": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_objectsearch": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"aloha_mobile": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_right_wrist"
],
"image_mask":[1,1,1,0]
},
"aloha_static": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_low"
],
"image_mask":[1,1,1,1]
},
"roboset": {
"image_keys": [
"rgb_top",
"rgb_right",
"rgb_left",
"rgb_right"
],
"image_mask":[1,1,1,0]
},
"droid": {
"image_keys": [
"exterior_image_1_left",
"wrist_image_left",
"wrist_image_left",
"exterior_image_2_left"
],
"image_mask":[1,1,0,1]
},
"fmb": {
"image_keys": [
"image_side_1",
"image_wrist_1",
"image_wrist_1",
"image_side_2"
],
"image_mask":[1,1,0,1]
},
"dobbe": {
"image_keys": [
"wrist_image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[0,1,0,0]
},
"qut_dexterous_manpulation": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"agilex": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_right_wrist"
],
"image_mask":[1,1,1,0]
},
"rh20t": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"calvin": {
"image_keys": [
"rgb_static",
"rgb_gripper",
"rgb_gripper",
"rgb_gripper"
],
"image_mask":[1,1,0,0]
},
"bridgev2": {
"image_keys": [
"images0",
"images0",
"images0",
"images0"
],
"image_mask":[1,0,0,0]
}
}

View File

@ -0,0 +1,525 @@
{
"agilex": {
"dataset_name": "agilex",
"state_mean": [
-0.0036545392947090432,
-0.2773659935760079,
0.3147616748061523,
0.3813313179910183,
0.04028575944090457,
0.034888520819083294,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_std": [
0.05763674563578847,
0.2580181064167735,
0.19785840483767897,
0.05020347749331385,
0.054529239104671424,
0.05020521339363586,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_min": [
-0.17447535196940103,
-0.5522612677680121,
-0.3340397516886393,
0.21861712137858072,
-0.09725829230414497,
0.003396739231215583,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_max": [
0.21961932712131077,
0.30613206227620443,
0.5444545321994357,
0.4866888682047526,
0.31486290825737845,
0.3355223337809245,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
]
}
}

View File

@ -0,0 +1,3 @@
[
"agilex"
]

View File

@ -0,0 +1,3 @@
{
"agilex": 100
}

View File

@ -0,0 +1,48 @@
[
"fractal20220817_data",
"jaco_play",
"taco_play",
"berkeley_cable_routing",
"viola",
"berkeley_autolab_ur5",
"toto",
"nyu_door_opening_surprising_effectiveness",
"columbia_cairlab_pusht_real",
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds",
"austin_buds_dataset_converted_externally_to_rlds",
"kuka",
"utokyo_xarm_bimanual_converted_externally_to_rlds",
"stanford_hydra_dataset_converted_externally_to_rlds",
"maniskill_dataset_converted_externally_to_rlds",
"ucsd_kitchen_dataset_converted_externally_to_rlds",
"ucsd_pick_and_place_dataset_converted_externally_to_rlds",
"austin_sailor_dataset_converted_externally_to_rlds",
"austin_sirius_dataset_converted_externally_to_rlds",
"bc_z",
"utokyo_pr2_opening_fridge_converted_externally_to_rlds",
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds",
"utokyo_xarm_pick_and_place_converted_externally_to_rlds",
"berkeley_mvp_converted_externally_to_rlds",
"berkeley_rpt_converted_externally_to_rlds",
"kaist_nonprehensile_converted_externally_to_rlds",
"tokyo_u_lsmo_converted_externally_to_rlds",
"dlr_sara_grid_clamp_converted_externally_to_rlds",
"stanford_robocook_converted_externally_to_rlds",
"imperialcollege_sawyer_wrist_cam",
"iamlab_cmu_pickup_insert_converted_externally_to_rlds",
"utaustin_mutex",
"berkeley_fanuc_manipulation",
"cmu_play_fusion",
"language_table",
"furniture_bench_dataset_converted_externally_to_rlds",
"droid",
"fmb",
"dobbe",
"qut_dexterous_manpulation",
"aloha_mobile",
"aloha_static",
"roboset",
"rh20t",
"calvin",
"bridgev2"
]

View File

@ -0,0 +1,48 @@
{
"fractal20220817_data": 271,
"taco_play": 60,
"jaco_play": 33,
"berkeley_cable_routing": 8,
"nyu_door_opening_surprising_effectiveness": 10,
"viola": 12,
"berkeley_autolab_ur5": 32,
"toto": 32,
"kuka": 50,
"language_table": 100,
"columbia_cairlab_pusht_real": 12,
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 55,
"stanford_hydra_dataset_converted_externally_to_rlds": 24,
"austin_buds_dataset_converted_externally_to_rlds": 7,
"maniskill_dataset_converted_externally_to_rlds": 174,
"furniture_bench_dataset_converted_externally_to_rlds": 71,
"ucsd_kitchen_dataset_converted_externally_to_rlds": 12,
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 37,
"austin_sailor_dataset_converted_externally_to_rlds": 15,
"austin_sirius_dataset_converted_externally_to_rlds": 24,
"bc_z": 208,
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 9,
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 15,
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
"utokyo_xarm_bimanual_converted_externally_to_rlds": 1,
"berkeley_mvp_converted_externally_to_rlds": 22,
"berkeley_rpt_converted_externally_to_rlds": 30,
"kaist_nonprehensile_converted_externally_to_rlds": 14,
"tokyo_u_lsmo_converted_externally_to_rlds": 7,
"dlr_sara_grid_clamp_converted_externally_to_rlds": 1,
"stanford_robocook_converted_externally_to_rlds": 50,
"imperialcollege_sawyer_wrist_cam": 13,
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 25,
"utaustin_mutex": 39,
"berkeley_fanuc_manipulation": 20,
"cmu_play_fusion": 24,
"droid": 303,
"fmb": 42,
"dobbe": 36,
"qut_dexterous_manpulation": 14,
"aloha_mobile": 150,
"aloha_static": 150,
"roboset": 135,
"rh20t": 331,
"calvin": 100,
"bridgev2": 224
}

View File

@ -0,0 +1,126 @@
# Name -> index table for the unified 128-dim state/action vector.
# Layout: right arm occupies [0, 45) (every right-arm entry also has an
# unprefixed alias for the single-arm convention), [45, 50) reserved,
# left arm occupies [50, 95) with the same internal layout, [95, 100)
# reserved, mobile base occupies [100, 103), and [103, 128) is reserved.


def _fill_arm(mapping, prefixes, base):
    """Populate one arm's slice [base, base+45) of the state vector.

    Each logical entry is written once per prefix in `prefixes`, so the
    right arm gets both an unprefixed and a "right_"-prefixed key while
    the left arm gets only "left_"-prefixed keys.
    """
    # [base, base+10): arm joint positions.
    for p in prefixes:
        for j in range(10):
            mapping[f"{p}arm_joint_{j}_pos"] = base + j
    # [base+10, base+15): gripper joint positions.
    for p in prefixes:
        for j in range(5):
            mapping[f"{p}gripper_joint_{j}_pos"] = base + 10 + j
    # Alias of gripper_joint_0_pos.
    for p in prefixes:
        mapping[f"{p}gripper_open"] = base + 10
    # [base+15, base+25): arm joint velocities.
    for p in prefixes:
        for j in range(10):
            mapping[f"{p}arm_joint_{j}_vel"] = base + 15 + j
    # [base+25, base+30): gripper joint velocities.
    for p in prefixes:
        for j in range(5):
            mapping[f"{p}gripper_joint_{j}_vel"] = base + 25 + j
    # Alias of gripper_joint_0_vel.
    for p in prefixes:
        mapping[f"{p}gripper_open_vel"] = base + 25
    # [base+30, base+33): end-effector position.
    for off, axis in enumerate("xyz"):
        for p in prefixes:
            mapping[f"{p}eef_pos_{axis}"] = base + 30 + off
    # [base+33, base+39): end-effector 6D pose.
    for k in range(6):
        for p in prefixes:
            mapping[f"{p}eef_angle_{k}"] = base + 33 + k
    # [base+39, base+42): end-effector linear velocities.
    for off, axis in enumerate("xyz"):
        for p in prefixes:
            mapping[f"{p}eef_vel_{axis}"] = base + 39 + off
    # [base+42, base+45): end-effector angular velocities.
    for off, axis in enumerate(("roll", "pitch", "yaw")):
        for p in prefixes:
            mapping[f"{p}eef_angular_vel_{axis}"] = base + 42 + off


STATE_VEC_IDX_MAPPING = {}
# Right arm at [0, 45); unprefixed names alias the right-side entries.
_fill_arm(STATE_VEC_IDX_MAPPING, ("", "right_"), 0)
# Left arm at [50, 95); [45, 50) and [95, 100) stay reserved.
_fill_arm(STATE_VEC_IDX_MAPPING, ("left_",), 50)
# [100, 102): base linear velocities; [102, 103): base angular velocity.
STATE_VEC_IDX_MAPPING["base_vel_x"] = 100
STATE_VEC_IDX_MAPPING["base_vel_y"] = 101
STATE_VEC_IDX_MAPPING["base_angular_vel"] = 102
# [103, 128): reserved.
STATE_VEC_LEN = 128

View File

@ -0,0 +1,14 @@
{
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 2,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9
}
}

2
RDT/rdt170m-run/data/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
# Ignore data files
datasets

Some files were not shown because too many files have changed in this diff Show More