first update
This commit is contained in:
commit
c88bfcf840
174
.gitignore
vendored
Normal file
174
.gitignore
vendored
Normal file
@ -0,0 +1,174 @@
|
||||
input/
|
||||
output/
|
||||
Temp/
|
||||
weights/
|
||||
# ---> Python
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# UV
|
||||
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
#uv.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||
.pdm.toml
|
||||
.pdm-python
|
||||
.pdm-build/
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
2
ACT/act_export/.dockerignore
Normal file
2
ACT/act_export/.dockerignore
Normal file
@ -0,0 +1,2 @@
|
||||
input/*
|
||||
output/*
|
||||
40
ACT/act_export/Dockerfile
Normal file
40
ACT/act_export/Dockerfile
Normal file
@ -0,0 +1,40 @@
|
||||
|
||||
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
|
||||
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
|
||||
WORKDIR /app
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
ENV TZ=Asia/Shanghai
|
||||
|
||||
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
|
||||
sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
|
||||
|
||||
RUN apt-get update --allow-unauthenticated && apt-get install -y \
|
||||
software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y \
|
||||
python3.10 \
|
||||
python3.10-dev \
|
||||
python3-pip \
|
||||
python3.10-distutils \
|
||||
libgl1-mesa-glx \
|
||||
libglib2.0-0 \
|
||||
wget \
|
||||
ffmpeg \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
|
||||
|
||||
|
||||
|
||||
COPY . /app/
|
||||
|
||||
ENV TORCH_HOME=/app/weights/
|
||||
RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
RUN pip install --ignore-installed -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
ENTRYPOINT ["python3", "export.py"]
|
||||
465
ACT/act_export/export.py
Normal file
465
ACT/act_export/export.py
Normal file
@ -0,0 +1,465 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import argparse
|
||||
import onnx
|
||||
import json
|
||||
import yaml
|
||||
from copy import deepcopy
|
||||
from termcolor import colored
|
||||
from onnxsim import simplify
|
||||
from pprint import pformat
|
||||
import time
|
||||
from lerobot.policies.act.modeling_act import ACTPolicy
|
||||
from lerobot.datasets.factory import make_dataset
|
||||
from lerobot.utils.utils import get_safe_torch_device, init_logging
|
||||
from lerobot.configs import parser
|
||||
from lerobot.configs.train import TrainPipelineConfig
|
||||
|
||||
_global_config = None
|
||||
|
||||
BPU_VisionEncoder = "BPU_ACTPolicy_VisionEncoder"
|
||||
BPU_TransformerLayers = "BPU_ACTPolicy_TransformerLayers"
|
||||
|
||||
def onnx_sim(onnx_path, enable_sim=True):
    """Optionally simplify an ONNX model in place with onnx-simplifier.

    Args:
        onnx_path: Path to the .onnx file; overwritten with the simplified
            graph on success.
        enable_sim: When falsy, this function is a no-op.

    Raises:
        RuntimeError: If the simplifier reports the simplified model as invalid.
    """
    if not enable_sim:
        return
    model_onnx = onnx.load(onnx_path)  # load onnx model
    onnx.checker.check_model(model_onnx)  # validate before simplifying
    # NOTE(review): dynamic_input_shape/input_shapes were removed in
    # onnxsim >= 0.4 — confirm the pinned onnxsim version still accepts them.
    model_onnx, check = simplify(
        model_onnx,
        dynamic_input_shape=False,
        input_shapes=None)
    if not check:
        # Do not use `assert` for runtime validation (stripped under -O).
        raise RuntimeError('onnx simplify check failed')
    onnx.save(model_onnx, onnx_path)
|
||||
|
||||
def load_config(config_path):
    """Load the exporter JSON config and translate it into LeRobot CLI args.

    Reads ``config_path`` (JSON). When an ``export`` section is present,
    rewrites ``sys.argv`` so that the ``@parser.wrap()``-decorated ``main``
    picks up dataset/policy options from the file.

    Args:
        config_path: Path to the JSON configuration file.

    Returns:
        The parsed config dict when an ``export`` section exists,
        otherwise ``None``.
    """
    with open(config_path, 'r', encoding='utf-8') as f:
        config_dict = json.load(f)

    # Guard first: everything below depends on the `export` section, so bail
    # out early instead of risking a NameError on `export_cfg` later.
    export_cfg = config_dict.get('export')
    if export_cfg is None:
        return None

    args = []
    if 'repo_id' in export_cfg:
        args.extend(['--dataset.repo_id', str(export_cfg['repo_id'])])
    if 'dataset_path' in export_cfg:
        args.extend(['--dataset.root', str(export_cfg['dataset_path'])])

    args.extend(['--policy.type', 'act'])
    # Fall back to CPU unless a GPU id is configured at the top level.
    args.extend(['--policy.device', 'cpu' if 'gpu_id' not in config_dict else "cuda"])
    args.extend(['--policy.repo_id', str(export_cfg['repo_id'])])

    # Use pyav as the video backend to avoid torchcodec's FFmpeg requirement.
    args.extend(['--dataset.video_backend', 'pyav'])

    args.extend(['--wandb.enable', 'false'])

    # Keep the original script name as sys.argv[0], then append the args.
    sys.argv = [sys.argv[0]] + args

    logging.info(f"Loaded config from {config_path}")
    logging.info(f"Config: {sys.argv}")

    return config_dict
|
||||
|
||||
class BPU_ACTPolicy_VisionEncoder(nn.Module):
    """Standalone wrapper around the ACT policy's vision branch.

    Holds deep copies of the policy's image backbone and the feature
    projection so this sub-network can be exported to ONNX (and quantized)
    independently of the transformer layers.
    """

    def __init__(self, act_policy):
        super().__init__()
        # Deep-copy so ONNX export / tracing cannot mutate the live policy.
        self.backbone = deepcopy(act_policy.model.backbone)
        self.encoder_img_feat_input_proj = deepcopy(act_policy.model.encoder_img_feat_input_proj)

    def forward(self, images):
        """Return projected backbone feature maps for a batch of images."""
        cam_features = self.backbone(images)["feature_map"]
        # Project backbone channels to the transformer's model dimension.
        # (Removed the original's redundant `cam_features = cam_features`.)
        return self.encoder_img_feat_input_proj(cam_features)
|
||||
|
||||
class BPU_ACTPolicy_TransformerLayers(nn.Module):
    """Exportable wrapper around the ACT policy's transformer encoder/decoder.

    Takes the robot state plus per-camera feature maps (as produced by
    ``BPU_ACTPolicy_VisionEncoder``) and runs the remaining ACT pipeline:
    token assembly, encoder, decoder, and the action head. The VAE latent is
    fixed to zeros (inference-time behavior of ACT).
    """

    def __init__(self, act_policy, camera_names):
        super().__init__()
        # Deep-copy the full model so export cannot mutate the live policy.
        self.model = deepcopy(act_policy.model)
        self.camera_names = camera_names

    def forward(self, states, *vision_features):
        """Predict an action chunk from state and per-camera vision features.

        Args:
            states: Robot proprioceptive state tensor (batch of 1 assumed).
            *vision_features: One projected feature map per camera, in the
                same order as ``camera_names``.

        Returns:
            Action tensor from the model's action head
            (batch-first after the transpose below).
        """
        # Inference path: the latent sample is all zeros.
        latent_sample = torch.zeros([1, self.model.config.latent_dim], dtype=torch.float32)

        encoder_in_tokens = [self.model.encoder_latent_input_proj(latent_sample)]
        encoder_in_pos_embed = self.model.encoder_1d_feature_pos_embed.weight.unsqueeze(1).unbind(dim=0)
        encoder_in_tokens.append(self.model.encoder_robot_state_input_proj(states))

        all_cam_features = []
        all_cam_pos_embeds = []

        # Dynamically handle the vision features of every camera.
        for vision_feature in vision_features:
            cam_pos_embed = self.model.encoder_cam_feat_pos_embed(vision_feature)
            all_cam_features.append(vision_feature)
            all_cam_pos_embeds.append(cam_pos_embed)

        # Flatten each 1-D token to (1, 1, dim_model), then append the camera
        # feature maps flattened to (H*W*num_cams, 1, dim_model).
        tokens = []
        for token in encoder_in_tokens:
            tokens.append(token.view(1,1,self.model.config.dim_model))
        all_cam_features = torch.cat(all_cam_features, axis=-1).permute(2, 3, 0, 1).view(-1,1,self.model.config.dim_model)
        tokens.append(all_cam_features)
        encoder_in_tokens = torch.cat(tokens, axis=0)

        # Positional embeddings are assembled in exactly the same order as the
        # tokens above so they stay aligned element-for-element.
        pos_embeds = []
        for pos_embed in encoder_in_pos_embed:
            pos_embeds.append(pos_embed.view(1,1,self.model.config.dim_model))
        all_cam_pos_embeds = torch.cat(all_cam_pos_embeds, axis=-1).permute(2, 3, 0, 1).view(-1,1,self.model.config.dim_model)
        pos_embeds.append(all_cam_pos_embeds)
        encoder_in_pos_embed = torch.cat(pos_embeds, axis=0)

        encoder_out = self.model.encoder(encoder_in_tokens, pos_embed=encoder_in_pos_embed)

        # The decoder queries start as zeros; positions come from learned
        # decoder positional embeddings.
        decoder_in = torch.zeros(
            (self.model.config.chunk_size, 1, self.model.config.dim_model),
            dtype=encoder_in_pos_embed.dtype,
            device=encoder_in_pos_embed.device,
        )
        decoder_out = self.model.decoder(
            decoder_in,
            encoder_out,
            encoder_pos_embed=encoder_in_pos_embed,
            decoder_pos_embed=self.model.decoder_pos_embed.weight.unsqueeze(1),
        )

        # (seq, batch, dim) -> (batch, seq, dim) for the action head.
        decoder_out = decoder_out.transpose(0, 1)

        actions = self.model.action_head(decoder_out)

        return actions
|
||||
|
||||
def lerobotTensor2cvmat(tensor):
    """Convert a LeRobot image tensor (N, C, H, W, values in [0, 1]) to a
    uint8 OpenCV BGR image taken from the first batch element."""
    scaled = (tensor * 255).permute(0, 2, 3, 1)
    frame = scaled.cpu().numpy().astype(np.uint8)[0, :, :, :]
    # LeRobot tensors are RGB; OpenCV expects BGR.
    return cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
|
||||
|
||||
# NOTE(review): this is an exact duplicate of the `onnx_sim` defined earlier
# in this module; at import time this second definition silently replaces the
# first. Consider deleting one of the two.
def onnx_sim(onnx_path, onnx_sim):
    """Simplify the ONNX model at `onnx_path` in place when `onnx_sim` is truthy."""
    if onnx_sim:
        model_onnx = onnx.load(onnx_path) # load onnx model
        onnx.checker.check_model(model_onnx) # check onnx model
        # NOTE(review): dynamic_input_shape/input_shapes were removed in
        # onnxsim >= 0.4 — verify against the pinned onnxsim version.
        model_onnx, check = simplify(
            model_onnx,
            dynamic_input_shape=False,
            input_shapes=None)
        assert check, 'assert check failed'
        onnx.save(model_onnx, onnx_path)
|
||||
|
||||
@parser.wrap()
def main(cfg: TrainPipelineConfig):
    """Export the ACT policy as two ONNX sub-models plus calibration data.

    Splits the policy into a vision encoder and a transformer part, exports
    each to ONNX (optionally simplified), saves normalization parameters as
    .npy files, and dumps calibration samples in the format required by the
    target BPU march ("nash" -> .npy files, "bayes" -> raw .nchw files).
    """
    # LeRobot's argument list.
    # Skip validate(): the config is used for export, not for training.
    # cfg.validate()
    logging.info(pformat(cfg.to_dict()))

    # BPU export parameters - read from the global config or the command line.
    global _global_config

    class BPUOptions:
        # NOTE(review): these class attributes are evaluated at class-creation
        # time, so if _global_config is None (config file missing or without an
        # `export` section) this raises before the `if _global_config` fallback
        # below can run — confirm whether that is intended.
        act_path = _global_config['export']['model_path']
        export_path = _global_config['export']['output_path'] + "/" + _global_config['task_id']
        cal_num = _global_config['export']['calibration_num']
        onnx_sim = True
        combine_jobs = 6

    opt = BPUOptions()

    if _global_config:
        opt.act_path = _global_config['export']['model_path']
        opt.export_path = _global_config['export']['output_path'] + "/" + _global_config['task_id']
        opt.cal_num = _global_config['export']['calibration_num']
        opt.onnx_sim = True
        # march selects the target BPU architecture family ("nash" / "bayes").
        opt.march = _global_config['export']['march']
        opt.combine_jobs = 6
        logging.info("BPU parameters loaded from config file")

    logging.info("="*80)
    logging.info(colored("BPU Export Configuration:", 'light_cyan'))
    logging.info(f" ACT Model Path: {opt.act_path}")
    logging.info(f" Export Path: {opt.export_path}")
    logging.info(f" Calibration Samples: {opt.cal_num}")
    logging.info(f" ONNX Simplify: {opt.onnx_sim}")
    logging.info(f" March: {opt.march}")
    logging.info(f" Compiler Jobs: {opt.combine_jobs}")
    logging.info(f" Dataset Root: {cfg.dataset.root}")
    logging.info("="*80)

    if not os.path.exists(opt.export_path):
        os.makedirs(opt.export_path)

    # Workspace and artifact paths for the two exported sub-models.
    visionEncoder_ws = os.path.join(opt.export_path, BPU_VisionEncoder)
    transformersLayers_ws = os.path.join(opt.export_path, BPU_TransformerLayers)
    onnx_name_BPU_ACTPolicy_VisionEncoder = BPU_VisionEncoder + ".onnx"
    onnx_path_BPU_ACTPolicy_VisionEncoder = os.path.join(visionEncoder_ws, onnx_name_BPU_ACTPolicy_VisionEncoder)
    onnx_name_BPU_ACTPolicy_TransformerLayers = BPU_TransformerLayers + ".onnx"
    onnx_path_BPU_ACTPolicy_TransformerLayers = os.path.join(transformersLayers_ws, onnx_name_BPU_ACTPolicy_TransformerLayers)
    ## Calibration data file paths for export.
    calbrate_data_name_BPU_ACTPolicy_VisionEncoder = "calibration_data_" + BPU_VisionEncoder
    calbrate_data_path_BPU_ACTPolicy_VisionEncoder = os.path.join(visionEncoder_ws, calbrate_data_name_BPU_ACTPolicy_VisionEncoder)
    calbrate_data_name_BPU_ACTPolicy_TransformerLayers = "calibration_data_" + BPU_TransformerLayers
    calbrate_data_path_BPU_ACTPolicy_TransformerLayers = os.path.join(transformersLayers_ws, calbrate_data_name_BPU_ACTPolicy_TransformerLayers)
    state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, "state")
    ## Script paths of the release folder.
    bpu_output_name = "bpu_output"
    bpu_output_path = os.path.join(opt.export_path, bpu_output_name)
    # NOTE(review): bash_build_all_path is never used below — dead variable?
    bash_build_all_path = os.path.join(opt.export_path, "build_all.sh")
    ## Pre/post-processing parameter file paths.
    action_std_path = os.path.join(bpu_output_path, "action_std.npy")
    action_mean_path = os.path.join(bpu_output_path, "action_mean.npy")
    action_std_unnormalize_path = os.path.join(bpu_output_path, "action_std_unnormalize.npy")
    action_mean_unnormalize_path = os.path.join(bpu_output_path, "action_mean_unnormalize.npy")
    ## Create the working directories.
    os.makedirs(visionEncoder_ws, exist_ok=True)
    logging.info(colored(f"mkdir: {visionEncoder_ws} Success.", 'green'))
    os.makedirs(transformersLayers_ws, exist_ok=True)
    logging.info(colored(f"mkdir: {transformersLayers_ws} Success.", 'green'))
    os.makedirs(calbrate_data_path_BPU_ACTPolicy_VisionEncoder, exist_ok=True)
    logging.info(colored(f"mkdir: {calbrate_data_path_BPU_ACTPolicy_VisionEncoder} Success.", 'green'))
    os.makedirs(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, exist_ok=True)
    logging.info(colored(f"mkdir: {calbrate_data_path_BPU_ACTPolicy_TransformerLayers} Success.", 'green'))
    os.makedirs(state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers, exist_ok=True)
    logging.info(colored(f"mkdir: {state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers} Success.", 'green'))
    os.makedirs(bpu_output_path, exist_ok=True)
    logging.info(colored(f"mkdir: {bpu_output_path} Success.", 'green'))

    # Load the trained policy on CPU in eval mode for tracing/export.
    policy = ACTPolicy.from_pretrained(opt.act_path).cpu().eval()
    logging.info(colored(f"Load ACT Policy Model: {opt.act_path} Success.", 'light_red'))
    device = get_safe_torch_device(cfg.policy.device, log=True)
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True

    # Load the dataset.
    dataset = make_dataset(cfg)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        num_workers=0,
        batch_size=1,
        shuffle=True,
        sampler=None,
        pin_memory=device.type != "cpu",
        drop_last=False,
    )
    logging.info(colored(f"Load ACT Policy Dataset: \n{dataset} Success.", 'light_red'))
    batch = next(iter(dataloader))
    # Camera names are derived from the dataset's observation.image.* keys.
    image_keys = [key for key in batch.keys() if key.startswith('observation.images.')]
    camera_names = [key.split('.')[-1] for key in image_keys]
    logging.info(colored(f"Camera Names: {camera_names} Success.", 'light_red'))
    logging.info(colored(f"Image Keys: {image_keys} Success.", 'light_red'))
    logging.info(colored(f"Batch: {batch} Success.", 'light_red'))

    # NOTE(review): `outputs` is unused — presumably a sanity-check forward
    # pass to confirm the policy runs on this batch; confirm intent.
    outputs = policy.select_action(deepcopy(batch))

    ## Dynamically fetch pre/post-processing parameters.
    # Save normalization parameters for each camera.
    for camera_name in camera_names:
        buffer_name = f"buffer_observation_images_{camera_name}"
        if hasattr(policy.normalize_inputs, buffer_name):
            buffer = getattr(policy.normalize_inputs, buffer_name)
            camera_std = buffer.std.data.detach().cpu().numpy()
            camera_mean = buffer.mean.data.detach().cpu().numpy()

            camera_std_path = os.path.join(bpu_output_path, f"{camera_name}_std.npy")
            camera_mean_path = os.path.join(bpu_output_path, f"{camera_name}_mean.npy")

            np.save(camera_std_path, camera_std)
            np.save(camera_mean_path, camera_mean)
            logging.info(f"Saved {camera_name} normalization parameters")

    # Save state and action normalization parameters.
    action_std = policy.normalize_inputs.buffer_observation_state.std.data.detach().cpu().numpy()
    action_mean = policy.normalize_inputs.buffer_observation_state.mean.data.detach().cpu().numpy()
    action_std_unnormalize = policy.unnormalize_outputs.buffer_action.std.data.detach().cpu().numpy()
    action_mean_unnormalize = policy.unnormalize_outputs.buffer_action.mean.data.detach().cpu().numpy()

    np.save(action_std_path, action_std)
    np.save(action_mean_path, action_mean)
    np.save(action_std_unnormalize_path, action_std_unnormalize)
    np.save(action_mean_unnormalize_path, action_mean_unnormalize)

    ## Vision Encoder
    batch = policy.normalize_inputs(batch)
    m_VisionEncoder = BPU_ACTPolicy_VisionEncoder(policy)
    m_VisionEncoder.eval()

    # Dynamically collect vision features for every camera.
    vision_features = []
    for camera_name in camera_names:
        input_tensor = batch[f'observation.images.{camera_name}']
        vision_feature = m_VisionEncoder(input_tensor)
        vision_features.append(vision_feature)
        logging.info(f"Generated vision features for {camera_name}: {vision_feature.shape}")

    # Pick the ONNX opset version per target march.
    opset_version = 11 if "bayes" in opt.march else 19
    logging.info(f"Using ONNX opset version: {opset_version} for type: {opt.march}")

    onnx_path = onnx_path_BPU_ACTPolicy_VisionEncoder
    torch.onnx.export(
        m_VisionEncoder,                # model to convert
        input_tensor,                   # example model input (last camera)
        onnx_path,                      # output file name
        export_params=True,             # store trained parameters
        opset_version=opset_version,    # dynamic ONNX opset
        do_constant_folding=True,       # run constant folding optimization
        input_names=['images'],         # input node names
        output_names=['Vision_Features'],  # output node names
        dynamic_axes=None
    )
    onnx_sim(onnx_path, opt.onnx_sim)
    logging.info(colored(f"Export {onnx_path} Success.", 'green'))

    m_TransformerLayers = BPU_ACTPolicy_TransformerLayers(policy, camera_names)
    m_TransformerLayers.eval()
    state = batch["observation.state"]
    actions = m_TransformerLayers(state, *vision_features)
    # np.save(f"new_actions.npy", actions.detach().cpu().numpy())

    input_names = ['states'] + [f'{camera_name}_features' for camera_name in camera_names]
    logging.info(f"Transformer input names: {input_names}")

    onnx_path = onnx_path_BPU_ACTPolicy_TransformerLayers
    torch.onnx.export(
        m_TransformerLayers,            # model to convert
        (state, *vision_features),      # model inputs
        onnx_path,                      # output file name
        export_params=True,             # store trained parameters
        opset_version=opset_version,    # dynamic ONNX opset
        do_constant_folding=True,       # run constant folding optimization
        input_names=input_names,        # dynamic input node names
        output_names=['Actions'],       # output node names
        dynamic_axes=None
    )
    onnx_sim(onnx_path, opt.onnx_sim)
    logging.info(colored(f"Export {onnx_path} Success.", 'green'))

    if "nash" in opt.march:
        ## Calibrate data - dynamically create per-camera calibration dirs.
        input_names_TransformerLayers = camera_names + ["state"]
        input_cal_path = []
        for input_name in input_names_TransformerLayers:
            p = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, input_name)
            input_cal_path.append(p)
            os.makedirs(p, exist_ok=True)
            logging.info(colored(f"mkdir: {p} Success.", 'green'))

        for i, batch in enumerate(dataloader):
            name = "%.10d.npy"%i
            batch = policy.normalize_inputs(batch)

            # Collect all camera inputs for this sample.
            camera_inputs = {}
            for camera_name in camera_names:
                camera_inputs[camera_name] = batch[f'observation.images.{camera_name}']
            state_input = batch["observation.state"]

            ## VisionEncoder - save calibration data for all cameras
            ## (only every 4th sample, to thin the calibration set).
            if i%4 == 0:
                for camera_name in camera_names:
                    p = os.path.join(calbrate_data_path_BPU_ACTPolicy_VisionEncoder, f"{camera_name}_" + name)
                    np.save(p, camera_inputs[camera_name].detach().cpu().numpy())
                    logging.info(colored(f"save to: {p}", 'light_blue'))

            ## TransformerLayers - save the vision features of every camera.
            for camera_name in camera_names:
                vision_feature = m_VisionEncoder(camera_inputs[camera_name])
                camera_cal_path = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, camera_name)
                p = os.path.join(camera_cal_path, name)
                np.save(p, vision_feature.detach().cpu().numpy())
                logging.info(colored(f"save to: {p}", 'light_magenta'))

            p = os.path.join(state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers, name)
            np.save(p, state_input.detach().cpu().numpy())
            logging.info(colored(f"save to: {p}", 'light_magenta'))

            if i >= opt.cal_num:
                break

    if "bayes" in opt.march:

        ## Calibrate data - dynamically create per-camera calibration dirs.
        input_names_TransformerLayers = camera_names + ["state"]
        input_cal_path = []
        for input_name in input_names_TransformerLayers:
            p = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, input_name)
            input_cal_path.append(p)
            os.makedirs(p, exist_ok=True)
            logging.info(colored(f"mkdir: {p} Success.", 'green'))

        for i, batch in enumerate(dataloader):
            # Bayes toolchain expects raw binary dumps (.nchw), not .npy.
            name = "%.10d.nchw"%i
            batch = policy.normalize_inputs(batch)

            # Collect all camera inputs for this sample.
            camera_inputs = {}
            for camera_name in camera_names:
                camera_inputs[camera_name] = batch[f'observation.images.{camera_name}']

            state_input = batch["observation.state"]

            ## VisionEncoder - save calibration data for all cameras (Bayes format).
            if i%4 == 0:
                for camera_name in camera_names:
                    p = os.path.join(calbrate_data_path_BPU_ACTPolicy_VisionEncoder, f"{camera_name}_" + name)
                    camera_inputs[camera_name].detach().cpu().numpy().tofile(p)
                    logging.info(colored(f"save to: {p}", 'light_blue'))

            ## TransformerLayers - save the vision features of every camera (Bayes format).
            for camera_name in camera_names:
                vision_feature = m_VisionEncoder(camera_inputs[camera_name])
                camera_cal_path = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, camera_name)
                p = os.path.join(camera_cal_path, name)
                vision_feature.detach().cpu().numpy().tofile(p)
                logging.info(colored(f"save to: {p}", 'light_magenta'))

            p = os.path.join(state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers, name)
            state_input.detach().cpu().numpy().tofile(p)
            logging.info(colored(f"save to: {p}", 'light_magenta'))

            if i >= opt.cal_num:
                break
|
||||
|
||||
def generate_output_config(time_cost):
    """Write output.json describing the export artifacts for the downstream
    quantization step.

    Args:
        time_cost: Wall-clock duration of the export, in seconds.
    """
    global _global_config
    export_cfg = _global_config['export']
    export_dir = export_cfg['output_path'] + "/" + _global_config['task_id']

    # Per-sub-model artifact locations inside the export directory.
    transformer_dir = export_dir + "/" + BPU_TransformerLayers
    vision_dir = export_dir + "/" + BPU_VisionEncoder

    summary = {
        "task_name": _global_config['task_id'],
        "march": export_cfg['march'],
        "time_cost": time_cost,
        "export_path": export_dir,
        "TransformerLayers": transformer_dir + "/" + BPU_TransformerLayers + ".onnx",
        "TransformerLayers_calibration_data": transformer_dir + "/" + "calibration_data_" + BPU_TransformerLayers,
        "VisionEncoder": vision_dir + "/" + BPU_VisionEncoder + ".onnx",
        "VisionEncoder_calibration_data": vision_dir + "/" + "calibration_data_" + BPU_VisionEncoder,
    }
    with open(os.path.join(export_dir, "output.json"), "w") as f:
        json.dump(summary, f)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    init_logging()
    # Fixed config location: the container mounts the user config at input/config.json.
    config_path = "input/config.json"
    _global_config = load_config(config_path)
    # Time the whole export so the duration can be reported in the manifest.
    time_start = time.time()
    main()
    time_end = time.time()
    time_cost = time_end - time_start
    logging.info(colored(f"Time Cost: {time_cost} seconds", 'light_red'))
    # Persist an artifact manifest for the downstream quantization step.
    generate_output_config(time_cost)
|
||||
|
||||
|
||||
5
ACT/act_export/requirements.txt
Normal file
5
ACT/act_export/requirements.txt
Normal file
@ -0,0 +1,5 @@
|
||||
lerobot==0.3.3
|
||||
onnx
|
||||
onnxsim
|
||||
onnxruntime
|
||||
av
|
||||
0
ACT/act_quant/.dockerignore
Normal file
0
ACT/act_quant/.dockerignore
Normal file
17
ACT/act_quant/Dockerfile
Normal file
17
ACT/act_quant/Dockerfile
Normal file
@ -0,0 +1,17 @@
|
||||
# ARG BASE_IMAGE=ccr-29eug8s3-pub.cnc.bj.baidubce.com/deliver/ai_toolchain_ubuntu_20_x5_cpu:v1.2.8
|
||||
ARG BASE_IMAGE=ccr-29eug8s3-pub.cnc.bj.baidubce.com/aitools/ai_toolchain_ubuntu_22_j6_gpu:v3.3.0
|
||||
# 可通过 --build-arg BASE_IMAGE=... 来替换基础镜像
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
ENV TZ=Asia/Shanghai
|
||||
|
||||
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
|
||||
sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
|
||||
|
||||
COPY . /app/
|
||||
|
||||
ENTRYPOINT ["bash", "convert.sh"]
|
||||
24
ACT/act_quant/convert.sh
Normal file
24
ACT/act_quant/convert.sh
Normal file
@ -0,0 +1,24 @@
|
||||
# Compile the exported ONNX sub-models into BPU binaries.
# Reads task id and target march from the mounted config, generates the PTQ
# YAMLs via load_config.py, then invokes the toolchain matching the march.
CONFIG=input/config.json
TASKID=$(python3 read_json.py "$CONFIG" task_id)
MARCH=$(python3 read_json.py "$CONFIG" quant.march)
OUTPUT=output/$TASKID

python3 load_config.py "$CONFIG"
echo "Convert PTQ YAML has been Prepared"

VISIONENCODER_YAML=$OUTPUT/ptq_yaml/VisionEncoder.yaml
TRANSFORMERLAYERS_YAML=$OUTPUT/ptq_yaml/TransformerLayers.yaml

if [[ "$MARCH" == *"nash"* ]]; then
    # Nash-family BPUs use hb_compile.
    echo -e "\033[44;37m===== Start Compiling TRANSFORMERLAYERS =====\033[0m"
    hb_compile --config "$TRANSFORMERLAYERS_YAML"
    # Fixed: this banner previously said TRANSFORMERLAYERS again, which made
    # the logs misleading while the VisionEncoder was actually compiling.
    echo -e "\033[44;37m===== Start Compiling VISIONENCODER =====\033[0m"
    hb_compile --config "$VISIONENCODER_YAML"
    echo -e "\033[44;37m===== End Compiling Nash Model =====\033[0m"
else
    # Bayes-family BPUs use the older hb_mapper makertbin flow.
    echo -e "\033[44;37m===== Start Compiling TRANSFORMERLAYERS =====\033[0m"
    hb_mapper makertbin --model-type onnx --config "$TRANSFORMERLAYERS_YAML"
    echo -e "\033[44;37m===== Start Compiling VISIONENCODER =====\033[0m"
    hb_mapper makertbin --model-type onnx --config "$VISIONENCODER_YAML"
    echo -e "\033[44;37m===== End Compiling Bayes Model =====\033[0m"
fi
|
||||
76
ACT/act_quant/load_config.py
Normal file
76
ACT/act_quant/load_config.py
Normal file
@ -0,0 +1,76 @@
|
||||
import json
|
||||
import yaml
|
||||
import sys
|
||||
import os
|
||||
|
||||
|
||||
def load_config(config_path):
    """Read the quantization config and materialize the PTQ YAML files.

    Loads ``config_path`` (JSON is a YAML subset, so ``yaml.safe_load``
    handles both), fills the march-specific template YAMLs from
    ``pyq_yaml/<march>/`` with model and calibration paths taken from the
    config's ``quant`` section, and writes the finished YAMLs into
    ``<output_path>/<task_id>/ptq_yaml/``. Returns None.
    """
    with open(config_path, "r") as file:
        config = yaml.safe_load(file)

    if "quant" in config:
        quant_info = config["quant"]
        if "output_path" in quant_info:
            output_path = os.path.join(quant_info["output_path"], config["task_id"])
        if "march" in quant_info:
            # Collapse the concrete march string into one of the two
            # template families.
            march = "nash" if "nash" in quant_info["march"] else "bayes"
        # NOTE(review): `output_path` and `march` are bound only when the keys
        # above exist — a config missing either key raises NameError below.
        # Confirm the config schema always provides them.
        convert_yaml_path = f"pyq_yaml/{march}/"
        # prepare the nash and bayes bpu
        ## first prepare the VisionEncoder yaml
        VisionEncoder_yaml_path = os.path.join(convert_yaml_path, "VisionEncoder.yaml")
        with open(VisionEncoder_yaml_path, "r") as file:
            VisionEncoder_yaml = yaml.safe_load(file)
        VisionEncoder_yaml["model_parameters"]["onnx_model"] = quant_info["VisionEncoder"]["onnx_model"]
        VisionEncoder_yaml["calibration_parameters"]["cal_data_dir"] = quant_info["VisionEncoder"]["calibration_data"]
        VisionEncoder_yaml["model_parameters"]["march"] = quant_info["march"]

        # Make sure output ptq_yaml directory exists
        output_ptq_yaml_dir = os.path.join(output_path, "ptq_yaml")
        os.makedirs(output_ptq_yaml_dir, exist_ok=True)

        # Save VisionEncoder yaml to output/ptq_yaml
        VisionEncoder_yaml_save_path = os.path.join(output_ptq_yaml_dir, "VisionEncoder.yaml")
        with open(VisionEncoder_yaml_save_path, "w") as file:
            yaml.safe_dump(VisionEncoder_yaml, file)

        ## second prepare the TransformerLayers yaml
        TransformerLayers_yaml_path = os.path.join(convert_yaml_path, "TransformerLayers.yaml")
        with open(TransformerLayers_yaml_path, "r") as file:
            TransformerLayers_yaml = yaml.safe_load(file)
        TransformerLayers_yaml["model_parameters"]["onnx_model"] = quant_info["TransformerLayers"]["onnx_model"]
        TransformerLayers_yaml["model_parameters"]["march"] = quant_info["march"]

        TransformerLayers_Cal_dir = quant_info["TransformerLayers"]["calibration_data"]
        # (Fix cal_data_dir variable)
        cal_data_dir = TransformerLayers_Cal_dir
        # Each sub-directory of the calibration dir is one model input; "state"
        # maps to the ONNX input "states", camera dirs map to "<name>_features".
        sub_dirs = [d for d in os.listdir(cal_data_dir) if os.path.isdir(os.path.join(cal_data_dir, d))]
        input_names = []
        for name in sub_dirs:
            if name == "state":
                input_names.append("states")
            else:
                input_names.append(f"{name}_features")
        # The toolchain expects semicolon-separated lists with a trailing ';'.
        input_name_str = ";".join(input_names) + ";"

        TransformerLayers_yaml["input_parameters"]["input_name"] = input_name_str
        TransformerLayers_yaml["input_parameters"]["input_type_rt"] = "featuremap;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["input_layout_rt"] = "NCHW;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["input_type_train"] = "featuremap;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["input_layout_train"] = "NCHW;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["norm_type"] = "no_preprocess;" * len(input_names)
        TransformerLayers_yaml["calibration_parameters"]["cal_data_dir"] = ";".join([os.path.join(TransformerLayers_Cal_dir, name) for name in sub_dirs]) + ";"
        TransformerLayers_yaml["calibration_parameters"]["cal_data_type"] = "float32;" * len(input_names)

        # Save TransformerLayers yaml to output/ptq_yaml
        TransformerLayers_yaml_save_path = os.path.join(output_ptq_yaml_dir, "TransformerLayers.yaml")
        with open(TransformerLayers_yaml_save_path, "w") as file:
            yaml.safe_dump(TransformerLayers_yaml, file)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # First CLI argument: path to the quantization config file.
    cfg_file = sys.argv[1]
    config = load_config(cfg_file)
|
||||
23
ACT/act_quant/pyq_yaml/bayes/TransformerLayers.yaml
Normal file
23
ACT/act_quant/pyq_yaml/bayes/TransformerLayers.yaml
Normal file
@ -0,0 +1,23 @@
|
||||
model_parameters:
|
||||
onnx_model: '{onnx_name_BPU_ACTPolicy_TransformerLayers}'
|
||||
march: "{opt.type}"
|
||||
layer_out_dump: False
|
||||
working_dir: 'bpu_model_output'
|
||||
output_model_file_prefix: 'BPU_TransformerLayers'
|
||||
input_parameters:
|
||||
input_name: "{input_name_str}"
|
||||
input_type_rt: '{input_type_str}'
|
||||
input_layout_rt: '{nchw_str}'
|
||||
input_type_train: '{input_type_str}'
|
||||
input_layout_train: '{nchw_str}'
|
||||
norm_type: '{norm_type_str}'
|
||||
calibration_parameters:
|
||||
cal_data_dir: '{cal_data_dir_str}'
|
||||
cal_data_type: '{cal_data_type_str}'
|
||||
calibration_type: 'default'
|
||||
optimization: set_all_nodes_int16;set_Softmax_input_int16;set_Softmax_output_int16;
|
||||
compiler_parameters:
|
||||
jobs: 6
|
||||
compile_mode: 'latency'
|
||||
debug: False
|
||||
optimize_level: 'O3'
|
||||
23
ACT/act_quant/pyq_yaml/bayes/VisionEncoder.yaml
Normal file
23
ACT/act_quant/pyq_yaml/bayes/VisionEncoder.yaml
Normal file
@ -0,0 +1,23 @@
|
||||
model_parameters:
|
||||
onnx_model: 'onnx_name_BPU_ACTPolicy_VisionEncoder'
|
||||
march: "opt.type"
|
||||
layer_out_dump: False
|
||||
working_dir: 'bpu_model_output'
|
||||
output_model_file_prefix: 'BPU_VisionEncoder'
|
||||
input_parameters:
|
||||
input_name: ""
|
||||
input_type_rt: 'featuremap'
|
||||
input_layout_rt: 'NCHW'
|
||||
input_type_train: 'featuremap'
|
||||
input_layout_train: 'NCHW'
|
||||
norm_type: 'no_preprocess'
|
||||
calibration_parameters:
|
||||
cal_data_dir: 'calbrate_data_name_BPU_ACTPolicy_VisionEncoder'
|
||||
cal_data_type: 'float32'
|
||||
calibration_type: 'default'
|
||||
optimization: set_all_nodes_int16;set_Softmax_input_int16;set_Softmax_output_int16;
|
||||
compiler_parameters:
|
||||
jobs: 6
|
||||
compile_mode: 'latency'
|
||||
debug: true
|
||||
optimize_level: 'O3'
|
||||
24
ACT/act_quant/pyq_yaml/nash/TransformerLayers.yaml
Normal file
24
ACT/act_quant/pyq_yaml/nash/TransformerLayers.yaml
Normal file
@ -0,0 +1,24 @@
|
||||
model_parameters:
|
||||
onnx_model: '{onnx_name_BPU_ACTPolicy_TransformerLayers}'
|
||||
march: "{opt.type}"
|
||||
layer_out_dump: False
|
||||
working_dir: 'bpu_model_output'
|
||||
output_model_file_prefix: 'BPU_TransformerLayers'
|
||||
input_parameters:
|
||||
input_name: "{input_name_str}"
|
||||
input_type_rt: '{input_type_str}'
|
||||
input_layout_rt: '{nchw_str}'
|
||||
input_type_train: '{input_type_str}'
|
||||
input_layout_train: '{nchw_str}'
|
||||
norm_type: '{norm_type_str}'
|
||||
calibration_parameters:
|
||||
cal_data_dir: '{cal_data_dir_str}'
|
||||
cal_data_type: '{cal_data_type_str}'
|
||||
calibration_type: 'default'
|
||||
optimization: set_all_nodes_int16
|
||||
compiler_parameters:
|
||||
extra_params: {'input_no_padding': True, 'output_no_padding': True}
|
||||
jobs: 6
|
||||
compile_mode: 'latency'
|
||||
debug: False
|
||||
optimize_level: 'O2'
|
||||
24
ACT/act_quant/pyq_yaml/nash/VisionEncoder.yaml
Normal file
24
ACT/act_quant/pyq_yaml/nash/VisionEncoder.yaml
Normal file
@ -0,0 +1,24 @@
|
||||
model_parameters:
|
||||
onnx_model: '{onnx_name_BPU_ACTPolicy_VisionEncoder}'
|
||||
march: "{opt.type}"
|
||||
layer_out_dump: False
|
||||
working_dir: 'bpu_model_output'
|
||||
output_model_file_prefix: 'BPU_VisionEncoder'
|
||||
input_parameters:
|
||||
input_name: ""
|
||||
input_type_rt: 'featuremap'
|
||||
input_layout_rt: 'NCHW'
|
||||
input_type_train: 'featuremap'
|
||||
input_layout_train: 'NCHW'
|
||||
norm_type: 'no_preprocess'
|
||||
calibration_parameters:
|
||||
cal_data_dir: '{calbrate_data_name_BPU_ACTPolicy_VisionEncoder}'
|
||||
cal_data_type: 'float32'
|
||||
calibration_type: 'default'
|
||||
optimization: set_all_nodes_int16
|
||||
compiler_parameters:
|
||||
extra_params: {'input_no_padding': True, 'output_no_padding': True}
|
||||
jobs: 6
|
||||
compile_mode: 'latency'
|
||||
debug: true
|
||||
optimize_level: 'O2'
|
||||
42
ACT/act_quant/read_json.py
Normal file
42
ACT/act_quant/read_json.py
Normal file
@ -0,0 +1,42 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
def read_config(config_file, key_path):
    """Look up a value in a JSON config file by dot-separated key path.

    Args:
        config_file: Path to the JSON config file.
        key_path: Dot-separated path to the key (e.g. "evaluation.checkpoint_path").

    Returns:
        The value at the key path, or None if any segment is missing or the
        path descends into a non-dict before it is exhausted.
    """
    with open(config_file, 'r') as fh:
        node = json.load(fh)

    # Walk the nested dicts one path segment at a time.
    for segment in key_path.split('.'):
        if not isinstance(node, dict):
            # Reached a scalar/list before the path ended: not found.
            return None
        node = node.get(segment)
    return node
|
||||
|
||||
if __name__ == "__main__":
    # CLI: read_json.py <config_file> <key_path>
    # Prints the value on stdout; exits 1 when the key is absent.
    if len(sys.argv) < 3:
        print("Usage: python read_config.py <config_file> <key_path>", file=sys.stderr)
        sys.exit(1)

    config_file, key_path = sys.argv[1], sys.argv[2]

    result = read_config(config_file, key_path)
    if result is None:
        print("", file=sys.stderr)
        sys.exit(1)
    print(result)
|
||||
2
RDT/README.md
Normal file
2
RDT/README.md
Normal file
@ -0,0 +1,2 @@
|
||||
# d-robotics-rdt
|
||||
|
||||
2
RDT/lerobot2rdt/.dockerignore
Normal file
2
RDT/lerobot2rdt/.dockerignore
Normal file
@ -0,0 +1,2 @@
|
||||
input/*
|
||||
output/*
|
||||
43
RDT/lerobot2rdt/Dockerfile
Normal file
43
RDT/lerobot2rdt/Dockerfile
Normal file
@ -0,0 +1,43 @@
|
||||
|
||||
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
|
||||
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
|
||||
WORKDIR /app
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
ENV TZ=Asia/Shanghai
|
||||
|
||||
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
|
||||
sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
|
||||
|
||||
RUN apt-get update --allow-unauthenticated && apt-get install -y \
|
||||
software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y \
|
||||
python3.10 \
|
||||
python3.10-dev \
|
||||
python3-pip \
|
||||
python3.10-distutils \
|
||||
libgl1-mesa-glx \
|
||||
libglib2.0-0 \
|
||||
wget \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
ffmpeg \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
|
||||
|
||||
COPY . /app/
|
||||
|
||||
RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
# RUN pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
|
||||
RUN pip install torch==2.1.0 torchvision==0.16.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
RUN pip install packaging==24.0
|
||||
|
||||
# RUN mkdir -p /app/dataset/input /app/dataset/output
|
||||
|
||||
ENTRYPOINT ["bash", "convert.sh"]
|
||||
60
RDT/lerobot2rdt/convert.sh
Normal file
60
RDT/lerobot2rdt/convert.sh
Normal file
@ -0,0 +1,60 @@
|
||||
|
||||
#!/bin/bash
# Convert a LeRobot dataset to RDT format, driven by input/config.json.
# Reads settings via read_json.py, runs lerobot2rdt.py, then writes an
# output manifest with generate_output.py on success.

BEGIN_TIME=$(date +%s)

CONFIG_FILE="input/config.json"
echo "CONFIG_FILE_PATH: $CONFIG_FILE"

# Read values directly from the config.json using python - no more nested key error by using a helper script
TASK_ID=$(python3 read_json.py "$CONFIG_FILE" "task_id")
DATA_DIR=$(python3 read_json.py "$CONFIG_FILE" "data_dir")
OUTPUT_DIR=$(python3 read_json.py "$CONFIG_FILE" "output_dir")
EPISODE_NUM=$(python3 read_json.py "$CONFIG_FILE" "episode_num")
GPU=$(python3 read_json.py "$CONFIG_FILE" "gpu")
T5_PATH="/weights/t5-v1_1-xxl"
NO_LANGUAGE=$(python3 read_json.py "$CONFIG_FILE" "no_language")

# For the camera keys, extract them in a way that avoids the error about 'images_info.key.*' not found
CAM_HIGH_KEY=$(python3 -c "import json; print(json.load(open('$CONFIG_FILE'))['images_info']['key'].get('cam_high', ''))")
CAM_RIGHT_WRIST_KEY=$(python3 -c "import json; print(json.load(open('$CONFIG_FILE'))['images_info']['key'].get('cam_right_wrist', ''))")

# create output path
if [ ! -d "$OUTPUT_DIR/$TASK_ID" ]; then
    mkdir -p "$OUTPUT_DIR/$TASK_ID"
    echo "Created output directory: $OUTPUT_DIR/$TASK_ID"
else
    echo "Output directory already exists: $OUTPUT_DIR/$TASK_ID"
fi

# FIX: every expansion below is quoted. Unquoted, an empty camera key made
# argparse consume the next flag as the key's value, and any path containing
# spaces was split into multiple arguments.
if [ "$NO_LANGUAGE" = "true" ]; then
    python3 lerobot2rdt.py \
        --data_dir "$DATA_DIR" \
        --output_dir "$OUTPUT_DIR/$TASK_ID" \
        --episode_num "$EPISODE_NUM" \
        --gpu "$GPU" \
        --t5_path "$T5_PATH" \
        --cam_high_key "$CAM_HIGH_KEY" \
        --cam_right_wrist_key "$CAM_RIGHT_WRIST_KEY" \
        --no_language
    status=$?
else
    python3 lerobot2rdt.py \
        --data_dir "$DATA_DIR" \
        --output_dir "$OUTPUT_DIR/$TASK_ID" \
        --episode_num "$EPISODE_NUM" \
        --gpu "$GPU" \
        --t5_path "$T5_PATH" \
        --cam_high_key "$CAM_HIGH_KEY" \
        --cam_right_wrist_key "$CAM_RIGHT_WRIST_KEY"
    status=$?
fi

END_TIME=$(date +%s)
echo "END_TIME: $END_TIME"
echo "TOTAL_TIME: $((END_TIME - BEGIN_TIME))"

# Only emit the output manifest when conversion succeeded.
if [ $status -eq 0 ]; then
    python3 generate_output.py "$CONFIG_FILE" $((END_TIME - BEGIN_TIME))
else
    echo "lerobot2rdt.py exited with status $status, skipping generate_output.py"
fi
|
||||
|
||||
26
RDT/lerobot2rdt/generate_output.py
Normal file
26
RDT/lerobot2rdt/generate_output.py
Normal file
@ -0,0 +1,26 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
def generate_output(input_config, time):
    """Write a conversion summary (output.json) into the task's output folder.

    Args:
        input_config: Path to the job's input config.json.
        time: Total conversion time in seconds.
    """
    with open(input_config, "r") as cfg_file:
        cfg = json.load(cfg_file)

    task_output_dir = os.path.join(cfg["output_dir"], str(cfg["task_id"]))
    # Ensure the output directory exists before writing the output file
    os.makedirs(task_output_dir, exist_ok=True)

    summary = {
        "task_id": cfg["task_id"],
        "convert_time": time,
        "data_dir": cfg["data_dir"],
        "output_dir": task_output_dir,
        "episode_num": cfg["episode_num"],
        "no_language": cfg["no_language"],
    }
    with open(os.path.join(task_output_dir, "output.json"), "w") as out_file:
        json.dump(summary, out_file)
|
||||
|
||||
if __name__ == "__main__":
    # Usage: generate_output.py <config_json> <elapsed_seconds>
    cfg_path, elapsed = sys.argv[1], int(sys.argv[2])
    generate_output(cfg_path, elapsed)
|
||||
368
RDT/lerobot2rdt/lerobot2rdt.py
Normal file
368
RDT/lerobot2rdt/lerobot2rdt.py
Normal file
@ -0,0 +1,368 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
LeRobot到RDT数据转换脚本
|
||||
|
||||
LeRobot机器人结构:
|
||||
- 5个关节 (shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll)
|
||||
- 1个夹爪 (gripper)
|
||||
- 总计:6个自由度 (6DOF)
|
||||
|
||||
维度映射(匹配RDT训练代码):
|
||||
- left_arm_dim = 0 (单臂机器人,左臂不存在)
|
||||
- right_arm_dim = 6 (5关节 + 1夹爪,映射到RDT的right_arm部分)
|
||||
- 状态向量:6维 [joint1, joint2, joint3, joint4, joint5, gripper]
|
||||
- RDT索引映射:right_arm_joint_0_pos到right_arm_joint_5_pos (索引0-5)
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import h5py
|
||||
import numpy as np
|
||||
import cv2
|
||||
import argparse
|
||||
import yaml
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
import torch
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
sys.path.append(os.path.join(current_dir, ".."))
|
||||
from models.multimodal_encoder.t5_encoder import T5Embedder
|
||||
|
||||
def extract_frames_from_video(video_path, output_dir, episode_idx):
    """Extract all frames of a video with ffmpeg, resized to 640x480.

    Frames are dumped as JPEGs into a temporary folder under output_dir,
    read back with OpenCV, then the temporary files are deleted.

    Args:
        video_path: Path to the source video file.
        output_dir: Directory in which the temp frame folder is created.
        episode_idx: Episode index, used to name the temp folder.

    Returns:
        List of 640x480 BGR frames (numpy arrays); empty list on any failure.
    """
    if not os.path.exists(video_path):
        print(f"  No video file: {video_path}")
        return []

    temp_dir = os.path.join(output_dir, f"temp_frames_{episode_idx}")
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    output_pattern = os.path.join(temp_dir, "frame_%04d.jpg")

    try:
        # -vf fps=30 resamples to 30 fps; -q:v 2 is near-best JPEG quality;
        # -y overwrites any leftovers from a previous run.
        cmd = [
            'ffmpeg', '-i', video_path,
            '-vf', 'fps=30',
            '-q:v', '2',
            output_pattern,
            '-y'
        ]

        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print(f"  Failed to extract frames with ffmpeg: {result.stderr}")
            return []

        frames = []
        # Sorted so frames come back in temporal order (frame_%04d names).
        frame_files = sorted([f for f in os.listdir(temp_dir) if f.endswith('.jpg')])

        for frame_file in frame_files:
            frame_path = os.path.join(temp_dir, frame_file)
            frame = cv2.imread(frame_path)
            if frame is not None:
                frame_resized = cv2.resize(frame, (640, 480))
                frames.append(frame_resized)

        print(f"  Successfully extracted {len(frames)} frames")

        # Clean up the temporary JPEGs and folder.
        # NOTE(review): cleanup is skipped when an exception is raised above;
        # a stale temp_frames_* folder may remain in that case.
        for frame_file in frame_files:
            os.remove(os.path.join(temp_dir, frame_file))
        os.rmdir(temp_dir)

        return frames

    except Exception as e:
        print(f"  Error extracting frames: {e}")
        return []
|
||||
|
||||
def load_lerobot_episode(data_dir, episode_idx, output_dir, cam_high_key="high", cam_right_wrist_key="arm"):
    """Load a single LeRobot episode (actions, states, camera frames).

    LeRobot data layout:
    - action: 6-dim [shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll, gripper]
    - observation.state: 6-dim [shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll, gripper]
    - images: overhead camera + arm camera videos

    Args:
        data_dir: Root of the LeRobot dataset.
        episode_idx: Episode index (files are named episode_{idx:06d}).
        output_dir: Working directory for temporary frame extraction.
        cam_high_key: Stream key of the overhead camera video.
        cam_right_wrist_key: Stream key of the wrist/arm camera video.

    Returns:
        Dict with 'actions', 'qpos' (float32 numpy arrays), 'high_images',
        'arm_images' (lists of frames) and 'episode_length', or None when
        the episode's parquet file does not exist.
    """
    parquet_path = os.path.join(data_dir, "data/chunk-000", f"episode_{episode_idx:06d}.parquet")
    if not os.path.exists(parquet_path):
        print(f"Episode {episode_idx} parquet file does not exist: {parquet_path}")
        return None

    df = pd.read_parquet(parquet_path)

    actions = []
    qpos = []

    for i in range(len(df)):
        action = df['action'].iloc[i]
        state = df['observation.state'].iloc[i]

        # Normalize both columns to float32 numpy arrays regardless of how
        # parquet materialized the cells (ndarray vs list-like).
        if isinstance(action, np.ndarray):
            actions.append(action.astype(np.float32))
        else:
            actions.append(np.array(action, dtype=np.float32))

        if isinstance(state, np.ndarray):
            qpos.append(state.astype(np.float32))
        else:
            qpos.append(np.array(state, dtype=np.float32))

    high_cam_path = os.path.join(data_dir, f"videos/chunk-000/observation.images.{cam_high_key}", f"episode_{episode_idx:06d}.mp4")
    arm_cam_path = os.path.join(data_dir, f"videos/chunk-000/observation.images.{cam_right_wrist_key}", f"episode_{episode_idx:06d}.mp4")

    print(f"  Extracting high camera frames...")
    high_images = extract_frames_from_video(high_cam_path, output_dir, episode_idx)

    print(f"  Extracting arm camera frames...")
    arm_images = extract_frames_from_video(arm_cam_path, output_dir, episode_idx)

    # Align frame counts with the parquet row count: truncate extras, then
    # pad by repeating the last frame (only when at least one frame exists).
    target_frames = len(df)
    if len(high_images) > target_frames:
        high_images = high_images[:target_frames]
    if len(arm_images) > target_frames:
        arm_images = arm_images[:target_frames]

    while len(high_images) < target_frames and high_images:
        high_images.append(high_images[-1])
    while len(arm_images) < target_frames and arm_images:
        arm_images.append(arm_images[-1])

    return {
        'actions': np.array(actions),
        'qpos': np.array(qpos),
        'high_images': high_images,
        'arm_images': arm_images,
        'episode_length': len(df)
    }
|
||||
|
||||
def images_encoding(imgs):
    """JPEG-encode a list of frames and pad them to a uniform byte length.

    Fixed-width h5py string datasets (dtype "S<n>") expect every element to
    have the same length, so each encoded frame is right-padded with NUL
    bytes up to the longest encoding.

    Args:
        imgs: List of decoded images (numpy arrays) to encode.

    Returns:
        (padded_data, max_len): NUL-padded JPEG byte strings and the length
        they were padded to; ([], 0) when imgs is empty.
    """
    if not imgs:
        return [], 0

    encode_data = []
    max_len = 0

    for i in range(len(imgs)):
        success, encoded_image = cv2.imencode(".jpg", imgs[i])
        if success:
            jpeg_data = encoded_image.tobytes()
            encode_data.append(jpeg_data)
            max_len = max(max_len, len(jpeg_data))
        else:
            # Keep a placeholder so indices still line up with imgs.
            print(f"  Image encoding failed: {i}")
            encode_data.append(b"")

    # BUG FIX: the padded list was built but the unpadded one was returned,
    # so max_len did not describe the returned elements. Return the padded
    # encodings so every element really is max_len bytes.
    padded_data = [data.ljust(max_len, b"\0") for data in encode_data]

    return padded_data, max_len
|
||||
|
||||
def load_task_instructions(data_dir):
    """Read task instruction strings from meta/tasks.jsonl under data_dir.

    Returns:
        List of instruction strings, or None when the tasks file is missing.
    """
    tasks_file = os.path.join(data_dir, "meta/tasks.jsonl")
    if not os.path.exists(tasks_file):
        print(f"Warning: tasks file not found: {tasks_file}")
        return None

    # One JSON object per line; blank lines are skipped.
    with open(tasks_file, 'r') as fh:
        stripped = (raw.strip() for raw in fh)
        instructions = [json.loads(entry)["task"] for entry in stripped if entry]

    print(f"  加载了 {len(instructions)} 个任务指令")
    return instructions
|
||||
|
||||
def encode_language_instruction(instruction_text, t5_embedder, device):
    """Encode one instruction string to a (tokens, hidden) numpy array via T5.

    Falls back to a zero (1, 4096) array when the embedder fails, so callers
    never need to handle encoding errors. ``device`` is accepted for
    interface compatibility; the embedder manages its own placement.
    """
    try:
        embeds, mask = t5_embedder.get_text_embeddings([instruction_text])
        # Keep only the non-padding token embeddings of the single input.
        return embeds[0][mask[0]].float().cpu().numpy()
    except Exception as e:
        print(f"  Language encoding failed: {e}")
        return np.zeros((1, 4096))
|
||||
|
||||
def convert_lerobot_to_rdt(data_dir, output_dir, episode_num, gpu=0, no_language=False, t5_path=None, cam_high_key="high", cam_right_wrist_key="arm"):
    """Convert LeRobot episodes into RDT-format HDF5 files.

    For each episode: loads actions/states/frames, writes an HDF5 file with
    action, observations/qpos, JPEG-encoded camera images and arm-dimension
    metadata, and (unless no_language) saves a T5 embedding of the first
    task instruction as instructions/lang_embed_0.pt.

    Args:
        data_dir: LeRobot dataset root.
        output_dir: Destination root; one episode_{i} folder per episode.
        episode_num: Number of episodes to process (indices 0..episode_num-1).
        gpu: CUDA device index for the T5 encoder.
        no_language: Skip instruction loading/encoding entirely.
        t5_path: Local path of the pretrained T5 model.
        cam_high_key: Stream key of the overhead camera.
        cam_right_wrist_key: Stream key of the wrist/arm camera.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print(f"Start converting LeRobot data to RDT format...")
    print(f"Data source: {data_dir}")
    print(f"Output directory: {output_dir}")
    print(f"Processing episode number: {episode_num}")
    print(f"GPU device: {gpu}")

    # NOTE(review): scene_name is computed but never used below.
    scene_name = os.path.basename(data_dir)

    instructions = None
    if not no_language:
        instructions = load_task_instructions(data_dir)

    # T5 init failure degrades gracefully to a no-language conversion.
    t5_embedder = None
    if not no_language and instructions:
        try:
            print(f"  Initializing T5 encoder...")
            t5_embedder = T5Embedder(
                from_pretrained=t5_path,
                device=f"cuda:{gpu}" if torch.cuda.is_available() else "cpu",
                model_max_length=1024,
                use_offload_folder=None,
            )
            print(f"  T5 encoder initialized successfully")
        except Exception as e:
            print(f"  T5 encoder initialization failed: {e}")
            print(f"  Will skip language processing")
            no_language = True

    for i in range(episode_num):
        print(f"Processing episode {i}...")

        episode_data = load_lerobot_episode(data_dir, i, output_dir, cam_high_key=cam_high_key, cam_right_wrist_key=cam_right_wrist_key)
        if episode_data is None:
            # Missing parquet: skip rather than abort the whole run.
            print(f"Skipping episode {i}")
            continue

        episode_output_dir = os.path.join(output_dir, f"episode_{i}")
        if not os.path.exists(episode_output_dir):
            os.makedirs(episode_output_dir)

        hdf5_path = os.path.join(episode_output_dir, f"episode_{i}.hdf5")

        with h5py.File(hdf5_path, "w") as f:
            f.create_dataset("action", data=episode_data['actions'])

            obs = f.create_group("observations")
            obs.create_dataset("qpos", data=episode_data['qpos'])

            image = obs.create_group("images")

            # Camera streams are stored as fixed-width byte strings of
            # JPEG data (dtype S<max_len>).
            if episode_data['high_images']:
                print(f"  Encoding high camera images...")
                high_enc, len_high = images_encoding(episode_data['high_images'])
                if high_enc and len_high > 0:
                    image.create_dataset("cam_high", data=high_enc, dtype=f"S{len_high}")
                    print(f"  Saved high camera images: {len(episode_data['high_images'])} frames")
                else:
                    print(f"  Warning: High camera images encoding failed")

            if episode_data['arm_images']:
                print(f"  Encoding arm camera images...")
                arm_enc, len_arm = images_encoding(episode_data['arm_images'])
                if arm_enc and len_arm > 0:
                    image.create_dataset("cam_right_wrist", data=arm_enc, dtype=f"S{len_arm}")
                    print(f"  Saved arm camera images: {len(episode_data['arm_images'])} frames")
                else:
                    print(f"  Warning: Arm camera images encoding failed")

            # Robot dimension info (LeRobot: 5 joints + 1 gripper).
            # Mirroring process_data.py, dimensions are recorded per timestep.
            # LeRobot is a single-arm robot mapped to RDT's right arm:
            # right arm = 5 joints + 1 gripper = 6 dims; left arm = 0 dims.

            # Record the dimension info for every timestep.
            left_arm_dim = [0] * len(episode_data['actions'])  # left arm: 0 dims (single-arm robot)
            right_arm_dim = [6] * len(episode_data['actions'])  # right arm: 6 dims (5 joints + gripper)

            obs.create_dataset("left_arm_dim", data=np.array(left_arm_dim))
            obs.create_dataset("right_arm_dim", data=np.array(right_arm_dim))

        print(f"  Episode {i} converted successfully: {hdf5_path}")
        print(f"  Data length: {episode_data['episode_length']}")
        print(f"  Action shape: {episode_data['actions'].shape}")
        print(f"  Qpos shape: {episode_data['qpos'].shape}")
        print(f"  High camera frames: {len(episode_data['high_images'])}")
        print(f"  Arm camera frames: {len(episode_data['arm_images'])}")

        if not no_language and t5_embedder and instructions:
            print(f"  Processing language instructions...")
            try:
                # Only the first task instruction is embedded for every episode.
                instruction = instructions[0]

                language_features = encode_language_instruction(instruction, t5_embedder, f"cuda:{gpu}")

                instructions_dir = os.path.join(episode_output_dir, "instructions")
                if not os.path.exists(instructions_dir):
                    os.makedirs(instructions_dir)

                lang_embed_path = os.path.join(instructions_dir, "lang_embed_0.pt")
                torch.save(torch.from_numpy(language_features), lang_embed_path)

                print(f"  Language instruction encoded successfully: {instruction}")
                print(f"  Language features saved to: {lang_embed_path}")
                print(f"  Language features shape: {language_features.shape}, data type: {language_features.dtype}")

            except Exception as e:
                print(f"  Language instruction processing failed: {e}")

    print(f"\nConversion completed! Processed {episode_num} episodes")
    print(f"Output directory: {output_dir}")
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, validate inputs, run the conversion.

    Checks that the dataset and its meta/info.json exist and that ffmpeg is
    on PATH, clamps --episode_num to the dataset's total_episodes, then
    delegates to convert_lerobot_to_rdt().
    """
    parser = argparse.ArgumentParser(description="Convert LeRobot data to RDT format")
    parser.add_argument("--data_dir", type=str, required=True,
                        help="LeRobot data directory path")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="Output directory path")
    parser.add_argument("--episode_num", type=int, default=10,
                        help="Number of episodes to process")
    parser.add_argument("--gpu", type=int, default=0,
                        help="GPU device ID")
    parser.add_argument("--no_language", action="store_true",
                        help="Skip language processing")
    parser.add_argument("--cam_high_key", type=str, default="cam_high",
                        help="High camera key")
    parser.add_argument("--cam_right_wrist_key", type=str, default="cam_right_wrist",
                        help="Right wrist camera key")
    parser.add_argument("--cam_left_wrist_key", type=str, default="cam_left_wrist",
                        help="Left wrist camera key")
    parser.add_argument("--t5_path", type=str, required=True,
                        help="T5 model path")

    args = parser.parse_args()

    if not os.path.exists(args.data_dir):
        print(f"Error: Data directory does not exist: {args.data_dir}")
        return

    meta_file = os.path.join(args.data_dir, "meta/info.json")
    if not os.path.exists(meta_file):
        print(f"Error: Meta information file not found: {meta_file}")
        return

    # Frame extraction shells out to ffmpeg; fail fast when it is missing.
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        print("ffmpeg is available, will use ffmpeg to extract video frames")
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Warning: ffmpeg is not available, image data may not be extracted correctly")
        print("Please install ffmpeg: conda install -c conda-forge ffmpeg=6.1")
        return

    with open(meta_file, 'r') as f:
        # FIX: meta/info.json is JSON — parse it with json.load instead of
        # yaml.safe_load (the original only worked because YAML is a
        # near-superset of JSON).
        meta_info = json.load(f)

    total_episodes = meta_info.get('total_episodes', 10)
    if args.episode_num > total_episodes:
        print(f"Warning: Requested episode number ({args.episode_num}) exceeds available number ({total_episodes})")
        args.episode_num = total_episodes

    convert_lerobot_to_rdt(
        args.data_dir,
        args.output_dir,
        args.episode_num,
        args.gpu,
        args.no_language,
        args.t5_path,
        args.cam_high_key,
        args.cam_right_wrist_key,
    )
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
0
RDT/lerobot2rdt/models/__init__.py
Normal file
0
RDT/lerobot2rdt/models/__init__.py
Normal file
82
RDT/lerobot2rdt/models/ema_model.py
Normal file
82
RDT/lerobot2rdt/models/ema_model.py
Normal file
@ -0,0 +1,82 @@
|
||||
# Reference: DiffusionPolicy [https://github.com/real-stanford/diffusion_policy]
|
||||
|
||||
import torch
|
||||
from torch.nn.modules.batchnorm import _BatchNorm
|
||||
|
||||
|
||||
class EMAModel:
    """
    Exponential Moving Average of models weights
    """

    def __init__(self, model, update_after_step=0, inv_gamma=1.0, power=2 / 3, min_value=0.0, max_value=0.9999):
        """
        @crowsonkb's notes on EMA Warmup:
        If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
        to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
        gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
        at 215.4k steps).
        Args:
            model: Module whose weights hold the running average; set to eval
                mode with gradients disabled.
            update_after_step (int): Steps to wait before EMA updates begin.
            inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
            power (float): Exponential factor of EMA warmup. Default: 2/3.
            min_value (float): The minimum EMA decay rate. Default: 0.
            max_value (float): The maximum EMA decay rate. Default: 0.9999.
        """

        self.averaged_model = model
        self.averaged_model.eval()
        self.averaged_model.requires_grad_(False)

        self.update_after_step = update_after_step
        self.inv_gamma = inv_gamma
        self.power = power
        self.min_value = min_value
        self.max_value = max_value

        # Most recent decay factor and number of step() calls so far.
        self.decay = 0.0
        self.optimization_step = 0

    def get_decay(self, optimization_step):
        """
        Compute the decay factor for the exponential moving average.

        Returns 0.0 during the warmup window, then the warmup-schedule value
        clamped to [min_value, max_value].
        """
        step = max(0, optimization_step - self.update_after_step - 1)
        value = 1 - (1 + step / self.inv_gamma)**-self.power

        if step <= 0:
            return 0.0

        return max(self.min_value, min(value, self.max_value))

    @torch.no_grad()
    def step(self, new_model):
        """Blend new_model's parameters into the averaged model in place.

        BatchNorm and frozen (requires_grad=False) parameters are copied
        verbatim; all others get ema = decay * ema + (1 - decay) * new.
        new_model must have the same module structure as averaged_model.
        """
        self.decay = self.get_decay(self.optimization_step)

        # Walk both models in lockstep, module by module, touching only each
        # module's immediate (non-recursive) parameters so every parameter is
        # visited exactly once.
        for module, ema_module in zip(new_model.modules(), self.averaged_model.modules()):
            for param, ema_param in zip(module.parameters(recurse=False), ema_module.parameters(recurse=False)):
                if isinstance(param, dict):
                    raise RuntimeError('Dict parameter not supported')

                if isinstance(module, _BatchNorm):
                    # skip batchnorms: copy running statistics' params directly
                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
                elif not param.requires_grad:
                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
                else:
                    ema_param.mul_(self.decay)
                    ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=1 - self.decay)

        self.optimization_step += 1
|
||||
75
RDT/lerobot2rdt/models/hub_mixin.py
Normal file
75
RDT/lerobot2rdt/models/hub_mixin.py
Normal file
@ -0,0 +1,75 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Union
|
||||
|
||||
from huggingface_hub import PyTorchModelHubMixin
|
||||
from huggingface_hub.constants import (PYTORCH_WEIGHTS_NAME, SAFETENSORS_SINGLE_FILE)
|
||||
from huggingface_hub.file_download import hf_hub_download
|
||||
from huggingface_hub.utils import EntryNotFoundError, is_torch_available
|
||||
|
||||
if is_torch_available():
|
||||
import torch # type: ignore
|
||||
|
||||
|
||||
class CompatiblePyTorchModelHubMixin(PyTorchModelHubMixin):
    """Mixin class to load Pytorch models from the Hub.

    Overrides PyTorchModelHubMixin so that saving uses the legacy pickle
    weight file, while loading tries safetensors first and falls back to
    the pickle format — keeping compatibility with checkpoints in either
    layout, both locally and on the Hub.
    """

    def _save_pretrained(self, save_directory: Path) -> None:
        """Save weights from a Pytorch model to a local directory."""
        # To bypass saving into safetensor by default
        model_to_save = self.module if hasattr(self, "module") else self  # type: ignore
        torch.save(model_to_save.state_dict(), save_directory / PYTORCH_WEIGHTS_NAME)

    @classmethod
    def _from_pretrained(
        cls,
        *,
        model_id: str,
        revision: Optional[str],
        cache_dir: Optional[Union[str, Path]],
        force_download: bool,
        proxies: Optional[Dict],
        resume_download: Optional[bool],
        local_files_only: bool,
        token: Union[str, bool, None],
        map_location: str = "cpu",
        strict: bool = False,
        **model_kwargs,
    ):
        """Load Pytorch pretrained weights and return the loaded model.

        model_kwargs are forwarded to the model constructor. For a local
        model_id directory, the safetensors file is preferred and the pickle
        weights are the FileNotFoundError fallback; for a Hub repo, the same
        preference is applied via hf_hub_download / EntryNotFoundError.
        """
        model = cls(**model_kwargs)
        if os.path.isdir(model_id):
            print("Loading weights from local directory")
            try:
                model_file = os.path.join(model_id, SAFETENSORS_SINGLE_FILE)
                return cls._load_as_safetensor(model, model_file, map_location, strict)
            except FileNotFoundError:
                # No safetensors file locally: fall back to pickle weights.
                model_file = os.path.join(model_id, PYTORCH_WEIGHTS_NAME)
                return cls._load_as_pickle(model, model_file, map_location, strict)
        else:
            try:
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=SAFETENSORS_SINGLE_FILE,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_safetensor(model, model_file, map_location, strict)
            except EntryNotFoundError:
                # Repo has no safetensors entry: download pickle weights instead.
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=PYTORCH_WEIGHTS_NAME,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_pickle(model, model_file, map_location, strict)
|
||||
159
RDT/lerobot2rdt/models/multimodal_encoder/clip_encoder.py
Normal file
159
RDT/lerobot2rdt/models/multimodal_encoder/clip_encoder.py
Normal file
@ -0,0 +1,159 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
|
||||
|
||||
|
||||
class CLIPVisionTower(nn.Module):
    """Frozen CLIP vision encoder wrapper.

    Exposes token features from a configurable hidden layer of a pretrained
    `CLIPVisionModel`, for use as an image feature extractor.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        """
        Args:
            vision_tower: HF name/path of the CLIP vision backbone.
            args: config object; reads `mm_vision_select_layer`,
                `mm_vision_select_feature` and `unfreeze_mm_vision_tower`.
            delay_load: if True, fetch only the config now and load weights
                later via `load_model()` (unless the tower is trainable).
        """
        super().__init__()

        # True once weights and the image processor are materialized.
        self.is_loaded = False

        self.vision_tower_name = vision_tower
        # Index into `hidden_states` to read features from (may be negative).
        self.select_layer = args.mm_vision_select_layer
        # 'patch' drops the leading CLS token; 'cls_patch' keeps all tokens.
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            # A trainable tower must be loaded eagerly even with delay_load.
            self.load_model()
        else:
            # Config-only path so shape properties (hidden_size, ...) still work.
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Load processor + weights from the hub and freeze them; idempotent."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)  # frozen feature extractor

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick token features from `hidden_states[select_layer]` per `select_feature`."""
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            image_features = image_features[:, 1:]  # drop leading CLS token
        elif self.select_feature == 'cls_patch':
            image_features = image_features  # keep all tokens
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Encode images to token features.

        Accepts either a batched tensor or a list of unbatched image tensors
        (each encoded separately); outputs are cast back to the input dtype.
        """
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
                                                      output_hidden_states=True)
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                                   output_hidden_states=True)
            image_features = self.feature_select(image_forward_outs).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        # Zero feature placeholder matching the tower's device/dtype.
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Falls back to the cached config when weights are not loaded yet.
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2
|
||||
|
||||
|
||||
class CLIPVisionTowerS2(CLIPVisionTower):
    """CLIP vision tower with multi-scale S2 (Scaling-on-Scales) forwarding.

    Runs the frozen CLIP encoder at several image scales via `s2wrapper` and
    concatenates features channel-wise, so `hidden_size` grows by the number
    of scales.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__(vision_tower, args, delay_load)

        # Comma-separated scale list, e.g. '336,672,1008' (pixels); sorted ascending.
        self.s2_scales = getattr(args, 's2_scales', '336,672,1008')
        self.s2_scales = list(map(int, self.s2_scales.split(',')))
        self.s2_scales.sort()
        self.s2_split_size = self.s2_scales[0]   # tile size used to split larger scales
        self.s2_image_size = self.s2_scales[-1]  # preprocessing resize/crop target

        try:
            from s2wrapper import forward as multiscale_forward
        except ImportError:
            raise ImportError(
                'Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git'
            )
        self.multiscale_forward = multiscale_forward

        # change resize/crop size in preprocessing to the largest image size in s2_scale
        # (only when the image_processor was actually created by the base __init__)
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.image_processor.size['shortest_edge'] = self.s2_image_size
            self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

    def load_model(self, device_map=None):
        """Load processor + frozen weights; retarget preprocessing to the largest scale."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        self.image_processor.size['shortest_edge'] = self.s2_image_size
        self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

        self.is_loaded = True

    @torch.no_grad()
    def forward_feature(self, images):
        """Single-scale encoding, used as the base forward for s2wrapper."""
        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                               output_hidden_states=True)
        image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Multi-scale encoding; accepts a batched tensor or a list of images."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_feature = self.multiscale_forward(self.forward_feature,
                                                        image.unsqueeze(0),
                                                        img_sizes=self.s2_scales,
                                                        max_split_size=self.s2_split_size)
                image_features.append(image_feature)
        else:
            image_features = self.multiscale_forward(self.forward_feature,
                                                    images,
                                                    img_sizes=self.s2_scales,
                                                    max_split_size=self.s2_split_size)

        return image_features

    @property
    def hidden_size(self):
        # Features from all scales are concatenated along the channel dim.
        return self.config.hidden_size * len(self.s2_scales)
|
||||
87
RDT/lerobot2rdt/models/multimodal_encoder/dinov2_encoder.py
Normal file
87
RDT/lerobot2rdt/models/multimodal_encoder/dinov2_encoder.py
Normal file
@ -0,0 +1,87 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import AutoConfig, AutoImageProcessor, AutoModel, Dinov2Model
|
||||
|
||||
|
||||
class DinoV2VisionTower(nn.Module):
    """Frozen DINOv2 vision encoder wrapper used as an image feature extractor."""

    def __init__(self, vision_tower, args, delay_load=False):
        """
        Args:
            vision_tower: HF name/path of the DINOv2 backbone.
            args: config; reads `mm_vision_select_feature`, `unfreeze_mm_vision_tower`.
            delay_load: if True, fetch only the config now; load weights later.
        """
        super().__init__()

        # True once weights and the image processor are materialized.
        self.is_loaded = False

        self.vision_tower_name = vision_tower
        # 'patch' drops the leading CLS token; 'cls_patch' keeps all tokens.
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            # A trainable tower must be loaded eagerly even with delay_load.
            self.load_model()
        else:
            # Config-only path so shape properties (hidden_size, ...) still work.
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Load processor + weights from the hub and freeze them; idempotent."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)  # FIXME: frozen for now; revisit if tower should train

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick token features from `last_hidden_state` per `select_feature`."""
        image_features = image_forward_outs.last_hidden_state
        if self.select_feature == 'patch':
            image_features = image_features[:, 1:]  # drop CLS -> (B, num_patches, hidden)
        elif self.select_feature == 'cls_patch':
            image_features = image_features  # keep all tokens, CLS included
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor, or each element of a list separately;
        outputs are cast back to the input dtype."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
            image_features = self.feature_select(image_forward_outs).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        # Zero placeholder matching the tower's device/dtype.
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Falls back to the cached config when weights are not loaded yet.
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2
|
||||
86
RDT/lerobot2rdt/models/multimodal_encoder/siglip_encoder.py
Normal file
86
RDT/lerobot2rdt/models/multimodal_encoder/siglip_encoder.py
Normal file
@ -0,0 +1,86 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import AutoConfig, SiglipImageProcessor, SiglipVisionModel
|
||||
|
||||
|
||||
class SiglipVisionTower(nn.Module):
    """Frozen SigLIP vision encoder wrapper used as an image feature extractor."""

    def __init__(self, vision_tower, args, delay_load=False):
        """
        Args:
            vision_tower: HF name/path of the SigLIP vision backbone.
            args: config; reads `mm_vision_select_feature`, `unfreeze_mm_vision_tower`.
            delay_load: if True, fetch only the config now; load weights later.
        """
        super().__init__()

        # True once weights and the image processor are materialized.
        self.is_loaded = False

        self.vision_tower_name = vision_tower
        # 'patch' selects per-token features; 'cls_patch' selects the pooled output.
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            # A trainable tower must be loaded eagerly even with delay_load.
            self.load_model()
        else:
            # Config-only path so shape properties (hidden_size, ...) still work.
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Load processor + weights from the hub; idempotent.

        NOTE(review): unlike the CLIP/DINOv2 towers, only `.eval()` is called
        here — gradients are not explicitly disabled. Confirm this is intended.
        """
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.eval()

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick features from the encoder output per `select_feature`.

        NOTE(review): 'cls_patch' returns `pooler_output`, which is 2-D
        (B, hidden) rather than a token sequence — confirm callers expect that.
        """
        if self.select_feature == 'patch':
            image_features = image_forward_outs.last_hidden_state  # (B, num_patches, hidden)
        elif self.select_feature == 'cls_patch':
            image_features = image_forward_outs.pooler_output  # pooled embedding
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor, or each element of a list separately;
        outputs are cast back to the input dtype."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
            image_features = self.feature_select(image_forward_outs).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        # Zero placeholder matching the tower's device/dtype.
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Falls back to the cached config when weights are not loaded yet.
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2
|
||||
111
RDT/lerobot2rdt/models/multimodal_encoder/t5_encoder.py
Normal file
111
RDT/lerobot2rdt/models/multimodal_encoder/t5_encoder.py
Normal file
@ -0,0 +1,111 @@
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
|
||||
|
||||
class T5Embedder:
    """Frozen T5 text encoder.

    Wraps an `AutoTokenizer` + `T5EncoderModel`; `get_text_embeddings`
    returns the encoder's last hidden states together with the attention mask.
    """
    # available_models = ["google/t5-v1_1-xxl"]

    def __init__(
        self,
        device,
        from_pretrained=None,
        *,
        cache_dir=None,
        hf_token=None,
        use_text_preprocessing=True,
        t5_model_kwargs=None,
        torch_dtype=None,
        use_offload_folder=None,
        model_max_length=120,
        local_files_only=False,
    ):
        """
        Args:
            device: device the tokenized inputs are moved to.
            from_pretrained: HF name/path for both model and tokenizer.
            cache_dir: HF cache directory.
            hf_token: auth token (stored; not forwarded to from_pretrained here).
            use_text_preprocessing: stored flag; not used inside this class.
            t5_model_kwargs: overrides for `T5EncoderModel.from_pretrained`;
                when None, a default device map is built below.
            torch_dtype: model dtype; defaults to bfloat16.
            use_offload_folder: if set, offload upper encoder blocks to disk
                at this path.
            model_max_length: tokenizer truncation length.
            local_files_only: disable hub downloads.
        """
        # from_pretrained="google/t5-v1_1-xxl" # zijian
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        self.cache_dir = cache_dir

        if t5_model_kwargs is None:
            t5_model_kwargs = {
                "low_cpu_mem_usage": True,
                "torch_dtype": self.torch_dtype,
            }

            if use_offload_folder is not None:
                # Keep embeddings + the first 12 encoder blocks on `device`,
                # offload the rest to disk.
                # NOTE(review): block indices are hard-coded for a 24-block
                # (xxl-sized) encoder — confirm before using other model sizes.
                t5_model_kwargs["offload_folder"] = use_offload_folder
                t5_model_kwargs["device_map"] = {
                    "shared": self.device,
                    "encoder.embed_tokens": self.device,
                    "encoder.block.0": self.device,
                    "encoder.block.1": self.device,
                    "encoder.block.2": self.device,
                    "encoder.block.3": self.device,
                    "encoder.block.4": self.device,
                    "encoder.block.5": self.device,
                    "encoder.block.6": self.device,
                    "encoder.block.7": self.device,
                    "encoder.block.8": self.device,
                    "encoder.block.9": self.device,
                    "encoder.block.10": self.device,
                    "encoder.block.11": self.device,
                    "encoder.block.12": "disk",
                    "encoder.block.13": "disk",
                    "encoder.block.14": "disk",
                    "encoder.block.15": "disk",
                    "encoder.block.16": "disk",
                    "encoder.block.17": "disk",
                    "encoder.block.18": "disk",
                    "encoder.block.19": "disk",
                    "encoder.block.20": "disk",
                    "encoder.block.21": "disk",
                    "encoder.block.22": "disk",
                    "encoder.block.23": "disk",
                    "encoder.final_layer_norm": "disk",
                    "encoder.dropout": "disk",
                }
            else:
                # Everything on the target device.
                t5_model_kwargs["device_map"] = {
                    "shared": self.device,
                    "encoder": self.device,
                }

        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token

        # assert from_pretrained in self.available_models
        self.tokenizer = AutoTokenizer.from_pretrained(
            from_pretrained,
            model_max_length=model_max_length,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.model = T5EncoderModel.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            **t5_model_kwargs,
        ).eval()
        self.model_max_length = model_max_length

    def get_text_embeddings(self, texts):
        """Tokenize `texts` (str or list of str) and return
        (last_hidden_state, attention_mask), both on `self.device`."""
        text_tokens_and_mask = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding="longest",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        input_ids = text_tokens_and_mask["input_ids"].to(self.device)
        attention_mask = text_tokens_and_mask["attention_mask"].to(self.device)
        with torch.no_grad():
            text_encoder_embs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )["last_hidden_state"].detach()
        return text_encoder_embs, attention_mask
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: build the embedder (downloads weights on first run).
    T5Embedder(from_pretrained="google/t5-v1_1-xxl", device='cuda:7')
|
||||
304
RDT/lerobot2rdt/models/rdt/blocks.py
Normal file
304
RDT/lerobot2rdt/models/rdt/blocks.py
Normal file
@ -0,0 +1,304 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
# --------------------------------------------------------
|
||||
# References:
|
||||
# DiT: https://github.com/facebookresearch/DiT
|
||||
# GLIDE: https://github.com/openai/glide-text2im
|
||||
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
|
||||
# --------------------------------------------------------
|
||||
|
||||
import math
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.jit import Final
|
||||
from timm.models.vision_transformer import Attention, Mlp, RmsNorm, use_fused_attn
|
||||
|
||||
|
||||
#################################################################################
|
||||
# Embedding Layers for Timesteps and Condition Inptus #
|
||||
#################################################################################
|
||||
class TimestepEmbedder(nn.Module):
    """Maps scalar diffusion timesteps to vector embeddings.

    A fixed sinusoidal frequency encoding is followed by a small learnable
    MLP (Linear -> SiLU -> Linear) projecting to `hidden_size`.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=torch.bfloat16):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.dtype = dtype

    def timestep_embedding(self, t, dim, max_period=10000):
        """Sinusoidal embedding of a 1-D tensor of (possibly fractional)
        timestep indices.

        :param t: (N,) tensor of indices.
        :param dim: output dimension per position.
        :param max_period: controls the minimum embedded frequency.
        :return: (N, dim) tensor cast to `self.dtype`.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        # Geometric frequency ladder from 1 down to 1/max_period.
        exponents = torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
        freqs = torch.exp(-math.log(max_period) * exponents)
        phases = t.float()[:, None] * freqs[None]
        emb = torch.cat((torch.cos(phases), torch.sin(phases)), dim=-1)
        if dim % 2 == 1:
            # Odd target dim: pad a single zero column.
            emb = torch.cat((emb, torch.zeros_like(emb[:, :1])), dim=-1)
        return emb.to(self.dtype)

    def forward(self, t):
        """Embed timesteps: (N,) -> (N, hidden_size)."""
        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
|
||||
|
||||
|
||||
#################################################################################
|
||||
# Cross Attention Layers #
|
||||
#################################################################################
|
||||
class CrossAttention(nn.Module):
    """
    A cross-attention layer with flash attention.

    Queries come from `x`; keys/values come from the condition sequence `c`.
    Uses PyTorch fused scaled-dot-product attention when available, with a
    manual softmax fallback.
    """
    fused_attn: Final[bool]

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0,
        proj_drop: float = 0,
        norm_layer: nn.Module = nn.LayerNorm,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        self.fused_attn = use_fused_attn()

        # Separate projections: queries from x, keys/values (packed) from c.
        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        # Optional per-head normalization of queries/keys.
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor, c: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
        """Attend from x (B, N, C) over condition c (B, L, C).

        mask: optional (B, L) boolean mask, True = valid condition position.
        Returns (B, N, C).
        """
        B, N, C = x.shape
        _, L, _ = c.shape
        # q: (B, heads, N, head_dim); kv: (2, B, heads, L, head_dim)
        q = self.q(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        kv = self.kv(c).reshape(B, L, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        k, v = kv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        # Broadcast the (B, L) validity mask over heads and query positions.
        if mask is not None:
            mask = mask.reshape(B, 1, 1, L)
            mask = mask.expand(-1, -1, N, -1)

        if self.fused_attn:
            # Fused kernel applies the scale and masking internally.
            x = F.scaled_dot_product_attention(query=q,
                                               key=k,
                                               value=v,
                                               dropout_p=self.attn_drop.p if self.training else 0.,
                                               attn_mask=mask)
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            if mask is not None:
                # Invalid positions get -inf before softmax.
                attn = attn.masked_fill_(mask.logical_not(), float('-inf'))
            attn = attn.softmax(dim=-1)
            if self.attn_drop.p > 0:
                attn = self.attn_drop(attn)
            x = attn @ v

        # Merge heads back into (B, N, C) and project.
        x = x.permute(0, 2, 1, 3).reshape(B, N, C)
        x = self.proj(x)
        if self.proj_drop.p > 0:
            x = self.proj_drop(x)
        return x
|
||||
|
||||
|
||||
#################################################################################
|
||||
# RDT Block #
|
||||
#################################################################################
|
||||
class RDTBlock(nn.Module):
    """
    A RDT block with cross-attention conditioning.

    Three pre-normalized residual sub-layers: self-attention over the token
    sequence, cross-attention over the condition sequence, and an MLP.
    """

    def __init__(self, hidden_size, num_heads, **block_kwargs):
        super().__init__()
        attn_common = dict(num_heads=num_heads, qkv_bias=True, qk_norm=True,
                           norm_layer=RmsNorm)
        self.norm1 = RmsNorm(hidden_size, eps=1e-6)
        self.attn = Attention(dim=hidden_size, **attn_common, **block_kwargs)
        self.cross_attn = CrossAttention(hidden_size, **attn_common, **block_kwargs)

        self.norm2 = RmsNorm(hidden_size, eps=1e-6)
        self.ffn = Mlp(in_features=hidden_size,
                       hidden_features=hidden_size,
                       act_layer=lambda: nn.GELU(approximate="tanh"),
                       drop=0)
        self.norm3 = RmsNorm(hidden_size, eps=1e-6)

    def forward(self, x, c, mask=None):
        """x: (B, N, D) tokens; c: (B, L, D) condition; mask: (B, L) or None."""
        x = x + self.attn(self.norm1(x))
        x = x + self.cross_attn(self.norm2(x), c, mask)
        x = x + self.ffn(self.norm3(x))
        return x
|
||||
|
||||
|
||||
class FinalLayer(nn.Module):
    """
    The final layer of RDT: RmsNorm followed by an MLP projecting each token
    to `out_channels`.
    """

    def __init__(self, hidden_size, out_channels):
        super().__init__()
        self.norm_final = RmsNorm(hidden_size, eps=1e-6)
        self.ffn_final = Mlp(
            in_features=hidden_size,
            hidden_features=hidden_size,
            out_features=out_channels,
            act_layer=lambda: nn.GELU(approximate="tanh"),
            drop=0,
        )

    def forward(self, x):
        """(B, N, hidden_size) -> (B, N, out_channels)."""
        return self.ffn_final(self.norm_final(x))
|
||||
|
||||
|
||||
#################################################################################
|
||||
# Sine/Cosine Positional Embedding Functions #
|
||||
#################################################################################
|
||||
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
|
||||
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    1-D sine/cosine positional embedding.

    embed_dim: output dimension for each position (must be even).
    pos: positions to be encoded, any array-like of M values.
    out: (M, embed_dim) float64 array laid out as [sin | cos] halves.
    """
    assert embed_dim % 2 == 0
    half = embed_dim // 2
    # Geometric frequency ladder: 1 down to 1/10000.
    omega = 1. / 10000**(np.arange(half, dtype=np.float64) / half)

    if not isinstance(pos, np.ndarray):
        pos = np.array(pos, dtype=np.float64)
    # Outer product of flattened positions with the frequencies: (M, half).
    angles = np.outer(pos.reshape(-1), omega)

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)
|
||||
|
||||
|
||||
def get_nd_sincos_pos_embed_from_grid(embed_dim, grid_sizes):
    """
    N-D sine/cosine positional embedding.

    embed_dim: output dimension for each position.
    grid_sizes: the grid sizes in each dimension (K,); any iterable is
        accepted (previously a plain list raised TypeError on `list + tuple`).
    out: (grid_sizes[0], ..., grid_sizes[K-1], embed_dim)

    The embedding dimension is split evenly (rounded down to an even number,
    for sin/cos pairs) across the dimensions whose grid size exceeds 1;
    size-1 dimensions carry no positional information.
    """
    grid_sizes = tuple(grid_sizes)  # accept list/tuple/other iterables
    emb = np.zeros(grid_sizes + (embed_dim, ))

    # For grid size of 1, we do not need to add any positional embedding
    num_valid_sizes = len([x for x in grid_sizes if x > 1])
    if num_valid_sizes == 0:
        # Degenerate grid (all sizes <= 1): nothing to embed, return zeros
        # instead of dividing by zero below.
        return emb

    # Uniformly divide the embedding dimension for each grid size
    dim_for_each_grid = embed_dim // num_valid_sizes
    # To make it even
    if dim_for_each_grid % 2 != 0:
        dim_for_each_grid -= 1

    valid_size_idx = 0
    for size_idx, grid_size in enumerate(grid_sizes):
        if grid_size <= 1:
            continue
        pos = np.arange(grid_size)
        # Shape that broadcasts this axis' 1-D embedding over all other axes.
        posemb_shape = [1] * len(grid_sizes) + [dim_for_each_grid]
        posemb_shape[size_idx] = -1
        emb[..., valid_size_idx * dim_for_each_grid:(valid_size_idx + 1) * dim_for_each_grid] += \
            get_1d_sincos_pos_embed_from_grid(dim_for_each_grid, pos).reshape(posemb_shape)
        valid_size_idx += 1
    return emb
|
||||
|
||||
|
||||
def get_multimodal_cond_pos_embed(embed_dim, mm_cond_lens: OrderedDict, embed_modality=True):
    """
    Generate position embeddings for multimodal conditions.

    mm_cond_lens: an OrderedDict containing
        (modality name, modality token length) pairs.
        For `"image"` modality, the value can be a multi-dimensional tuple.
        If the length < 0, it means there is no position embedding for the modality or grid.
    embed_modality: whether to embed the modality information. Default is True.

    Returns an (sum of |lengths|, embed_dim) numpy array. When embed_modality
    is True, the first half of each embedding identifies the modality and the
    second half encodes the position within it.
    """
    num_modalities = len(mm_cond_lens)
    modality_pos_embed = np.zeros((num_modalities, embed_dim))
    if embed_modality:
        # Get embeddings for various modalites
        # We put it in the first half
        modality_sincos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, torch.arange(num_modalities))
        modality_pos_embed[:, :embed_dim // 2] = modality_sincos_embed
        # The second half is for position embeddings
        pos_embed_dim = embed_dim // 2
    else:
        # The whole embedding is for position embeddings
        pos_embed_dim = embed_dim

    # Get embeddings for positions inside each modality
    c_pos_emb = np.zeros((0, embed_dim))
    for idx, (modality, cond_len) in enumerate(mm_cond_lens.items()):
        if modality == "image" and \
            (isinstance(cond_len, tuple) or isinstance(cond_len, list)):
            # Multi-dimensional image grid. Negative entries mean "no position
            # embedding along this axis": that axis is embedded with grid size
            # 1 (zero contribution) but the output still has |x| rows for it.
            all_grid_sizes = tuple([abs(x) for x in cond_len])
            embed_grid_sizes = tuple([x if x > 0 else 1 for x in cond_len])
            cond_sincos_embed = get_nd_sincos_pos_embed_from_grid(pos_embed_dim, embed_grid_sizes)
            cond_pos_embed = np.zeros(all_grid_sizes + (embed_dim, ))
            cond_pos_embed[..., -pos_embed_dim:] += cond_sincos_embed
            cond_pos_embed = cond_pos_embed.reshape((-1, embed_dim))
        else:
            # 1-D modality. A negative length embeds a single position that is
            # broadcast over |cond_len| rows.
            cond_sincos_embed = get_1d_sincos_pos_embed_from_grid(pos_embed_dim,
                                                                  torch.arange(cond_len if cond_len > 0 else 1))
            cond_pos_embed = np.zeros((abs(cond_len), embed_dim))
            cond_pos_embed[:, -pos_embed_dim:] += cond_sincos_embed
        # Add the (possibly zero) modality identifier embedding to every row.
        cond_pos_embed += modality_pos_embed[idx]
        c_pos_emb = np.concatenate([c_pos_emb, cond_pos_embed], axis=0)

    return c_pos_emb
|
||||
156
RDT/lerobot2rdt/models/rdt/model.py
Normal file
156
RDT/lerobot2rdt/models/rdt/model.py
Normal file
@ -0,0 +1,156 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
# --------------------------------------------------------
|
||||
# References:
|
||||
# DiT: https://github.com/facebookresearch/DiT
|
||||
# GLIDE: https://github.com/openai/glide-text2im
|
||||
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
|
||||
# --------------------------------------------------------
|
||||
from collections import OrderedDict
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from pathlib import Path
|
||||
import sys, os
|
||||
# get current workspace
|
||||
current_file = Path(__file__)
|
||||
sys.path.append(str(current_file.parent.parent))
|
||||
|
||||
from rdt.blocks import (FinalLayer, RDTBlock, TimestepEmbedder, get_1d_sincos_pos_embed_from_grid,
|
||||
get_multimodal_cond_pos_embed)
|
||||
|
||||
|
||||
class RDT(nn.Module):
|
||||
"""
|
||||
Class for Robotics Diffusion Transformers.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
output_dim=128,
|
||||
horizon=32,
|
||||
hidden_size=1152,
|
||||
depth=28,
|
||||
num_heads=16,
|
||||
max_lang_cond_len=1024,
|
||||
img_cond_len=4096,
|
||||
lang_pos_embed_config=None,
|
||||
img_pos_embed_config=None,
|
||||
dtype=torch.bfloat16):
|
||||
super().__init__()
|
||||
self.horizon = horizon
|
||||
self.hidden_size = hidden_size
|
||||
self.max_lang_cond_len = max_lang_cond_len
|
||||
self.img_cond_len = img_cond_len
|
||||
self.dtype = dtype
|
||||
self.lang_pos_embed_config = lang_pos_embed_config
|
||||
self.img_pos_embed_config = img_pos_embed_config
|
||||
|
||||
self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
|
||||
self.freq_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
|
||||
|
||||
# We will use trainable sin-cos embeddings
|
||||
# [timestep; state; action]
|
||||
self.x_pos_embed = nn.Parameter(torch.zeros(1, horizon + 3, hidden_size))
|
||||
# Language conditions
|
||||
self.lang_cond_pos_embed = nn.Parameter(torch.zeros(1, max_lang_cond_len, hidden_size))
|
||||
# Image conditions
|
||||
self.img_cond_pos_embed = nn.Parameter(torch.zeros(1, img_cond_len, hidden_size))
|
||||
|
||||
self.blocks = nn.ModuleList([RDTBlock(hidden_size, num_heads) for _ in range(depth)])
|
||||
self.final_layer = FinalLayer(hidden_size, output_dim)
|
||||
self.initialize_weights()
|
||||
|
||||
def initialize_weights(self):
|
||||
# Initialize transformer layers:
|
||||
def _basic_init(module):
|
||||
if isinstance(module, nn.Linear):
|
||||
torch.nn.init.xavier_uniform_(module.weight)
|
||||
if module.bias is not None:
|
||||
nn.init.constant_(module.bias, 0)
|
||||
|
||||
self.apply(_basic_init)
|
||||
|
||||
# Initialize pos_embed by sin-cos embedding
|
||||
x_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
|
||||
mm_cond_lens=OrderedDict([
|
||||
('timestep', 1),
|
||||
('ctrl_freq', 1),
|
||||
('state', 1),
|
||||
('action', self.horizon),
|
||||
]))
|
||||
self.x_pos_embed.data.copy_(torch.from_numpy(x_pos_embed).float().unsqueeze(0))
|
||||
|
||||
if self.lang_pos_embed_config is None:
|
||||
lang_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size,
|
||||
torch.arange(self.max_lang_cond_len))
|
||||
else:
|
||||
lang_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
|
||||
mm_cond_lens=OrderedDict(self.lang_pos_embed_config),
|
||||
embed_modality=False)
|
||||
self.lang_cond_pos_embed.data.copy_(torch.from_numpy(lang_cond_pos_embed).float().unsqueeze(0))
|
||||
|
||||
if self.img_pos_embed_config is None:
|
||||
img_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.img_cond_len))
|
||||
else:
|
||||
img_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
|
||||
mm_cond_lens=OrderedDict(self.img_pos_embed_config),
|
||||
embed_modality=False)
|
||||
self.img_cond_pos_embed.data.copy_(torch.from_numpy(img_cond_pos_embed).float().unsqueeze(0))
|
||||
|
||||
# Initialize timestep and control freq embedding MLP
|
||||
nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
|
||||
nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
|
||||
nn.init.normal_(self.freq_embedder.mlp[0].weight, std=0.02)
|
||||
nn.init.normal_(self.freq_embedder.mlp[2].weight, std=0.02)
|
||||
|
||||
# Initialize the final layer: zero-out the final linear layer
|
||||
nn.init.constant_(self.final_layer.ffn_final.fc2.weight, 0)
|
||||
nn.init.constant_(self.final_layer.ffn_final.fc2.bias, 0)
|
||||
|
||||
# Move all the params to given data type:
|
||||
self.to(self.dtype)
|
||||
|
||||
def forward(self, x, freq, t, lang_c, img_c, lang_mask=None, img_mask=None):
    """
    Forward pass of RDT.

    x: (B, T, D), state + action token sequence, T = horizon + 1,
        dimension D is assumed to be the same as the hidden size.
    freq: (B,), a scalar indicating control frequency.
    t: (B,) or (1,), diffusion timesteps.
    lang_c: (B, L_lang, D) or None, language condition tokens (variable length),
        dimension D is assumed to be the same as the hidden size.
    img_c: (B, L_img, D) or None, image condition tokens (fixed length),
        dimension D is assumed to be the same as the hidden size.
    lang_mask: (B, L_lang) or None, language condition mask (True for valid).
    img_mask: (B, L_img) or None, image condition mask (True for valid).

    Returns: (B, horizon, out_channels) — only the action tokens.
    """
    # Embed the diffusion timestep and the control frequency as single tokens.
    t = self.t_embedder(t).unsqueeze(1)  # (B, 1, D) or (1, 1, D)
    freq = self.freq_embedder(freq).unsqueeze(1)  # (B, 1, D)
    # Append timestep to the input tokens.
    # A (1, 1, D) timestep embedding is shared across the batch.
    if t.shape[0] == 1:
        t = t.expand(x.shape[0], -1, -1)
    # Two extra tokens (timestep + ctrl_freq) are prepended.
    x = torch.cat([t, freq, x], dim=1)  # (B, T+2, D)

    # Add multimodal position embeddings
    x = x + self.x_pos_embed
    # Note the lang is of variable length, so slice the embedding to match.
    lang_c = lang_c + self.lang_cond_pos_embed[:, :lang_c.shape[1]]
    img_c = img_c + self.img_cond_pos_embed

    # Forward pass: blocks alternate between language and image conditioning
    # (even-indexed blocks attend to lang_c, odd-indexed to img_c).
    conds = [lang_c, img_c]
    masks = [lang_mask, img_mask]
    for i, block in enumerate(self.blocks):
        c, mask = conds[i % 2], masks[i % 2]
        x = block(x, c, mask)  # (B, T+2, D)
    # Inject the language condition at the final layer
    x = self.final_layer(x)  # (B, T+2, out_channels)

    # Only preserve the action tokens (drop timestep, ctrl_freq and state tokens)
    x = x[:, -self.horizon:]
    return x
|
||||
246
RDT/lerobot2rdt/models/rdt_runner.py
Normal file
246
RDT/lerobot2rdt/models/rdt_runner.py
Normal file
@ -0,0 +1,246 @@
|
||||
import re, sys, os
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
|
||||
from diffusers.schedulers.scheduling_dpmsolver_multistep import \
|
||||
DPMSolverMultistepScheduler
|
||||
|
||||
from pathlib import Path
|
||||
# get current workspace
|
||||
current_file = Path(__file__)
|
||||
sys.path.append(os.path.join(current_file.parent))
|
||||
from hub_mixin import CompatiblePyTorchModelHubMixin
|
||||
from rdt.model import RDT
|
||||
|
||||
|
||||
class RDTRunner(nn.Module,
                CompatiblePyTorchModelHubMixin,
                repo_url="https://huggingface.co/robotics-diffusion-transformer/rdt-1b"):
    """Training/inference wrapper around the RDT diffusion transformer.

    Owns the RDT backbone, the condition adaptors that project language /
    image / state tokens into the hidden size, and two noise schedulers:
    DDPM (used by `compute_loss` for the forward diffusion process) and
    DPM-Solver++ multistep (used by `conditional_sample` at inference).
    """

    def __init__(self,
                 *,
                 action_dim,
                 pred_horizon,
                 config,
                 lang_token_dim,
                 img_token_dim,
                 state_token_dim,
                 max_lang_cond_len,
                 img_cond_len,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        """
        action_dim: dimension of one action vector (also RDT's output dim).
        pred_horizon: number of future action steps predicted per sample.
        config: dict providing the 'rdt', '*_adaptor' and 'noise_scheduler'
            sub-configurations.
        lang_token_dim / img_token_dim / state_token_dim: raw token dims
            before adaptation to the hidden size.
        max_lang_cond_len: maximum language condition length.
        img_cond_len: image condition length (fixed).
        lang_pos_embed_config / img_pos_embed_config: optional multimodal
            position-embedding specs, forwarded to RDT.
        dtype: parameter dtype for the RDT backbone.
        """
        super(RDTRunner, self).__init__()
        # Create diffusion model
        hidden_size = config['rdt']['hidden_size']
        self.model = RDT(
            output_dim=action_dim,
            horizon=pred_horizon,
            hidden_size=hidden_size,
            depth=config['rdt']['depth'],
            num_heads=config['rdt']['num_heads'],
            max_lang_cond_len=max_lang_cond_len,
            img_cond_len=img_cond_len,
            lang_pos_embed_config=lang_pos_embed_config,
            img_pos_embed_config=img_pos_embed_config,
            dtype=dtype,
        )

        # Create adaptors that project each conditional input to hidden_size
        self.lang_adaptor = self.build_condition_adapter(config['lang_adaptor'],
                                                         in_features=lang_token_dim,
                                                         out_features=hidden_size)
        self.img_adaptor = self.build_condition_adapter(config['img_adaptor'],
                                                        in_features=img_token_dim,
                                                        out_features=hidden_size)
        # A `state` refers to an action or a proprioception vector
        self.state_adaptor = self.build_condition_adapter(
            config['state_adaptor'],
            in_features=state_token_dim * 2,  # state + state mask (indicator)
            out_features=hidden_size)

        # Create the noise scheduler: DDPM for training (add_noise in
        # compute_loss), DPM-Solver++ multistep for fast sampling at inference.
        noise_scheduler_config = config['noise_scheduler']
        self.noise_scheduler = DDPMScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
            clip_sample=noise_scheduler_config['clip_sample'],
        )
        self.noise_scheduler_sample = DPMSolverMultistepScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
        )

        self.num_train_timesteps = noise_scheduler_config['num_train_timesteps']
        self.num_inference_timesteps = noise_scheduler_config['num_inference_timesteps']
        self.prediction_type = noise_scheduler_config['prediction_type']

        self.pred_horizon = pred_horizon
        self.action_dim = action_dim

        # Report total parameter count (backbone + all three adaptors).
        print("Diffusion params: %e" %
              sum([p.numel() for p in self.model.parameters()] + [p.numel() for p in self.lang_adaptor.parameters()] +
                  [p.numel()
                   for p in self.img_adaptor.parameters()] + [p.numel() for p in self.state_adaptor.parameters()]))

    def build_condition_adapter(self, projector_type, in_features, out_features):
        """Build a projection module mapping `in_features` -> `out_features`.

        projector_type: 'linear' for a single nn.Linear, or 'mlp{N}x_gelu'
            for an N-layer MLP with tanh-approximated GELU activations.

        Raises ValueError for any unrecognized projector type.
        """
        projector = None
        if projector_type == 'linear':
            projector = nn.Linear(in_features, out_features)
        else:
            mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
            if mlp_gelu_match:
                mlp_depth = int(mlp_gelu_match.group(1))
                modules = [nn.Linear(in_features, out_features)]
                # Each extra depth level adds a GELU + Linear pair.
                for _ in range(1, mlp_depth):
                    modules.append(nn.GELU(approximate="tanh"))
                    modules.append(nn.Linear(out_features, out_features))
                projector = nn.Sequential(*modules)

        if projector is None:
            raise ValueError(f'Unknown projector type: {projector_type}')

        return projector

    def adapt_conditions(self, lang_tokens, img_tokens, state_tokens):
        '''
        Project raw condition tokens into the model's hidden size.

        lang_tokens: (batch_size, lang_len, lang_token_dim)
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, state_len, state_token_dim)

        return: adpated (..., hidden_size) for all input tokens
        '''
        adpated_lang = self.lang_adaptor(lang_tokens)
        adpated_img = self.img_adaptor(img_tokens)
        adpated_state = self.state_adaptor(state_tokens)

        return adpated_lang, adpated_img, adpated_state

    def conditional_sample(self, lang_cond, lang_attn_mask, img_cond, state_traj, action_mask, ctrl_freqs):
        '''
        Run the reverse-diffusion sampling loop to generate an action sequence.

        lang_cond: language conditional data, (batch_size, lang_len, hidden_size).
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_cond: image conditional data, (batch_size, img_len, hidden_size).
        state_traj: (batch_size, 1, hidden_size), state trajectory.
        action_mask: (batch_size, 1, action_dim), a 0-1 **float** tensor
            indicating the valid action dimensions.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: (batch_size, horizon, action_dim)
        '''
        device = state_traj.device
        dtype = state_traj.dtype
        # Start from pure Gaussian noise.
        noisy_action = torch.randn(size=(state_traj.shape[0], self.pred_horizon, self.action_dim),
                                   dtype=dtype,
                                   device=device)
        # Broadcast the (batch, 1, action_dim) mask over the horizon.
        action_mask = action_mask.expand(-1, self.pred_horizon, -1)

        # Set step values
        self.noise_scheduler_sample.set_timesteps(self.num_inference_timesteps)

        for t in self.noise_scheduler_sample.timesteps:
            # Prepare state-action trajectory: concatenate the action mask as
            # an indicator channel, then project via the state adaptor.
            action_traj = torch.cat([noisy_action, action_mask], dim=2)
            action_traj = self.state_adaptor(action_traj)
            state_action_traj = torch.cat([state_traj, action_traj], dim=1)

            # Predict the model output.
            # NOTE(review): img_mask is not passed here — the image condition
            # is treated as fully valid; confirm this is intended.
            model_output = self.model(state_action_traj,
                                      ctrl_freqs,
                                      t.unsqueeze(-1).to(device),
                                      lang_cond,
                                      img_cond,
                                      lang_mask=lang_attn_mask)

            # Compute previous actions: x_t -> x_t-1
            noisy_action = self.noise_scheduler_sample.step(model_output, t, noisy_action).prev_sample
            # The scheduler may change dtype; restore the working dtype.
            noisy_action = noisy_action.to(state_traj.dtype)

        # Finally apply the action mask to mask invalid action dimensions
        noisy_action = noisy_action * action_mask

        return noisy_action

    # ========= Train ============
    def compute_loss(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_gt, action_mask,
                     ctrl_freqs) -> torch.Tensor:
        '''
        Diffusion training objective: noise the ground-truth actions and
        regress the model's prediction against the target (noise or sample).

        lang_tokens: (batch_size, lang_len, lang_token_dim)
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, 1, state_token_dim)
        action_gt: (batch_size, horizon, state_token_dim), ground-truth actions for supervision
        action_mask: (batch_size, 1, state_token_dim), a 0-1 **float** tensor.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: loss_value, a scalar tensor
        '''
        batch_size = lang_tokens.shape[0]
        device = lang_tokens.device
        # Sample noise that we'll add to the actions
        noise = torch.randn(action_gt.shape, dtype=action_gt.dtype, device=device)
        # Sample random diffusion timesteps
        timesteps = torch.randint(0, self.num_train_timesteps, (batch_size, ), device=device).long()
        # Add noise to the clean actions according to the noise magnitude at each timestep
        # (this is the forward diffusion process)
        noisy_action = self.noise_scheduler.add_noise(action_gt, noise, timesteps)

        # Concatenate the state and action tokens to form the input sequence
        state_action_traj = torch.cat([state_tokens, noisy_action], dim=1)
        # Append the action mask to the input sequence
        action_mask = action_mask.expand(-1, state_action_traj.shape[1], -1)
        state_action_traj = torch.cat([state_action_traj, action_mask], dim=2)
        # Align the dimension with the hidden size
        lang_cond, img_cond, state_action_traj = self.adapt_conditions(lang_tokens, img_tokens, state_action_traj)
        # Predict the denoised result
        pred = self.model(state_action_traj, ctrl_freqs, timesteps, lang_cond, img_cond, lang_mask=lang_attn_mask)

        # The regression target depends on the scheduler's prediction type:
        # 'epsilon' regresses the injected noise, 'sample' the clean actions.
        pred_type = self.prediction_type
        if pred_type == 'epsilon':
            target = noise
        elif pred_type == 'sample':
            target = action_gt
        else:
            raise ValueError(f"Unsupported prediction type {pred_type}")
        loss = F.mse_loss(pred, target)
        return loss

    # ========= Inference ============
    def predict_action(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_mask, ctrl_freqs):
        '''
        lang_tokens: (batch_size, lang_len, lang_token_dim)
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, 1, state_token_dim)
        action_mask: (batch_size, 1, action_dim),
            which should be a 0-1 **float** tensor.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: (batch_size, horizon, action_dim), predicted action sequence
        '''
        # Prepare the state and conditions: append the mask indicator channel,
        # then project everything to the hidden size.
        state_tokens = torch.cat([state_tokens, action_mask], dim=2)
        lang_cond, img_cond, state_traj = self.adapt_conditions(lang_tokens, img_tokens, state_tokens)

        # Run sampling
        action_pred = self.conditional_sample(
            lang_cond,
            lang_attn_mask,
            img_cond,
            state_traj,
            action_mask,
            ctrl_freqs,
        )

        return action_pred

    def forward(self, *args, **kwargs) -> torch.Tensor:
        # nn.Module entry point: training forward == loss computation.
        return self.compute_loss(*args, **kwargs)
|
||||
20
RDT/lerobot2rdt/read_json.py
Normal file
20
RDT/lerobot2rdt/read_json.py
Normal file
@ -0,0 +1,20 @@
|
||||
import sys
|
||||
import json
|
||||
|
||||
def read_json_value(file_path, key):
    """Print the value stored under *key* in the JSON file at *file_path*.

    Prints the value itself when the key is present (even when the stored
    value is JSON null, i.e. Python None); otherwise prints a "not found"
    message. Nothing is returned — this is a CLI helper.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    # Use `key in data` rather than `data.get(key) is not None`: the latter
    # misreports a key whose value is legitimately null as missing.
    if key in data:
        print(data[key])
    else:
        print(f"Key '{key}' not found in {file_path}")
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry: expect exactly a file path and a key.
    args = sys.argv[1:]
    if len(args) != 2:
        print("Usage: python read_json.py <file_path> <key>")
        sys.exit(1)
    read_json_value(args[0], args[1])
|
||||
24
RDT/lerobot2rdt/requirements.txt
Normal file
24
RDT/lerobot2rdt/requirements.txt
Normal file
@ -0,0 +1,24 @@
|
||||
numpy<2.0
|
||||
packaging==24.0
|
||||
deepspeed==0.14.2
|
||||
accelerate==0.30.1
|
||||
diffusers==0.27.2
|
||||
timm==1.0.3
|
||||
transformers==4.41.0
|
||||
sentencepiece==0.2.0
|
||||
h5py==3.11.0
|
||||
opencv-python==4.9.0.80
|
||||
imgaug==0.4.0
|
||||
pytz==2022.1
|
||||
huggingface_hub==0.23.0
|
||||
pandas==2.3.3
|
||||
|
||||
# requirements_data.txt
|
||||
# tfds-nightly==4.9.4.dev202402070044
|
||||
gsutil==5.27
|
||||
tensorflow==2.15.0.post1
|
||||
pillow==10.2.0
|
||||
pyyaml==6.0.1
|
||||
tensorflow-graphics==2021.12.3
|
||||
imageio==2.34.0
|
||||
imageio-ffmpeg==0.4.9
|
||||
2
RDT/rdt-export/.dockerignore
Normal file
2
RDT/rdt-export/.dockerignore
Normal file
@ -0,0 +1,2 @@
|
||||
input/*
|
||||
output/*
|
||||
48
RDT/rdt-export/Dockerfile
Normal file
48
RDT/rdt-export/Dockerfile
Normal file
@ -0,0 +1,48 @@
|
||||
|
||||
# CUDA 11.8 + cuDNN 8 development base image (mirrored registry).
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
# Alternative mirror of the same base image:
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
WORKDIR /app

# Non-interactive apt, unbuffered Python logs, Shanghai timezone.
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai

# Switch apt to the Tsinghua mirror for faster package downloads.
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
    sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list

# Install Python 3.10 (from the deadsnakes PPA) plus runtime libraries
# needed by OpenCV/ffmpeg-based image processing.
RUN apt-get update --allow-unauthenticated && apt-get install -y \
    software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y \
    python3.10 \
    python3.10-dev \
    python3-pip \
    python3.10-distutils \
    libgl1-mesa-glx \
    libglib2.0-0 \
    wget \
    ffmpeg \
    libsm6 \
    libxext6 \
    && rm -rf /var/lib/apt/lists/*

# Make python3 point at the newly installed 3.10.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

COPY . /app/

RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple

# RUN pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
# RUN pip install torch==2.1.0 torchvision==0.16.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple

RUN pip install packaging==24.0

RUN pip install tfds-nightly==4.9.4.dev202402070044

# Pre-built flash-attention wheel shipped with the build context
# (must match the CUDA 12 / torch 2.1 / cp310 ABI of the environment).
RUN pip install flash_attn-2.7.2.post1+cu12torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl

# RUN mkdir -p /app/dataset/input /app/dataset/output

ENTRYPOINT ["bash", "deploy.sh"]
|
||||
71
RDT/rdt-export/configs/base_170M.yaml
Normal file
71
RDT/rdt-export/configs/base_170M.yaml
Normal file
@ -0,0 +1,71 @@
|
||||
common:
|
||||
# The number of historical images
|
||||
img_history_size: 2
|
||||
# The number of future actions to predict
|
||||
action_chunk_size: 64
|
||||
# The number of cameras to be used in the model
|
||||
num_cameras: 3
|
||||
# Dimension for state/action, we use the same space for both state and action
|
||||
# This MUST be equal to configs/state_vec.py
|
||||
state_dim: 128
|
||||
|
||||
|
||||
dataset:
|
||||
# We will extract the data from raw dataset
|
||||
# and store them in the disk buffer by producer
|
||||
# When training, we will read the data
|
||||
# randomly from the buffer by consumer
|
||||
# The producer will replace the data which has been
|
||||
# read by the consumer with new data
|
||||
|
||||
# The path to the buffer (at least 400GB)
|
||||
buf_path: /path/to/buffer
|
||||
# The number of chunks in the buffer
|
||||
buf_num_chunks: 512
|
||||
# The number of samples (step rather than episode) in each chunk
|
||||
buf_chunk_size: 512
|
||||
|
||||
# We will filter the episodes with length less than `epsd_len_thresh_low`
|
||||
epsd_len_thresh_low: 32
|
||||
# For those more than `epsd_len_thresh_high`,
|
||||
# we will randomly sample `epsd_len_thresh_high` steps each time we load the episode
|
||||
# to better balance the training datasets
|
||||
epsd_len_thresh_high: 2048
|
||||
# How to fit the image size
|
||||
image_aspect_ratio: pad
|
||||
# Maximum number of language tokens
|
||||
tokenizer_max_length: 1024
|
||||
|
||||
model:
|
||||
# Config for condition adaptors
|
||||
lang_adaptor: mlp2x_gelu
|
||||
img_adaptor: mlp2x_gelu
|
||||
state_adaptor: mlp3x_gelu
|
||||
lang_token_dim: 4096
|
||||
img_token_dim: 1152
|
||||
# Dim of action or proprioception vector
|
||||
# A `state` refers to an action or a proprioception vector
|
||||
state_token_dim: 128
|
||||
# Config for RDT structure
|
||||
rdt:
|
||||
# 1B: num_head 32 hidden_size 2048
|
||||
hidden_size: 1024
|
||||
depth: 14
|
||||
num_heads: 32
|
||||
cond_pos_embed_type: multimodal
|
||||
# For noise scheduler
|
||||
noise_scheduler:
|
||||
type: ddpm
|
||||
num_train_timesteps: 1000
|
||||
num_inference_timesteps: 5
|
||||
beta_schedule: squaredcos_cap_v2 # Critical choice
|
||||
prediction_type: sample
|
||||
clip_sample: False
|
||||
# For EMA (params averaging)
|
||||
# We do not use EMA currently
|
||||
ema:
|
||||
update_after_step: 0
|
||||
inv_gamma: 1.0
|
||||
power: 0.75
|
||||
min_value: 0.0
|
||||
max_value: 0.9999
|
||||
71
RDT/rdt-export/configs/base_1B.yaml
Normal file
71
RDT/rdt-export/configs/base_1B.yaml
Normal file
@ -0,0 +1,71 @@
|
||||
common:
|
||||
# The number of historical images
|
||||
img_history_size: 2
|
||||
# The number of future actions to predict
|
||||
action_chunk_size: 64
|
||||
# The number of cameras to be used in the model
|
||||
num_cameras: 3
|
||||
# Dimension for state/action, we use the same space for both state and action
|
||||
# This MUST be equal to configs/state_vec.py
|
||||
state_dim: 128
|
||||
|
||||
|
||||
dataset:
|
||||
# We will extract the data from raw dataset
|
||||
# and store them in the disk buffer by producer
|
||||
# When training, we will read the data
|
||||
# randomly from the buffer by consumer
|
||||
# The producer will replace the data which has been
|
||||
# read by the consumer with new data
|
||||
|
||||
# The path to the buffer (at least 400GB)
|
||||
buf_path: /path/to/buffer
|
||||
# The number of chunks in the buffer
|
||||
buf_num_chunks: 512
|
||||
# The number of samples (step rather than episode) in each chunk
|
||||
buf_chunk_size: 512
|
||||
|
||||
# We will filter the episodes with length less than `epsd_len_thresh_low`
|
||||
epsd_len_thresh_low: 32
|
||||
# For those more than `epsd_len_thresh_high`,
|
||||
# we will randomly sample `epsd_len_thresh_high` steps each time we load the episode
|
||||
# to better balance the training datasets
|
||||
epsd_len_thresh_high: 2048
|
||||
# How to fit the image size
|
||||
image_aspect_ratio: pad
|
||||
# Maximum number of language tokens
|
||||
tokenizer_max_length: 1024
|
||||
|
||||
model:
|
||||
# Config for condition adaptors
|
||||
lang_adaptor: mlp2x_gelu
|
||||
img_adaptor: mlp2x_gelu
|
||||
state_adaptor: mlp3x_gelu
|
||||
lang_token_dim: 4096
|
||||
img_token_dim: 1152
|
||||
# Dim of action or proprioception vector
|
||||
# A `state` refers to an action or a proprioception vector
|
||||
state_token_dim: 128
|
||||
# Config for RDT structure
|
||||
rdt:
|
||||
# 1B: num_head 32 hidden_size 2048
|
||||
hidden_size: 2048
|
||||
depth: 28
|
||||
num_heads: 32
|
||||
cond_pos_embed_type: multimodal
|
||||
# For noise scheduler
|
||||
noise_scheduler:
|
||||
type: ddpm
|
||||
num_train_timesteps: 1000
|
||||
num_inference_timesteps: 5
|
||||
beta_schedule: squaredcos_cap_v2 # Critical choice
|
||||
prediction_type: sample
|
||||
clip_sample: False
|
||||
# For EMA (params averaging)
|
||||
# We do not use EMA currently
|
||||
ema:
|
||||
update_after_step: 0
|
||||
inv_gamma: 1.0
|
||||
power: 0.75
|
||||
min_value: 0.0
|
||||
max_value: 0.9999
|
||||
@ -0,0 +1,50 @@
|
||||
{
|
||||
"A": [
|
||||
[
|
||||
-0.2691913843154907,
|
||||
-0.21995729207992554,
|
||||
-0.182277649641037
|
||||
],
|
||||
[
|
||||
0.35127854347229004,
|
||||
0.2769763469696045,
|
||||
0.17159393429756165
|
||||
]
|
||||
],
|
||||
"B": [
|
||||
[
|
||||
-0.2576896846294403,
|
||||
-0.22244493663311005,
|
||||
-0.20557966828346252
|
||||
],
|
||||
[
|
||||
0.32854634523391724,
|
||||
0.2922680974006653,
|
||||
0.17373555898666382
|
||||
]
|
||||
],
|
||||
"C": [
|
||||
[
|
||||
-0.29205888509750366,
|
||||
-0.24688798189163208,
|
||||
-0.17577645182609558
|
||||
],
|
||||
[
|
||||
0.25053921341896057,
|
||||
0.3277084231376648,
|
||||
0.16431939601898193
|
||||
]
|
||||
],
|
||||
"D": [
|
||||
[
|
||||
-0.25131964683532715,
|
||||
-0.15233077108860016,
|
||||
-0.13294968008995056
|
||||
],
|
||||
[
|
||||
0.19209328293800354,
|
||||
0.19344553351402283,
|
||||
0.1370421051979065
|
||||
]
|
||||
]
|
||||
}
|
||||
65
RDT/rdt-export/configs/dataset_control_freq.json
Normal file
65
RDT/rdt-export/configs/dataset_control_freq.json
Normal file
@ -0,0 +1,65 @@
|
||||
{
|
||||
"fractal20220817_data": 3,
|
||||
"taco_play": 15,
|
||||
"jaco_play": 10,
|
||||
"berkeley_cable_routing": 10,
|
||||
"nyu_door_opening_surprising_effectiveness": 3,
|
||||
"viola": 20,
|
||||
"berkeley_autolab_ur5": 5,
|
||||
"toto": 30,
|
||||
"kuka": 10,
|
||||
"language_table": 10,
|
||||
"columbia_cairlab_pusht_real": 10,
|
||||
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 20,
|
||||
"nyu_rot_dataset_converted_externally_to_rlds":3,
|
||||
"stanford_hydra_dataset_converted_externally_to_rlds": 10,
|
||||
"austin_buds_dataset_converted_externally_to_rlds": 20,
|
||||
"nyu_franka_play_dataset_converted_externally_to_rlds": 3,
|
||||
"maniskill_dataset_converted_externally_to_rlds": 20,
|
||||
"furniture_bench_dataset_converted_externally_to_rlds": 10,
|
||||
"ucsd_kitchen_dataset_converted_externally_to_rlds": 2,
|
||||
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 3,
|
||||
"austin_sailor_dataset_converted_externally_to_rlds": 20,
|
||||
"austin_sirius_dataset_converted_externally_to_rlds": 20,
|
||||
"bc_z": 10,
|
||||
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 10,
|
||||
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 10,
|
||||
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
|
||||
"utokyo_xarm_bimanual_converted_externally_to_rlds": 10,
|
||||
"berkeley_mvp_converted_externally_to_rlds": 5,
|
||||
"berkeley_rpt_converted_externally_to_rlds": 30,
|
||||
"kaist_nonprehensile_converted_externally_to_rlds": 10,
|
||||
"stanford_mask_vit_converted_externally_to_rlds": 0,
|
||||
"tokyo_u_lsmo_converted_externally_to_rlds": 10,
|
||||
"dlr_sara_pour_converted_externally_to_rlds": 10,
|
||||
"dlr_sara_grid_clamp_converted_externally_to_rlds": 10,
|
||||
"dlr_edan_shared_control_converted_externally_to_rlds": 5,
|
||||
"asu_table_top_converted_externally_to_rlds": 12.5,
|
||||
"stanford_robocook_converted_externally_to_rlds": 5,
|
||||
"eth_agent_affordances": 66.6,
|
||||
"imperialcollege_sawyer_wrist_cam": 10,
|
||||
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 20,
|
||||
"uiuc_d3field": 1,
|
||||
"utaustin_mutex": 20,
|
||||
"berkeley_fanuc_manipulation": 10,
|
||||
"cmu_play_fusion": 5,
|
||||
"cmu_stretch": 10,
|
||||
"berkeley_gnm_recon": 3,
|
||||
"berkeley_gnm_cory_hall": 5,
|
||||
"berkeley_gnm_sac_son": 10,
|
||||
"robo_net": 1,
|
||||
"roboturk_real_towercreation": 10,
|
||||
"roboturk_real_laundrylayout": 10,
|
||||
"roboturk_real_objectsearch": 10,
|
||||
"aloha_mobile": 50,
|
||||
"aloha_static": 50,
|
||||
"roboset": 5,
|
||||
"droid": 15,
|
||||
"fmb": 10,
|
||||
"dobbe": 30,
|
||||
"qut_dexterous_manpulation": 30,
|
||||
"agilex": 25,
|
||||
"rh20t": 10,
|
||||
"calvin": 30,
|
||||
"bridgev2": 5
|
||||
}
|
||||
575
RDT/rdt-export/configs/dataset_img_keys.json
Normal file
575
RDT/rdt-export/configs/dataset_img_keys.json
Normal file
@ -0,0 +1,575 @@
|
||||
{
|
||||
"fractal20220817_data": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[
|
||||
1,0,0,0
|
||||
]
|
||||
},
|
||||
"taco_play": {
|
||||
"image_keys": [
|
||||
"rgb_static",
|
||||
"rgb_gripper",
|
||||
"rgb_static",
|
||||
"rgb_static"
|
||||
],
|
||||
"image_mask":[
|
||||
1,1,0,0
|
||||
]
|
||||
},
|
||||
"jaco_play": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image_wrist",
|
||||
"image_wrist",
|
||||
"image_wrist"
|
||||
],
|
||||
"image_mask":[
|
||||
1,1,0,0
|
||||
]
|
||||
},
|
||||
"berkeley_cable_routing": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist45_image",
|
||||
"wrist225_image",
|
||||
"top_image"
|
||||
],
|
||||
"image_mask":[1,1,0,1]
|
||||
},
|
||||
"nyu_door_opening_surprising_effectiveness": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"viola": {
|
||||
"image_keys": [
|
||||
"agentview_rgb",
|
||||
"eye_in_hand_rgb",
|
||||
"eye_in_hand_rgb",
|
||||
"eye_in_hand_rgb"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"berkeley_autolab_ur5": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"hand_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"toto": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"kuka": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"language_table": {
|
||||
"image_keys": [
|
||||
"rgb",
|
||||
"rgb",
|
||||
"rgb",
|
||||
"rgb"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"columbia_cairlab_pusht_real": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"nyu_rot_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"stanford_hydra_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"austin_buds_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"nyu_franka_play_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image_additional_view",
|
||||
"image_additional_view",
|
||||
"image_additional_view"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"maniskill_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"furniture_bench_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"ucsd_kitchen_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"austin_sailor_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"austin_sirius_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"bc_z": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"image2"
|
||||
],
|
||||
"image_mask":[1,1,0,1]
|
||||
},
|
||||
"utokyo_xarm_bimanual_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"berkeley_mvp_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"hand_image"
|
||||
],
|
||||
"image_mask":[0,1,0,0]
|
||||
},
|
||||
"berkeley_rpt_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"hand_image"
|
||||
],
|
||||
"image_mask":[0,1,0,0]
|
||||
},
|
||||
"kaist_nonprehensile_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"stanford_mask_vit_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"tokyo_u_lsmo_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"dlr_sara_pour_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"dlr_sara_grid_clamp_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"dlr_edan_shared_control_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"asu_table_top_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"stanford_robocook_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image_2",
|
||||
"image_1",
|
||||
"image_3",
|
||||
"image_4"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"eth_agent_affordances": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"imperialcollege_sawyer_wrist_cam": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[0,1,0,0]
|
||||
},
|
||||
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"uiuc_d3field": {
|
||||
"image_keys": [
|
||||
"image_1",
|
||||
"image_2",
|
||||
"image_3",
|
||||
"image_4"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"utaustin_mutex": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"berkeley_fanuc_manipulation": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"cmu_play_fusion": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"cmu_stretch": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"berkeley_gnm_recon": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"berkeley_gnm_cory_hall": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"berkeley_gnm_sac_son": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"robo_net": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image1",
|
||||
"image2",
|
||||
"image2"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"roboturk_real_towercreation": {
|
||||
"image_keys": [
|
||||
"top_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"roboturk_real_laundrylayout": {
|
||||
"image_keys": [
|
||||
"top_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"roboturk_real_objectsearch": {
|
||||
"image_keys": [
|
||||
"top_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"aloha_mobile": {
|
||||
"image_keys": [
|
||||
"cam_high",
|
||||
"cam_right_wrist",
|
||||
"cam_left_wrist",
|
||||
"cam_right_wrist"
|
||||
],
|
||||
"image_mask":[1,1,1,0]
|
||||
},
|
||||
"aloha_static": {
|
||||
"image_keys": [
|
||||
"cam_high",
|
||||
"cam_right_wrist",
|
||||
"cam_left_wrist",
|
||||
"cam_low"
|
||||
],
|
||||
"image_mask":[1,1,1,1]
|
||||
},
|
||||
"roboset": {
|
||||
"image_keys": [
|
||||
"rgb_top",
|
||||
"rgb_right",
|
||||
"rgb_left",
|
||||
"rgb_right"
|
||||
],
|
||||
"image_mask":[1,1,1,0]
|
||||
},
|
||||
"droid": {
|
||||
"image_keys": [
|
||||
"exterior_image_1_left",
|
||||
"wrist_image_left",
|
||||
"wrist_image_left",
|
||||
"exterior_image_2_left"
|
||||
],
|
||||
"image_mask":[1,1,0,1]
|
||||
},
|
||||
"fmb": {
|
||||
"image_keys": [
|
||||
"image_side_1",
|
||||
"image_wrist_1",
|
||||
"image_wrist_1",
|
||||
"image_side_2"
|
||||
],
|
||||
"image_mask":[1,1,0,1]
|
||||
},
|
||||
"dobbe": {
|
||||
"image_keys": [
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[0,1,0,0]
|
||||
},
|
||||
"qut_dexterous_manpulation": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"agilex": {
|
||||
"image_keys": [
|
||||
"cam_high",
|
||||
"cam_right_wrist",
|
||||
"cam_left_wrist",
|
||||
"cam_right_wrist"
|
||||
],
|
||||
"image_mask":[1,1,1,0]
|
||||
},
|
||||
"rh20t": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"calvin": {
|
||||
"image_keys": [
|
||||
"rgb_static",
|
||||
"rgb_gripper",
|
||||
"rgb_gripper",
|
||||
"rgb_gripper"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"bridgev2": {
|
||||
"image_keys": [
|
||||
"images0",
|
||||
"images0",
|
||||
"images0",
|
||||
"images0"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
}
|
||||
}
|
||||
525
RDT/rdt-export/configs/dataset_stat.json
Normal file
525
RDT/rdt-export/configs/dataset_stat.json
Normal file
@ -0,0 +1,525 @@
|
||||
{
|
||||
"agilex": {
|
||||
"dataset_name": "agilex",
|
||||
"state_mean": [
|
||||
-0.0036545392947090432,
|
||||
-0.2773659935760079,
|
||||
0.3147616748061523,
|
||||
0.3813313179910183,
|
||||
0.04028575944090457,
|
||||
0.034888520819083294,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0
|
||||
],
|
||||
"state_std": [
|
||||
0.05763674563578847,
|
||||
0.2580181064167735,
|
||||
0.19785840483767897,
|
||||
0.05020347749331385,
|
||||
0.054529239104671424,
|
||||
0.05020521339363586,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0
|
||||
],
|
||||
"state_min": [
|
||||
-0.17447535196940103,
|
||||
-0.5522612677680121,
|
||||
-0.3340397516886393,
|
||||
0.21861712137858072,
|
||||
-0.09725829230414497,
|
||||
0.003396739231215583,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0
|
||||
],
|
||||
"state_max": [
|
||||
0.21961932712131077,
|
||||
0.30613206227620443,
|
||||
0.5444545321994357,
|
||||
0.4866888682047526,
|
||||
0.31486290825737845,
|
||||
0.3355223337809245,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0
|
||||
]
|
||||
}
|
||||
}
|
||||
3
RDT/rdt-export/configs/finetune_datasets.json
Normal file
3
RDT/rdt-export/configs/finetune_datasets.json
Normal file
@ -0,0 +1,3 @@
|
||||
[
|
||||
"agilex"
|
||||
]
|
||||
3
RDT/rdt-export/configs/finetune_sample_weights.json
Normal file
3
RDT/rdt-export/configs/finetune_sample_weights.json
Normal file
@ -0,0 +1,3 @@
|
||||
{
|
||||
"agilex": 100
|
||||
}
|
||||
48
RDT/rdt-export/configs/pretrain_datasets.json
Normal file
48
RDT/rdt-export/configs/pretrain_datasets.json
Normal file
@ -0,0 +1,48 @@
|
||||
[
|
||||
"fractal20220817_data",
|
||||
"jaco_play",
|
||||
"taco_play",
|
||||
"berkeley_cable_routing",
|
||||
"viola",
|
||||
"berkeley_autolab_ur5",
|
||||
"toto",
|
||||
"nyu_door_opening_surprising_effectiveness",
|
||||
"columbia_cairlab_pusht_real",
|
||||
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds",
|
||||
"austin_buds_dataset_converted_externally_to_rlds",
|
||||
"kuka",
|
||||
"utokyo_xarm_bimanual_converted_externally_to_rlds",
|
||||
"stanford_hydra_dataset_converted_externally_to_rlds",
|
||||
"maniskill_dataset_converted_externally_to_rlds",
|
||||
"ucsd_kitchen_dataset_converted_externally_to_rlds",
|
||||
"ucsd_pick_and_place_dataset_converted_externally_to_rlds",
|
||||
"austin_sailor_dataset_converted_externally_to_rlds",
|
||||
"austin_sirius_dataset_converted_externally_to_rlds",
|
||||
"bc_z",
|
||||
"utokyo_pr2_opening_fridge_converted_externally_to_rlds",
|
||||
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds",
|
||||
"utokyo_xarm_pick_and_place_converted_externally_to_rlds",
|
||||
"berkeley_mvp_converted_externally_to_rlds",
|
||||
"berkeley_rpt_converted_externally_to_rlds",
|
||||
"kaist_nonprehensile_converted_externally_to_rlds",
|
||||
"tokyo_u_lsmo_converted_externally_to_rlds",
|
||||
"dlr_sara_grid_clamp_converted_externally_to_rlds",
|
||||
"stanford_robocook_converted_externally_to_rlds",
|
||||
"imperialcollege_sawyer_wrist_cam",
|
||||
"iamlab_cmu_pickup_insert_converted_externally_to_rlds",
|
||||
"utaustin_mutex",
|
||||
"berkeley_fanuc_manipulation",
|
||||
"cmu_play_fusion",
|
||||
"language_table",
|
||||
"furniture_bench_dataset_converted_externally_to_rlds",
|
||||
"droid",
|
||||
"fmb",
|
||||
"dobbe",
|
||||
"qut_dexterous_manpulation",
|
||||
"aloha_mobile",
|
||||
"aloha_static",
|
||||
"roboset",
|
||||
"rh20t",
|
||||
"calvin",
|
||||
"bridgev2"
|
||||
]
|
||||
48
RDT/rdt-export/configs/pretrain_sample_weights.json
Normal file
48
RDT/rdt-export/configs/pretrain_sample_weights.json
Normal file
@ -0,0 +1,48 @@
|
||||
{
|
||||
"fractal20220817_data": 271,
|
||||
"taco_play": 60,
|
||||
"jaco_play": 33,
|
||||
"berkeley_cable_routing": 8,
|
||||
"nyu_door_opening_surprising_effectiveness": 10,
|
||||
"viola": 12,
|
||||
"berkeley_autolab_ur5": 32,
|
||||
"toto": 32,
|
||||
"kuka": 50,
|
||||
"language_table": 100,
|
||||
"columbia_cairlab_pusht_real": 12,
|
||||
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 55,
|
||||
"stanford_hydra_dataset_converted_externally_to_rlds": 24,
|
||||
"austin_buds_dataset_converted_externally_to_rlds": 7,
|
||||
"maniskill_dataset_converted_externally_to_rlds": 174,
|
||||
"furniture_bench_dataset_converted_externally_to_rlds": 71,
|
||||
"ucsd_kitchen_dataset_converted_externally_to_rlds": 12,
|
||||
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 37,
|
||||
"austin_sailor_dataset_converted_externally_to_rlds": 15,
|
||||
"austin_sirius_dataset_converted_externally_to_rlds": 24,
|
||||
"bc_z": 208,
|
||||
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 9,
|
||||
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 15,
|
||||
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
|
||||
"utokyo_xarm_bimanual_converted_externally_to_rlds": 1,
|
||||
"berkeley_mvp_converted_externally_to_rlds": 22,
|
||||
"berkeley_rpt_converted_externally_to_rlds": 30,
|
||||
"kaist_nonprehensile_converted_externally_to_rlds": 14,
|
||||
"tokyo_u_lsmo_converted_externally_to_rlds": 7,
|
||||
"dlr_sara_grid_clamp_converted_externally_to_rlds": 1,
|
||||
"stanford_robocook_converted_externally_to_rlds": 50,
|
||||
"imperialcollege_sawyer_wrist_cam": 13,
|
||||
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 25,
|
||||
"utaustin_mutex": 39,
|
||||
"berkeley_fanuc_manipulation": 20,
|
||||
"cmu_play_fusion": 24,
|
||||
"droid": 303,
|
||||
"fmb": 42,
|
||||
"dobbe": 36,
|
||||
"qut_dexterous_manpulation": 14,
|
||||
"aloha_mobile": 150,
|
||||
"aloha_static": 150,
|
||||
"roboset": 135,
|
||||
"rh20t": 331,
|
||||
"calvin": 100,
|
||||
"bridgev2": 224
|
||||
}
|
||||
126
RDT/rdt-export/configs/state_vec.py
Normal file
126
RDT/rdt-export/configs/state_vec.py
Normal file
@ -0,0 +1,126 @@
|
||||
STATE_VEC_IDX_MAPPING = {
|
||||
# [0, 10): right arm joint positions
|
||||
**{
|
||||
"arm_joint_{}_pos".format(i): i
|
||||
for i in range(10)
|
||||
},
|
||||
**{
|
||||
"right_arm_joint_{}_pos".format(i): i
|
||||
for i in range(10)
|
||||
},
|
||||
# [10, 15): right gripper joint positions
|
||||
**{
|
||||
"gripper_joint_{}_pos".format(i): i + 10
|
||||
for i in range(5)
|
||||
},
|
||||
**{
|
||||
"right_gripper_joint_{}_pos".format(i): i + 10
|
||||
for i in range(5)
|
||||
},
|
||||
"gripper_open": 10, # alias of right_gripper_joint_0_pos
|
||||
"right_gripper_open": 10,
|
||||
# [15, 25): right arm joint velocities
|
||||
**{
|
||||
"arm_joint_{}_vel".format(i): i + 15
|
||||
for i in range(10)
|
||||
},
|
||||
**{
|
||||
"right_arm_joint_{}_vel".format(i): i + 15
|
||||
for i in range(10)
|
||||
},
|
||||
# [25, 30): right gripper joint velocities
|
||||
**{
|
||||
"gripper_joint_{}_vel".format(i): i + 25
|
||||
for i in range(5)
|
||||
},
|
||||
**{
|
||||
"right_gripper_joint_{}_vel".format(i): i + 25
|
||||
for i in range(5)
|
||||
},
|
||||
"gripper_open_vel": 25, # alias of right_gripper_joint_0_vel
|
||||
"right_gripper_open_vel": 25,
|
||||
# [30, 33): right end effector positions
|
||||
"eef_pos_x": 30,
|
||||
"right_eef_pos_x": 30,
|
||||
"eef_pos_y": 31,
|
||||
"right_eef_pos_y": 31,
|
||||
"eef_pos_z": 32,
|
||||
"right_eef_pos_z": 32,
|
||||
# [33, 39): right end effector 6D pose
|
||||
"eef_angle_0": 33,
|
||||
"right_eef_angle_0": 33,
|
||||
"eef_angle_1": 34,
|
||||
"right_eef_angle_1": 34,
|
||||
"eef_angle_2": 35,
|
||||
"right_eef_angle_2": 35,
|
||||
"eef_angle_3": 36,
|
||||
"right_eef_angle_3": 36,
|
||||
"eef_angle_4": 37,
|
||||
"right_eef_angle_4": 37,
|
||||
"eef_angle_5": 38,
|
||||
"right_eef_angle_5": 38,
|
||||
# [39, 42): right end effector velocities
|
||||
"eef_vel_x": 39,
|
||||
"right_eef_vel_x": 39,
|
||||
"eef_vel_y": 40,
|
||||
"right_eef_vel_y": 40,
|
||||
"eef_vel_z": 41,
|
||||
"right_eef_vel_z": 41,
|
||||
# [42, 45): right end effector angular velocities
|
||||
"eef_angular_vel_roll": 42,
|
||||
"right_eef_angular_vel_roll": 42,
|
||||
"eef_angular_vel_pitch": 43,
|
||||
"right_eef_angular_vel_pitch": 43,
|
||||
"eef_angular_vel_yaw": 44,
|
||||
"right_eef_angular_vel_yaw": 44,
|
||||
# [45, 50): reserved
|
||||
# [50, 60): left arm joint positions
|
||||
**{
|
||||
"left_arm_joint_{}_pos".format(i): i + 50
|
||||
for i in range(10)
|
||||
},
|
||||
# [60, 65): left gripper joint positions
|
||||
**{
|
||||
"left_gripper_joint_{}_pos".format(i): i + 60
|
||||
for i in range(5)
|
||||
},
|
||||
"left_gripper_open": 60, # alias of left_gripper_joint_0_pos
|
||||
# [65, 75): left arm joint velocities
|
||||
**{
|
||||
"left_arm_joint_{}_vel".format(i): i + 65
|
||||
for i in range(10)
|
||||
},
|
||||
# [75, 80): left gripper joint velocities
|
||||
**{
|
||||
"left_gripper_joint_{}_vel".format(i): i + 75
|
||||
for i in range(5)
|
||||
},
|
||||
"left_gripper_open_vel": 75, # alias of left_gripper_joint_0_vel
|
||||
# [80, 83): left end effector positions
|
||||
"left_eef_pos_x": 80,
|
||||
"left_eef_pos_y": 81,
|
||||
"left_eef_pos_z": 82,
|
||||
# [83, 89): left end effector 6D pose
|
||||
"left_eef_angle_0": 83,
|
||||
"left_eef_angle_1": 84,
|
||||
"left_eef_angle_2": 85,
|
||||
"left_eef_angle_3": 86,
|
||||
"left_eef_angle_4": 87,
|
||||
"left_eef_angle_5": 88,
|
||||
# [89, 92): left end effector velocities
|
||||
"left_eef_vel_x": 89,
|
||||
"left_eef_vel_y": 90,
|
||||
"left_eef_vel_z": 91,
|
||||
# [92, 95): left end effector angular velocities
|
||||
"left_eef_angular_vel_roll": 92,
|
||||
"left_eef_angular_vel_pitch": 93,
|
||||
"left_eef_angular_vel_yaw": 94,
|
||||
# [95, 100): reserved
|
||||
# [100, 102): base linear velocities
|
||||
"base_vel_x": 100,
|
||||
"base_vel_y": 101,
|
||||
# [102, 103): base angular velocities
|
||||
"base_angular_vel": 102,
|
||||
# [103, 128): reserved
|
||||
}
|
||||
STATE_VEC_LEN = 128
|
||||
14
RDT/rdt-export/configs/zero2.json
Normal file
14
RDT/rdt-export/configs/zero2.json
Normal file
@ -0,0 +1,14 @@
|
||||
{
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 1e9
|
||||
}
|
||||
}
|
||||
993
RDT/rdt-export/export.py
Normal file
993
RDT/rdt-export/export.py
Normal file
@ -0,0 +1,993 @@
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
from time import time
|
||||
from collections import OrderedDict
|
||||
from dataclasses import dataclass
|
||||
import yaml
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
import h5py
|
||||
from PIL import Image as PImage
|
||||
|
||||
import onnx
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torchvision import transforms
|
||||
|
||||
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
|
||||
from diffusers.schedulers.scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler
|
||||
|
||||
from scripts.agilex_model import create_model
|
||||
from configs.state_vec import STATE_VEC_IDX_MAPPING
|
||||
from models.hub_mixin import CompatiblePyTorchModelHubMixin
|
||||
from models.rdt.blocks import (FinalLayer, RDTBlock, TimestepEmbedder, get_1d_sincos_pos_embed_from_grid, get_multimodal_cond_pos_embed)
|
||||
from models.multimodal_encoder.siglip_encoder import SiglipVisionTower
|
||||
from models.multimodal_encoder.t5_encoder import T5Embedder
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
datefmt='%H:%M:%S')
|
||||
logger = logging.getLogger("RDT_EXPORT")
|
||||
|
||||
os.environ["WANDB_MODE"] = "disabled"
|
||||
|
||||
@dataclass
|
||||
class ExportConfig:
|
||||
task_id: str = None
|
||||
output_path: str = None
|
||||
model_path: str = None
|
||||
calibration_num: int = 100
|
||||
lang_calibration_num: int = 1
|
||||
dataset_path: str = None
|
||||
gpu_id: str = "0"
|
||||
march: str = None
|
||||
model_type: str = None
|
||||
pretrained_vision_encoder_name_or_path: str = None
|
||||
ctrl_freq: int = 25
|
||||
cal_data_device: str = "cuda"
|
||||
|
||||
|
||||
AGILEX_STATE_INDICES = [
|
||||
STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"] for i in range(6)
|
||||
]
|
||||
|
||||
def dump_img_adaptor(img_tokens):
|
||||
global img_adaptor_cal_ws
|
||||
global dump_cnt, dump_dataset_name
|
||||
np.save(os.path.join(img_adaptor_cal_ws, f"img_adaptor_{dump_dataset_name}_{dump_cnt}.npy"), img_tokens.float().contiguous().cpu().detach().numpy())
|
||||
|
||||
def dump_dit(state_action_traj, ctrl_freqs, t, lang_cond, img_cond, lang_attn_mask):
|
||||
t_str = str(t)
|
||||
x = state_action_traj.float().contiguous().cpu().detach().numpy()
|
||||
freq = ctrl_freqs.float().contiguous().cpu().detach().numpy().astype(np.int32).copy()
|
||||
t_ = t.float().contiguous().cpu().detach().numpy()
|
||||
t_ = np.expand_dims(t_.astype(np.int32), axis=0).copy()
|
||||
lang_c = lang_cond.float().contiguous().cpu().detach().numpy()
|
||||
img_c = img_cond.float().contiguous().cpu().detach().numpy()
|
||||
lang_mask = lang_attn_mask.float().contiguous().cpu().detach().numpy()
|
||||
pad_rows = 64 - lang_mask.shape[1]
|
||||
padded = np.pad(lang_mask, ((0,0), (0,pad_rows)), mode="constant")
|
||||
mask_float = np.where(padded, 0.0, -512.0).astype(np.float32)
|
||||
lang_cond_padded = np.pad(lang_c, pad_width=((0, 0), (0, pad_rows), (0,0)), mode="constant", constant_values=0)
|
||||
global dit_cal_path_x, dit_cal_path_freq, dit_cal_path_t, dit_cal_path_lang_c, dit_cal_path_img_c, dit_cal_path_lang_mask
|
||||
global dump_cnt, dump_dataset_name
|
||||
np.save(os.path.join(dit_cal_path_x, f"x_{t_str}_{dump_dataset_name}_{dump_cnt}.npy"), x)
|
||||
np.save(os.path.join(dit_cal_path_freq, f"freq_{t_str}_{dump_dataset_name}_{dump_cnt}.npy"), freq)
|
||||
np.save(os.path.join(dit_cal_path_t, f"t_{t_str}_{dump_dataset_name}_{dump_cnt}.npy"), t_)
|
||||
np.save(os.path.join(dit_cal_path_lang_c, f"lang_c_{t_str}_{dump_dataset_name}_{dump_cnt}.npy"), lang_cond_padded)
|
||||
np.save(os.path.join(dit_cal_path_img_c, f"img_{t_str}_{dump_dataset_name}_{dump_cnt}.npy"), img_c)
|
||||
np.save(os.path.join(dit_cal_path_lang_mask, f"lang_mask_{t_str}_{dump_dataset_name}_{dump_cnt}.npy"), mask_float)
|
||||
|
||||
def create_dump_model(args, **kwargs):
|
||||
# left_arm_dim, right_arm_dim = (args["arm_dim"]["left_arm_dim"], args["arm_dim"]["right_arm_dim"],)
|
||||
# AGILEX_STATE_INDICES = ([STATE_VEC_IDX_MAPPING[f"left_arm_joint_{i}_pos"]
|
||||
# for i in range(left_arm_dim)] + [STATE_VEC_IDX_MAPPING["left_gripper_open"]] +
|
||||
# [STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"]
|
||||
# for i in range(right_arm_dim)] + [STATE_VEC_IDX_MAPPING[f"right_gripper_open"]])
|
||||
|
||||
model = RoboticDiffusionTransformerModel_Dump(args, **kwargs)
|
||||
pretrained = kwargs.get("pretrained", None)
|
||||
if pretrained is not None and os.path.isfile(pretrained):
|
||||
model.load_pretrained_weights(pretrained)
|
||||
return model
|
||||
|
||||
class RDT_Dump(nn.Module):
|
||||
def __init__(self,
|
||||
output_dim=128,
|
||||
horizon=32,
|
||||
hidden_size=1152,
|
||||
depth=28,
|
||||
num_heads=16,
|
||||
max_lang_cond_len=1024,
|
||||
img_cond_len=4096,
|
||||
lang_pos_embed_config=None,
|
||||
img_pos_embed_config=None,
|
||||
dtype=torch.bfloat16):
|
||||
super().__init__()
|
||||
self.horizon = horizon
|
||||
self.hidden_size = hidden_size
|
||||
self.max_lang_cond_len = max_lang_cond_len
|
||||
self.img_cond_len = img_cond_len
|
||||
self.dtype = dtype
|
||||
self.lang_pos_embed_config = lang_pos_embed_config
|
||||
self.img_pos_embed_config = img_pos_embed_config
|
||||
self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
|
||||
self.freq_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
|
||||
# We will use trainable sin-cos embeddings
|
||||
# [timestep; state; action]
|
||||
self.x_pos_embed = nn.Parameter(torch.zeros(1, horizon + 3, hidden_size))
|
||||
# Language conditions
|
||||
self.lang_cond_pos_embed = nn.Parameter(torch.zeros(1, max_lang_cond_len, hidden_size))
|
||||
# Image conditions
|
||||
self.img_cond_pos_embed = nn.Parameter(torch.zeros(1, img_cond_len, hidden_size))
|
||||
self.blocks = nn.ModuleList([RDTBlock(hidden_size, num_heads) for _ in range(depth)])
|
||||
self.final_layer = FinalLayer(hidden_size, output_dim)
|
||||
self.initialize_weights()
|
||||
def initialize_weights(self):
|
||||
# Initialize transformer layers:
|
||||
def _basic_init(module):
|
||||
if isinstance(module, nn.Linear):
|
||||
torch.nn.init.xavier_uniform_(module.weight)
|
||||
if module.bias is not None:
|
||||
nn.init.constant_(module.bias, 0)
|
||||
self.apply(_basic_init)
|
||||
# Initialize pos_embed by sin-cos embedding
|
||||
x_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
|
||||
mm_cond_lens=OrderedDict([
|
||||
('timestep', 1),
|
||||
('ctrl_freq', 1),
|
||||
('state', 1),
|
||||
('action', self.horizon),
|
||||
]))
|
||||
self.x_pos_embed.data.copy_(torch.from_numpy(x_pos_embed).float().unsqueeze(0))
|
||||
if self.lang_pos_embed_config is None:
|
||||
lang_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.max_lang_cond_len))
|
||||
else:
|
||||
lang_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size, mm_cond_lens=OrderedDict(self.lang_pos_embed_config), embed_modality=False)
|
||||
self.lang_cond_pos_embed.data.copy_(torch.from_numpy(lang_cond_pos_embed).float().unsqueeze(0))
|
||||
if self.img_pos_embed_config is None:
|
||||
img_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.img_cond_len))
|
||||
else:
|
||||
img_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size, mm_cond_lens=OrderedDict(self.img_pos_embed_config), embed_modality=False)
|
||||
self.img_cond_pos_embed.data.copy_(torch.from_numpy(img_cond_pos_embed).float().unsqueeze(0))
|
||||
# Initialize timestep and control freq embedding MLP
|
||||
nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
|
||||
nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
|
||||
nn.init.normal_(self.freq_embedder.mlp[0].weight, std=0.02)
|
||||
nn.init.normal_(self.freq_embedder.mlp[2].weight, std=0.02)
|
||||
# Initialize the final layer: zero-out the final linear layer
|
||||
nn.init.constant_(self.final_layer.ffn_final.fc2.weight, 0)
|
||||
nn.init.constant_(self.final_layer.ffn_final.fc2.bias, 0)
|
||||
# Move all the params to given data type:
|
||||
self.to(self.dtype)
|
||||
def forward(self, x, freq, t, lang_c, img_c, lang_mask=None, img_mask=None):
|
||||
t = self.t_embedder(t).unsqueeze(1) # (B, 1, D) or (1, 1, D)
|
||||
freq = self.freq_embedder(freq).unsqueeze(1) # (B, 1, D)
|
||||
# Append timestep to the input tokens
|
||||
if t.shape[0] == 1:
|
||||
t = t.expand(x.shape[0], -1, -1)
|
||||
x = torch.cat([t, freq, x], dim=1) # (B, T+1, D)
|
||||
# Add multimodal position embeddings
|
||||
x = x + self.x_pos_embed
|
||||
# Note the lang is of variable length
|
||||
lang_c = lang_c + self.lang_cond_pos_embed[:, :lang_c.shape[1]]
|
||||
img_c = img_c + self.img_cond_pos_embed
|
||||
# Forward pass
|
||||
conds = [lang_c, img_c]
|
||||
masks = [lang_mask, img_mask]
|
||||
for i, block in enumerate(self.blocks):
|
||||
c, mask = conds[i % 2], masks[i % 2]
|
||||
x = block(x, c, mask) # (B, T+1, D)
|
||||
# Inject the language condition at the final layer
|
||||
x = self.final_layer(x) # (B, T+1, out_channels)
|
||||
# Only preserve the action tokens
|
||||
x = x[:, -self.horizon:]
|
||||
return x
|
||||
|
||||
class RDTRunner_Dump(nn.Module,
|
||||
CompatiblePyTorchModelHubMixin,
|
||||
repo_url="https://huggingface.co/robotics-diffusion-transformer/rdt-1b"):
|
||||
def __init__(self,
|
||||
*,
|
||||
action_dim,
|
||||
pred_horizon,
|
||||
config,
|
||||
lang_token_dim,
|
||||
img_token_dim,
|
||||
state_token_dim,
|
||||
max_lang_cond_len,
|
||||
img_cond_len,
|
||||
lang_pos_embed_config=None,
|
||||
img_pos_embed_config=None,
|
||||
dtype=torch.bfloat16):
|
||||
super(RDTRunner_Dump, self).__init__()
|
||||
# Create diffusion model
|
||||
hidden_size = config['rdt']['hidden_size']
|
||||
self.model = RDT_Dump(
|
||||
output_dim=action_dim,
|
||||
horizon=pred_horizon,
|
||||
hidden_size=hidden_size,
|
||||
depth=config['rdt']['depth'],
|
||||
num_heads=config['rdt']['num_heads'],
|
||||
max_lang_cond_len=max_lang_cond_len,
|
||||
img_cond_len=img_cond_len,
|
||||
lang_pos_embed_config=lang_pos_embed_config,
|
||||
img_pos_embed_config=img_pos_embed_config,
|
||||
dtype=dtype,
|
||||
)
|
||||
# Create adpators for various conditional inputs
|
||||
self.lang_adaptor = self.build_condition_adapter(config['lang_adaptor'], in_features=lang_token_dim, out_features=hidden_size)
|
||||
self.img_adaptor = self.build_condition_adapter(config['img_adaptor'], in_features=img_token_dim, out_features=hidden_size)
|
||||
# A `state` refers to an action or a proprioception vector
|
||||
self.state_adaptor = self.build_condition_adapter(
|
||||
config['state_adaptor'],
|
||||
in_features=state_token_dim * 2, # state + state mask (indicator)
|
||||
out_features=hidden_size)
|
||||
# Create the noise scheduler
|
||||
noise_scheduler_config = config['noise_scheduler']
|
||||
self.noise_scheduler = DDPMScheduler(
|
||||
num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
|
||||
beta_schedule=noise_scheduler_config['beta_schedule'],
|
||||
prediction_type=noise_scheduler_config['prediction_type'],
|
||||
clip_sample=noise_scheduler_config['clip_sample'],
|
||||
)
|
||||
self.noise_scheduler_sample = DPMSolverMultistepScheduler(
|
||||
num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
|
||||
beta_schedule=noise_scheduler_config['beta_schedule'],
|
||||
prediction_type=noise_scheduler_config['prediction_type'],
|
||||
)
|
||||
self.num_train_timesteps = noise_scheduler_config['num_train_timesteps']
|
||||
self.num_inference_timesteps = noise_scheduler_config['num_inference_timesteps']
|
||||
self.prediction_type = noise_scheduler_config['prediction_type']
|
||||
self.pred_horizon = pred_horizon
|
||||
self.action_dim = action_dim
|
||||
print("Diffusion params: %e" %
|
||||
sum([p.numel() for p in self.model.parameters()] + [p.numel() for p in self.lang_adaptor.parameters()] +
|
||||
[p.numel()
|
||||
for p in self.img_adaptor.parameters()] + [p.numel() for p in self.state_adaptor.parameters()]))
|
||||
def build_condition_adapter(self, projector_type, in_features, out_features):
|
||||
projector = None
|
||||
if projector_type == 'linear':
|
||||
projector = nn.Linear(in_features, out_features)
|
||||
else:
|
||||
mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
|
||||
if mlp_gelu_match:
|
||||
mlp_depth = int(mlp_gelu_match.group(1))
|
||||
modules = [nn.Linear(in_features, out_features)]
|
||||
for _ in range(1, mlp_depth):
|
||||
modules.append(nn.GELU(approximate="tanh"))
|
||||
modules.append(nn.Linear(out_features, out_features))
|
||||
projector = nn.Sequential(*modules)
|
||||
if projector is None:
|
||||
raise ValueError(f'Unknown projector type: {projector_type}')
|
||||
return projector
|
||||
def adapt_conditions(self, lang_tokens, img_tokens, state_tokens):
|
||||
adpated_lang = self.lang_adaptor(lang_tokens)
|
||||
dump_img_adaptor(img_tokens)
|
||||
adpated_img = self.img_adaptor(img_tokens)
|
||||
adpated_state = self.state_adaptor(state_tokens)
|
||||
return adpated_lang, adpated_img, adpated_state
|
||||
    def conditional_sample(self, lang_cond, lang_attn_mask, img_cond, state_traj, action_mask, ctrl_freqs):
        """Run the reverse-diffusion sampler to generate an action trajectory.

        Starts from Gaussian noise and iteratively denoises it with the DiT
        model under the DPM-Solver multistep scheduler, dumping each step's
        inputs via `dump_dit` for calibration-data export.

        Args:
            lang_cond: Adapted language condition tokens.
            lang_attn_mask: Bool mask of valid language tokens.
            img_cond: Adapted image condition tokens.
            state_traj: Adapted state trajectory; also fixes device/dtype.
            action_mask: 0-1 float mask; expanded over the prediction horizon.
            ctrl_freqs: Per-sample control frequency.

        Returns:
            (batch, pred_horizon, action_dim) denoised action trajectory with
            invalid action dimensions zeroed by the mask.
        """
        device = state_traj.device
        dtype = state_traj.dtype
        # Initial sample: pure Gaussian noise over the whole action chunk.
        noisy_action = torch.randn(size=(state_traj.shape[0], self.pred_horizon, self.action_dim), dtype=dtype, device=device)
        action_mask = action_mask.expand(-1, self.pred_horizon, -1)
        # Set step values
        self.noise_scheduler_sample.set_timesteps(self.num_inference_timesteps)
        for t in self.noise_scheduler_sample.timesteps:
            # Prepare state-action trajectory: append the mask along the feature
            # axis, project through the state adaptor, then prefix the state.
            action_traj = torch.cat([noisy_action, action_mask], dim=2)
            action_traj = self.state_adaptor(action_traj)
            state_action_traj = torch.cat([state_traj, action_traj], dim=1)
            # Dump hook: record this step's DiT inputs for calibration export.
            dump_dit(state_action_traj, ctrl_freqs, t, lang_cond, img_cond, lang_attn_mask)
            # Predict the model output
            model_output = self.model(state_action_traj,
                                      ctrl_freqs,
                                      t.unsqueeze(-1).to(device),
                                      lang_cond,
                                      img_cond,
                                      lang_mask=lang_attn_mask)
            # Compute previous actions: x_t -> x_t-1
            noisy_action = self.noise_scheduler_sample.step(model_output, t, noisy_action).prev_sample
            # Keep the running sample in the same dtype as the state trajectory.
            noisy_action = noisy_action.to(state_traj.dtype)
        # Finally apply the action mask to mask invalid action dimensions
        noisy_action = noisy_action * action_mask
        return noisy_action
|
||||
def compute_loss(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_gt, action_mask,
|
||||
ctrl_freqs) -> torch.Tensor:
|
||||
'''
|
||||
lang_tokens: (batch_size, lang_len, lang_token_dim)
|
||||
lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
|
||||
which should be True-False bool tensor.
|
||||
img_tokens: (batch_size, img_len, img_token_dim)
|
||||
state_tokens: (batch_size, 1, state_token_dim)
|
||||
action_gt: (batch_size, horizon, state_token_dim), ground-truth actions for supervision
|
||||
action_mask: (batch_size, 1, state_token_dim), a 0-1 **float** tensor.
|
||||
ctrl_freqs: (batch_size,), control frequency for each sample.
|
||||
|
||||
return: loss_value, a scalar tensor
|
||||
'''
|
||||
batch_size = lang_tokens.shape[0]
|
||||
device = lang_tokens.device
|
||||
# Sample noise that we'll add to the actions
|
||||
noise = torch.randn(action_gt.shape, dtype=action_gt.dtype, device=device)
|
||||
# Sample random diffusion timesteps
|
||||
timesteps = torch.randint(0, self.num_train_timesteps, (batch_size, ), device=device).long()
|
||||
# Add noise to the clean actions according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_action = self.noise_scheduler.add_noise(action_gt, noise, timesteps)
|
||||
# Concatenate the state and action tokens to form the input sequence
|
||||
state_action_traj = torch.cat([state_tokens, noisy_action], dim=1)
|
||||
# Append the action mask to the input sequence
|
||||
action_mask = action_mask.expand(-1, state_action_traj.shape[1], -1)
|
||||
state_action_traj = torch.cat([state_action_traj, action_mask], dim=2)
|
||||
# Align the dimension with the hidden size
|
||||
lang_cond, img_cond, state_action_traj = self.adapt_conditions(lang_tokens, img_tokens, state_action_traj)
|
||||
# Predict the denoised result
|
||||
pred = self.model(state_action_traj, ctrl_freqs, timesteps, lang_cond, img_cond, lang_mask=lang_attn_mask)
|
||||
pred_type = self.prediction_type
|
||||
if pred_type == 'epsilon':
|
||||
target = noise
|
||||
elif pred_type == 'sample':
|
||||
target = action_gt
|
||||
else:
|
||||
raise ValueError(f"Unsupported prediction type {pred_type}")
|
||||
loss = F.mse_loss(pred, target)
|
||||
return loss
|
||||
def predict_action(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_mask, ctrl_freqs):
|
||||
'''
|
||||
lang_tokens: (batch_size, lang_len, lang_token_dim)
|
||||
lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
|
||||
which should be True-False bool tensor.
|
||||
img_tokens: (batch_size, img_len, img_token_dim)
|
||||
state_tokens: (batch_size, 1, state_token_dim)
|
||||
action_mask: (batch_size, 1, action_dim),
|
||||
which should be a 0-1 **float** tensor.
|
||||
ctrl_freqs: (batch_size,), control frequency for each sample.
|
||||
|
||||
return: (batch_size, horizon, action_dim), predicted action sequence
|
||||
'''
|
||||
# Prepare the state and conditions
|
||||
state_tokens = torch.cat([state_tokens, action_mask], dim=2)
|
||||
lang_cond, img_cond, state_traj = self.adapt_conditions(lang_tokens, img_tokens, state_tokens)
|
||||
# Run sampling
|
||||
action_pred = self.conditional_sample(
|
||||
lang_cond,
|
||||
lang_attn_mask,
|
||||
img_cond,
|
||||
state_traj,
|
||||
action_mask,
|
||||
ctrl_freqs,
|
||||
)
|
||||
return action_pred
|
||||
def forward(self, *args, **kwargs) -> torch.Tensor:
|
||||
return self.compute_loss(*args, **kwargs)
|
||||
|
||||
class RoboticDiffusionTransformerModel_Dump(object):
    """Wrapper around the RDT dump policy used to generate calibration data.

    Bundles the SigLIP vision encoder, the diffusion policy and the image
    pre-/post-processing needed to run one inference step; the wrapped
    ``RDTRunner_Dump`` dumps intermediate tensors for quantization calibration.
    """

    def __init__(
        self,
        args,
        device="cuda",
        dtype=torch.bfloat16,
        image_size=None,
        control_frequency=25,
        pretrained=None,
        pretrained_vision_encoder_name_or_path=None,
    ):
        """
        Args:
            args: Full config dict with ``common`` / ``model`` / ``dataset`` sections.
            device: Device to place the policy and encoders on.
            dtype: Weight/activation dtype used for inference.
            image_size: Optional size to resize input images to before padding.
            control_frequency: Robot control frequency passed to the policy.
            pretrained: Checkpoint file path or pretrained model identifier.
            pretrained_vision_encoder_name_or_path: SigLIP vision tower path.
        """
        self.args = args
        self.dtype = dtype
        self.image_size = image_size
        self.device = device
        self.control_frequency = control_frequency
        # We do not use the text encoder due to limited GPU memory
        # self.text_tokenizer, self.text_model = self.get_text_encoder(pretrained_text_encoder_name_or_path)
        self.image_processor, self.vision_model = self.get_vision_encoder(pretrained_vision_encoder_name_or_path)
        self.policy = self.get_policy(pretrained)

        self.reset()

    def get_policy(self, pretrained):
        """Construct the RDT policy, or load it from a pretrained identifier."""
        # Initialize model with arguments
        if pretrained is None or os.path.isfile(pretrained):
            # Total image-condition tokens: history x cameras x patches-per-image.
            img_cond_len = (self.args["common"]["img_history_size"] * self.args["common"]["num_cameras"] *
                            self.vision_model.num_patches)

            _model = RDTRunner_Dump(
                action_dim=self.args["common"]["state_dim"],
                pred_horizon=self.args["common"]["action_chunk_size"],
                config=self.args["model"],
                lang_token_dim=self.args["model"]["lang_token_dim"],
                img_token_dim=self.args["model"]["img_token_dim"],
                state_token_dim=self.args["model"]["state_token_dim"],
                max_lang_cond_len=self.args["dataset"]["tokenizer_max_length"],
                img_cond_len=img_cond_len,
                img_pos_embed_config=[
                    # No initial pos embed in the last grid size
                    # since we've already done in ViT
                    (
                        "image",
                        (
                            self.args["common"]["img_history_size"],
                            self.args["common"]["num_cameras"],
                            -self.vision_model.num_patches,
                        ),
                    ),
                ],
                lang_pos_embed_config=[
                    # Similarly, no initial pos embed for language
                    ("lang", -self.args["dataset"]["tokenizer_max_length"]),
                ],
                dtype=self.dtype,
            )
        else:
            _model = RDTRunner_Dump.from_pretrained(pretrained)

        return _model

    def get_text_encoder(self, pretrained_text_encoder_name_or_path):
        """Build the T5 tokenizer/encoder (disabled by default, see __init__)."""
        text_embedder = T5Embedder(
            from_pretrained=pretrained_text_encoder_name_or_path,
            model_max_length=self.args["dataset"]["tokenizer_max_length"],
            device=self.device,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
        return tokenizer, text_encoder

    def get_vision_encoder(self, pretrained_vision_encoder_name_or_path):
        """Build the SigLIP vision tower and return (image_processor, encoder)."""
        vision_encoder = SiglipVisionTower(vision_tower=pretrained_vision_encoder_name_or_path, args=None)
        image_processor = vision_encoder.image_processor
        return image_processor, vision_encoder

    def reset(self):
        """Put all sub-models in eval mode and move them to the target device/dtype."""
        device = self.device
        weight_dtype = self.dtype
        self.policy.eval()
        # self.text_model.eval()
        self.vision_model.eval()
        self.policy = self.policy.to(device, dtype=weight_dtype)
        # self.text_model = self.text_model.to(device, dtype=weight_dtype)
        self.vision_model = self.vision_model.to(device, dtype=weight_dtype)

    def load_pretrained_weights(self, pretrained=None):
        """Load policy weights from a ``.pt`` or ``.safetensors`` checkpoint.

        Raises:
            NotImplementedError: For unrecognized checkpoint formats.
        """
        if pretrained is None:
            return
        print(f"Loading weights from {pretrained}")
        filename = os.path.basename(pretrained)
        if filename.endswith(".pt"):
            # map_location="cpu" so a GPU-saved checkpoint loads on any host;
            # reset() moves the policy to the target device afterwards.
            checkpoint = torch.load(pretrained, map_location="cpu")
            self.policy.load_state_dict(checkpoint["module"])
        elif filename.endswith(".safetensors"):
            from safetensors.torch import load_model
            load_model(self.policy, pretrained)
        else:
            raise NotImplementedError(f"Unknown checkpoint format: {pretrained}")

    def encode_instruction(self, instruction, device="cuda"):
        """Encode a text instruction into language embeddings.

        NOTE(review): relies on self.text_tokenizer / self.text_model, which
        __init__ leaves unset (the text encoder is disabled there) — calling
        this without enabling get_text_encoder() would fail. TODO confirm.
        """
        tokens = self.text_tokenizer(instruction, return_tensors="pt", padding="longest",
                                     truncation=True)["input_ids"].to(device)
        tokens = tokens.view(1, -1)
        with torch.no_grad():
            pred = self.text_model(tokens).last_hidden_state.detach()
        return pred

    def _format_joint_to_state(self, joints):
        """Embed raw joint values into the unified state vector.

        Returns:
            (state, state_elem_mask): state is (B, N, state_token_dim); the mask
            flags which unified-state dimensions carry real data.
        """
        # Rescale the gripper to the range of [0, 1]
        joints = joints / torch.tensor(
            [[[180, 180, 180, 180, 180, 180]]],
            device=joints.device,
            dtype=joints.dtype,
        )
        B, N, _ = joints.shape
        state = torch.zeros(
            (B, N, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        # Fill into the unified state vector
        state[:, :, AGILEX_STATE_INDICES] = joints
        # Assemble the mask indicating each dimension's availability
        state_elem_mask = torch.zeros(
            (B, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        state_elem_mask[:, AGILEX_STATE_INDICES] = 1
        return state, state_elem_mask

    def _unformat_action_to_joint(self, action):
        """Extract joint values from the unified action vector and rescale them."""
        action_indices = AGILEX_STATE_INDICES
        joints = action[:, :, action_indices]
        # Rescale the gripper back to the action range
        # Note that the action range and proprioception range are different
        # for Mobile ALOHA robot
        joints = joints * torch.tensor(
            [[[180, 180, 180, 180, 180, 180]]],
            device=joints.device,
            dtype=joints.dtype,
        )
        return joints

    @torch.no_grad()
    def step(self, proprio, images, text_embeds):
        """Run one inference step: preprocess images, encode, predict actions.

        Args:
            proprio: Proprioception tensor; unsqueezed to (1, 1, n_joints).
            images: Iterable of PIL images / numpy arrays / None (missing
                cameras), ordered as expected by the policy; None entries are
                replaced with a background-colored frame.
            text_embeds: Pre-computed language embeddings.

        Returns:
            (1, horizon, n_joints) predicted joint trajectory as float32.
        """
        device = self.device
        dtype = self.dtype
        # The background image used for padding
        background_color = np.array([int(x * 255) for x in self.image_processor.image_mean], dtype=np.uint8).reshape(1, 1, 3)
        background_image = (np.ones(
            (
                self.image_processor.size["height"],
                self.image_processor.size["width"],
                3,
            ),
            dtype=np.uint8,
        ) * background_color)
        # Preprocess the images by order and encode them
        image_tensor_list = []
        for image in images:
            if image is None:
                # Replace it with the background image
                image = PImage.fromarray(background_image)
            else:
                # Convert numpy array to PIL Image if needed
                if isinstance(image, np.ndarray):
                    image = PImage.fromarray(image)
                if self.image_size is not None:
                    # BUGFIX: was `self.data_args.image_size` — `data_args` is
                    # never set on this class, so any resize request raised
                    # AttributeError. Use the stored image_size instead.
                    image = transforms.Resize(self.image_size)(image)
                if self.args["dataset"].get("auto_adjust_image_brightness", False):
                    pixel_values = list(image.getdata())
                    average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
                    if average_brightness <= 0.15:
                        image = transforms.ColorJitter(brightness=(1.75, 1.75))(image)
            if self.args["dataset"].get("image_aspect_ratio", "pad") == "pad":

                def expand2square(pil_img, background_color):
                    # Pad the shorter side so the image becomes square.
                    width, height = pil_img.size
                    if width == height:
                        return pil_img
                    elif width > height:
                        result = PImage.new(pil_img.mode, (width, width), background_color)
                        result.paste(pil_img, (0, (width - height) // 2))
                        return result
                    else:
                        result = PImage.new(pil_img.mode, (height, height), background_color)
                        result.paste(pil_img, ((height - width) // 2, 0))
                        return result

                image = expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean))
            image = self.image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
            image_tensor_list.append(image)
        image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype)
        image_embeds = self.vision_model(image_tensor).detach()
        image_embeds = image_embeds.reshape(-1, self.vision_model.hidden_size).unsqueeze(0)
        # Prepare the proprioception states and the control frequency.
        # assumes proprio is (1, n_joints) -> (1, 1, n_joints); the original
        # comment said 14 joints, but _format_joint_to_state divides by a
        # 6-element vector — TODO confirm the expected joint count.
        joints = proprio.to(device).unsqueeze(0)
        states, state_elem_mask = self._format_joint_to_state(joints)  # (1, 1, 128), (1, 128)
        states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to(device, dtype=dtype)
        states = states[:, -1:, :]  # keep only the most recent state (1, 1, 128)
        ctrl_freqs = torch.tensor([self.control_frequency]).to(device)
        text_embeds = text_embeds.to(device, dtype=dtype)
        # Predict the next action chunk given the inputs
        trajectory = self.policy.predict_action(
            lang_tokens=text_embeds,
            lang_attn_mask=torch.ones(text_embeds.shape[:2], dtype=torch.bool, device=text_embeds.device),
            img_tokens=image_embeds,
            state_tokens=states,
            action_mask=state_elem_mask.unsqueeze(1),
            ctrl_freqs=ctrl_freqs,
        )
        trajectory = self._unformat_action_to_joint(trajectory).to(torch.float32)
        return trajectory
|
||||
|
||||
def get_training_samples(data_dirs, num_samples=5, instructions_per_episode=1):
    """Sample random (images, action, instruction) tuples from HDF5 episodes.

    Args:
        data_dirs: A single directory path (str) or a list of directory paths.
        num_samples: Total number of samples to generate across all directories.
        instructions_per_episode: Number of instructions per episode.

    Returns:
        A list of sample dicts with keys 'multi_cam_images', 'joints',
        'lang_embed' (may be None if no embedding file exists), 'lang_str',
        'source' and 'step'.
    """
    training_samples = []

    # Handle both single directory and list of directories.
    if isinstance(data_dirs, str):
        data_dirs = [data_dirs]

    logger.info(f"Get Training Data From: {len(data_dirs)} dataset(s).")

    # First, collect all available episode files from all directories.
    episode_files = []
    for data_dir in data_dirs:
        if not os.path.isdir(data_dir):
            logger.warning(f"Directory not found: {data_dir}, skipping")
            continue
        for root, _dirs, files in os.walk(data_dir):
            for file in files:
                if file.endswith('.hdf5'):
                    episode_files.append(os.path.join(root, file))

    if len(episode_files) == 0:
        logger.warning("No episode files found in the provided directories")
        return training_samples

    logger.info(f"Found {len(episode_files)} episode files across all datasets.")

    # Generate samples by randomly selecting from episodes. ROBUSTNESS FIX: the
    # original loop never terminated if every episode read failed; cap the
    # total number of attempts so unreadable datasets cannot spin forever.
    max_attempts = max(num_samples * 10, 100)
    attempts = 0
    while len(training_samples) < num_samples and attempts < max_attempts:
        attempts += 1
        # Randomly select an episode file.
        file_path = np.random.choice(episode_files)
        try:
            with h5py.File(file_path, 'r') as f:
                observations = f['observations']
                actions = f['action'][:]
                images = observations['images']
                qpos = observations['qpos'][:]
                num_steps = len(qpos)
                # The image condition needs left/center/right cameras plus their
                # corresponding history frames (4374 tokens), so require >1 step.
                if num_steps > 1:
                    lang_step_idx = int(np.random.randint(0, max(instructions_per_episode, 1)))
                    instructions_dir = os.path.join(os.path.dirname(file_path), "instructions")

                    lang_embed, lang_str = None, None
                    # Language embedding (optional).
                    lang_embed_path = os.path.join(instructions_dir, f"lang_embed_{lang_step_idx}.pt")
                    if os.path.exists(lang_embed_path):
                        try:
                            lang_embed = torch.load(lang_embed_path, map_location="cpu")
                        except Exception as e:
                            logger.error(f"Error reading {lang_embed_path}: {e}")

                    # Language string (optional).
                    lang_str_path = os.path.join(instructions_dir, f"txt_lang_embed_{lang_step_idx}.txt")
                    if os.path.exists(lang_str_path):
                        try:
                            with open(lang_str_path, "r", encoding="utf-8") as tf:
                                lang_str = tf.read().strip()
                        except Exception as e:
                            logger.error(f"Error reading {lang_str_path}: {e}")
                    lang_str = lang_str or ""

                    # Gather multi-camera, multi-history-frame images.
                    step_idx = np.random.randint(0, num_steps)
                    multi_cam_images = {}
                    # Decode one reference frame to learn the camera resolution.
                    ref_frame = images['cam_high'][0]
                    ref_img = cv2.imdecode(np.frombuffer(ref_frame, np.uint8), cv2.IMREAD_COLOR)
                    IMG_HEIGHT, IMG_WIDTH = ref_img.shape[:2]
                    ground_image = np.zeros((IMG_HEIGHT, IMG_WIDTH, 3), dtype=np.uint8)
                    for cam_name in ['cam_high', 'cam_left_wrist', 'cam_right_wrist']:
                        cam_images = []
                        if cam_name in images:
                            # Decode up to 2 history frames for this camera.
                            for i in range(max(step_idx - 1, 0), step_idx + 1):
                                img_bits = images[cam_name][i]
                                cam_images.append(cv2.imdecode(np.frombuffer(img_bits, np.uint8), cv2.IMREAD_COLOR))
                        else:
                            # Missing camera: substitute black frames of the same size.
                            for _ in range(max(step_idx - 1, 0), step_idx + 1):
                                cam_images.append(ground_image)
                        if len(cam_images) < 2:
                            # The first step has no history; duplicate the single frame.
                            cam_images = [cam_images[0]] * 2
                        multi_cam_images[cam_name] = cam_images

                    training_samples.append({
                        'multi_cam_images': multi_cam_images,
                        'joints': actions[step_idx],
                        'lang_embed': lang_embed,
                        'lang_str': lang_str,
                        'source': file_path,
                        'step': step_idx
                    })
                    logger.debug(f"TimeStep: {step_idx}, Sample: {file_path}")
        except Exception as e:
            logger.error(f"Failed: {file_path} : {e}")
            continue
    if len(training_samples) < num_samples:
        logger.warning(f"Only collected {len(training_samples)}/{num_samples} samples after {attempts} attempts.")
    logger.info(f"Total Num: {len(training_samples)}.")
    return training_samples
|
||||
|
||||
|
||||
def main(config_path):
    """Export the RDT policy sub-modules to ONNX and dump calibration data.

    Reads the JSON export config at ``config_path``, prepares the output
    workspace, generates calibration data by running the dump model over
    randomly drawn training episodes, then exports the image adaptor, DiT,
    state adaptor (two input shapes) and language adaptor to ONNX.

    Raises:
        ValueError: If the configured model type is not ``170M`` or ``1B``.
    """
    import shutil  # local import keeps the file-level import block untouched

    with open(config_path, "r") as f:
        cfg = json.load(f)

    export_info = cfg.get("export", {})

    opt = ExportConfig(
        task_id=cfg.get("task_id"),
        output_path=os.path.join(export_info.get("output_path", "."), cfg.get("task_id", "")),
        model_path=export_info.get("model_path"),
        calibration_num=export_info.get("calibration_num", 100),
        dataset_path=export_info.get("dataset_path"),
        gpu_id=cfg.get("gpu_id", "0"),
        march=export_info.get("march"),
        model_type=export_info.get("model_type"),
        pretrained_vision_encoder_name_or_path="/home/qi.xiong/DualArm/Work_Docker/RDT/weights/siglip-so400m-patch14-384",
        ctrl_freq=export_info.get("ctrl_freq", 25),
        cal_data_device=cfg.get("cal_data_device", "cuda"),
        lang_calibration_num=export_info.get("lang_calibration_num", 1)
    )

    if opt.model_type not in ["170M", "1B"]:
        raise ValueError(f"RDT ONLY SUPPORT 170M AND 1B, BUT GOT {opt.model_type}")

    logger.info(f"Export config loaded: {opt}")
    os.makedirs(opt.output_path, exist_ok=True)

    # Prepare the output workspace
    ## BPU_RDT_Policy
    bpu_rdt_name = "BPU_RDT_Policy_170M" if opt.model_type == "170M" else "BPU_RDT_Policy_1B"
    bpu_rdt_path = os.path.join(opt.output_path, bpu_rdt_name)
    os.makedirs(bpu_rdt_path, exist_ok=True)
    rdt_config_path = os.path.join(bpu_rdt_path, "base.yaml")
    # shutil.copy raises on failure, unlike the silent `os.system("cp ...")`
    shutil.copy(f"configs/base_{opt.model_type}.yaml", rdt_config_path)
    ## Test_Datas
    test_data_name = "test_data"
    test_data_path = os.path.join(opt.output_path, test_data_name)
    os.makedirs(test_data_path, exist_ok=True)
    ## instruction workspace: one sub-directory per dataset
    instruction_ws_name = "instructions"
    instruction_ws_path = os.path.join(opt.output_path, instruction_ws_name)
    os.makedirs(instruction_ws_path, exist_ok=True)
    for name in os.listdir(opt.dataset_path):
        os.makedirs(os.path.join(instruction_ws_path, name), exist_ok=True)

    ## image adaptor (the global calibration dir is read by the dump hooks)
    global img_adaptor_cal_ws
    img_adaptor_ws_name = "img_adaptor_WorkSpace"
    img_adaptor_cal_name = "rdt_image_adaptor_calibration"
    img_adaptor_name = "rdt_image_adaptor.onnx"
    img_adaptor_ws = os.path.join(opt.output_path, img_adaptor_ws_name)
    img_adaptor_path = os.path.join(img_adaptor_ws, img_adaptor_name)
    img_adaptor_cal_ws = os.path.join(img_adaptor_ws, img_adaptor_cal_name)
    os.makedirs(img_adaptor_ws, exist_ok=True)
    os.makedirs(img_adaptor_cal_ws, exist_ok=True)

    ## state (action) adaptor — exported twice with different input shapes
    state_adaptor_name1 = "rdt_state_adaptor_1x1x256.onnx"
    state_adaptor_path1 = os.path.join(opt.output_path, bpu_rdt_name, state_adaptor_name1)
    state_adaptor_name2 = "rdt_state_adaptor_1x64x256.onnx"
    state_adaptor_path2 = os.path.join(opt.output_path, bpu_rdt_name, state_adaptor_name2)

    ## lang adaptor
    lang_adaptor_name = "rdt_lang_adaptor.onnx"
    lang_adaptor_path = os.path.join(opt.output_path, bpu_rdt_name, lang_adaptor_name)

    ## DiT Policy
    dit_ws_name = "DiT_WorkSpace"
    dit_cal_name = "rdt_dit_calibration"
    dit_name = "rdt_dit.onnx"
    dit_ws = os.path.join(opt.output_path, dit_ws_name)
    dit_path = os.path.join(dit_ws, dit_name)
    dit_cal_ws = os.path.join(dit_ws, dit_cal_name)
    os.makedirs(dit_ws, exist_ok=True)
    os.makedirs(dit_cal_ws, exist_ok=True)

    # One calibration sub-directory per DiT input; read by the dump hooks.
    global dit_cal_path_x, dit_cal_path_freq, dit_cal_path_t, dit_cal_path_lang_c, dit_cal_path_img_c, dit_cal_path_lang_mask
    dit_cal_path_x = os.path.join(dit_cal_ws, "x")
    os.makedirs(dit_cal_path_x, exist_ok=True)
    dit_cal_path_freq = os.path.join(dit_cal_ws, "freq")
    os.makedirs(dit_cal_path_freq, exist_ok=True)
    dit_cal_path_t = os.path.join(dit_cal_ws, "t")
    os.makedirs(dit_cal_path_t, exist_ok=True)
    dit_cal_path_lang_c = os.path.join(dit_cal_ws, "lang_c")
    os.makedirs(dit_cal_path_lang_c, exist_ok=True)
    dit_cal_path_img_c = os.path.join(dit_cal_ws, "img_c")
    os.makedirs(dit_cal_path_img_c, exist_ok=True)
    dit_cal_path_lang_mask = os.path.join(dit_cal_ws, "lang_mask")
    os.makedirs(dit_cal_path_lang_mask, exist_ok=True)

    # Prepare calibration data
    with open(rdt_config_path, "r") as f:
        rdt_config = yaml.safe_load(f)

    dump_model = create_dump_model(
        args=rdt_config,
        dtype=torch.float32,
        pretrained=opt.model_path,
        pretrained_vision_encoder_name_or_path=opt.pretrained_vision_encoder_name_or_path,
        control_frequency=opt.ctrl_freq,
        device=opt.cal_data_device
    )

    # Load training data from all datasets (globals are read by dump hooks)
    global dump_cnt, dump_dataset_name
    test_data_cnt = 0

    # Collect all dataset paths
    all_dataset_paths = []
    for dump_dataset_name in os.listdir(opt.dataset_path):
        dump_dataset_path = os.path.join(opt.dataset_path, dump_dataset_name)
        if os.path.isdir(dump_dataset_path):
            all_dataset_paths.append(dump_dataset_path)

    # Get training samples from all datasets together
    training_samples = get_training_samples(all_dataset_paths, num_samples=opt.calibration_num, instructions_per_episode=opt.lang_calibration_num)

    if len(training_samples) == 0:
        logger.warning("No training samples found, skipping calibration data generation")
    else:
        # Only process up to the number of samples we actually have
        num_samples_to_process = min(len(training_samples), opt.calibration_num)
        for dump_cnt in range(num_samples_to_process):
            sample = training_samples[dump_cnt]
            # Extract the dataset name from the sample's source path
            sample_source = sample['source']
            dump_dataset_name = os.path.basename(os.path.dirname(os.path.dirname(sample_source)))
            instruction_emb = {
                "lang_cond": sample["lang_embed"].float().cpu(),
                "lang_str": sample["lang_str"]
            }
            ins_str_name = sample["lang_str"].replace(" ", "_") + "__"
            torch.save(instruction_emb, os.path.join(instruction_ws_path, dump_dataset_name, f"{ins_str_name}.pt"))
            # Two history frames (t-1 and t) for each of the three cameras.
            image_arrs = [
                sample['multi_cam_images']['cam_high'][0],
                sample['multi_cam_images']['cam_right_wrist'][0],
                sample['multi_cam_images']['cam_left_wrist'][0],
                sample['multi_cam_images']['cam_high'][1],
                sample['multi_cam_images']['cam_right_wrist'][1],
                sample['multi_cam_images']['cam_left_wrist'][1],
            ]
            test_data_cnt += 1
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_high_0.npy"), sample['multi_cam_images']['cam_high'][0])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_right_wrist_0.npy"), sample['multi_cam_images']['cam_right_wrist'][0])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_left_wrist_0.npy"), sample['multi_cam_images']['cam_left_wrist'][0])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_high_1.npy"), sample['multi_cam_images']['cam_high'][1])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_right_wrist_1.npy"), sample['multi_cam_images']['cam_right_wrist'][1])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_left_wrist_1.npy"), sample['multi_cam_images']['cam_left_wrist'][1])
            images = [PImage.fromarray(arr) if arr is not None else None for arr in image_arrs]
            proprio = torch.from_numpy(sample['joints']).float().unsqueeze(0).to(opt.cal_data_device)
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_joints.npy"), sample['joints'])
            lang_embeddings = sample['lang_embed'].float().unsqueeze(0).to(opt.cal_data_device)
            torch.save(lang_embeddings, os.path.join(test_data_path, f"{test_data_cnt}_lang_embeddings.pt"))
            dump_model.reset()
            begin_time = time()
            actions = dump_model.step(proprio=proprio, images=images, text_embeds=lang_embeddings).squeeze(0).cpu().numpy()
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_actions.npy"), actions)
            logger.debug(f"Dump: Cost {(1000*(time() - begin_time)):.1f} ms, cnt: {dump_cnt}, name: {dump_dataset_name}")
    logger.info("End Generate Calibration Data.")
    del dump_model

    # Load RDT Policy: CPU model for ONNX export
    with open(rdt_config_path, "r") as f:
        rdt_config = yaml.safe_load(f)

    model = create_model(
        args=rdt_config,
        dtype=torch.float32,
        pretrained=opt.model_path,
        pretrained_vision_encoder_name_or_path=opt.pretrained_vision_encoder_name_or_path,
        control_frequency=opt.ctrl_freq,
        device="cpu"
    )

    # image adaptor: ONNX model
    m = model.policy.img_adaptor
    m.eval()

    input_data = torch.randn(1, 4374, rdt_config['model']['img_token_dim'])  # assume a batch size of 1
    output = m(input_data)  # sanity forward pass before export

    torch.onnx.export(
        m,
        input_data,
        img_adaptor_path,
        opset_version=14,
        do_constant_folding=True,
        input_names=["img_tokens"],
        output_names=["adapted_img"],
        dynamic_axes=None,
        verbose=False
    )
    logger.info("Export RDT [img_adaptor] Model Success.")

    # DiT
    hidden_size = rdt_config['model']["rdt"]['hidden_size']

    m = model.policy.model
    m = m.eval().cpu()
    # Dummy inputs matching the DiT's runtime shapes (65 = 1 state + 64 actions).
    x = torch.randn(1, 65, hidden_size)
    freq = torch.tensor([1], dtype=torch.int32)
    t = torch.tensor([10], dtype=torch.int32)
    lang_c = torch.randn(1, 64, hidden_size)
    img_c = torch.randn(1, 4374, hidden_size)
    lang_mask = torch.ones(1, 64, dtype=torch.float32)
    dummy_inputs = (x, freq, t, lang_c, img_c, lang_mask)
    torch.onnx.export(
        m,
        dummy_inputs,
        dit_path,
        opset_version=14,
        do_constant_folding=True,
        input_names=["x", "freq", "t", "lang_c", "img_c", "lang_mask"],
        output_names=["actions"],
        verbose=False
    )

    logger.info("Export RDT [DiT] Model Success.")

    # state adaptor
    m = model.policy.state_adaptor
    m.eval()

    input_data = torch.randn(1, 1, 256)  # assume a batch size of 1
    output = m(input_data)  # sanity forward pass before export

    torch.onnx.export(
        m,
        input_data,
        state_adaptor_path1,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=["state_tokens"],
        output_names=["state_traj"],
        dynamic_axes=None,
        verbose=False
    )

    # CONSISTENCY FIX: was `logging.info` while every other message uses `logger`.
    logger.info("Export RDT [state 1x1x256] Model Success.")

    input_data = torch.randn(1, 64, 256)
    output = m(input_data)  # sanity forward pass before export

    torch.onnx.export(
        m,
        input_data,
        state_adaptor_path2,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=['state_tokens'],
        output_names=['state_traj'],
        dynamic_axes=None,
        verbose=False
    )

    logger.info("Export RDT [state 1x64x256] Model Success.")

    # lang adaptor
    m = model.policy.lang_adaptor
    m.eval()

    input_data = torch.randn(1, 14, 4096)
    output = m(input_data)  # sanity forward pass before export

    torch.onnx.export(
        m,
        input_data,
        lang_adaptor_path,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=["text_embeds"],
        output_names=["lang_cond"],
        dynamic_axes={
            # Language length varies per instruction.
            "text_embeds": {1: "N"},
            "lang_cond": {1: "N"}
        },
        verbose=False
    )

    logger.info("Export RDT [lang adaptor] Model Success.")
|
||||
|
||||
######## Prepare Calibration Data
|
||||
|
||||
if __name__ == "__main__":
    # Allow overriding the config path on the command line; fall back to the
    # historical hard-coded default for backward compatibility.
    import sys
    default_config_path = "/home/qi.xiong/DualArm/Work_Docker/RDT/rdt-export/input/config.json"
    main(sys.argv[1] if len(sys.argv) > 1 else default_config_path)
    logger.info("All Models Have Been Exported Success.")
|
||||
|
||||
0
RDT/rdt-export/models/__init__.py
Normal file
0
RDT/rdt-export/models/__init__.py
Normal file
82
RDT/rdt-export/models/ema_model.py
Normal file
82
RDT/rdt-export/models/ema_model.py
Normal file
@ -0,0 +1,82 @@
|
||||
# Reference: DiffusionPolicy [https://github.com/real-stanford/diffusion_policy]
|
||||
|
||||
import torch
|
||||
from torch.nn.modules.batchnorm import _BatchNorm
|
||||
|
||||
|
||||
class EMAModel:
    """
    Exponential Moving Average of models weights.

    Holds a frozen snapshot of a model whose parameters track a training
    model's parameters with a warmup-scheduled decay factor.
    """

    def __init__(self, model, update_after_step=0, inv_gamma=1.0, power=2 / 3, min_value=0.0, max_value=0.9999):
        """
        @crowsonkb's notes on EMA Warmup:
        If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
        to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
        gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
        at 215.4k steps).
        Args:
            model: module used as the EMA snapshot. It is frozen in place
                (eval mode, requires_grad False); no copy is made.
            update_after_step (int): optimizer steps to wait before the EMA
                starts moving. Default: 0.
            inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
            power (float): Exponential factor of EMA warmup. Default: 2/3.
            min_value (float): The minimum EMA decay rate. Default: 0.
            max_value (float): The maximum EMA decay rate. Default: 0.9999.
        """
        # The snapshot is only ever written via in-place tensor ops in step().
        self.averaged_model = model
        self.averaged_model.eval()
        self.averaged_model.requires_grad_(False)

        self.update_after_step = update_after_step
        self.inv_gamma = inv_gamma
        self.power = power
        self.min_value = min_value
        self.max_value = max_value

        self.decay = 0.0
        self.optimization_step = 0

    def get_decay(self, optimization_step):
        """
        Compute the decay factor for the exponential moving average.

        Returns 0.0 until `update_after_step` has passed, then warms up as
        1 - (1 + step / inv_gamma) ** -power, clamped to [min_value, max_value].
        """
        step = max(0, optimization_step - self.update_after_step - 1)
        if step <= 0:
            return 0.0
        value = 1 - (1 + step / self.inv_gamma)**-self.power
        return max(self.min_value, min(value, self.max_value))

    @torch.no_grad()
    def step(self, new_model):
        """
        Blend `new_model`'s parameters into the EMA snapshot and advance
        the step counter.

        BatchNorm parameters and parameters with requires_grad=False are
        copied verbatim instead of being averaged.
        """
        self.decay = self.get_decay(self.optimization_step)

        # Walk both module trees in lockstep so module types (e.g. BatchNorm)
        # can be special-cased while each parameter is visited exactly once.
        for module, ema_module in zip(new_model.modules(), self.averaged_model.modules()):
            for param, ema_param in zip(module.parameters(recurse=False), ema_module.parameters(recurse=False)):
                # iterate over immediate parameters only.
                if isinstance(param, dict):
                    raise RuntimeError('Dict parameter not supported')

                if isinstance(module, _BatchNorm):
                    # skip batchnorms: copy rather than average
                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
                elif not param.requires_grad:
                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
                else:
                    # ema = decay * ema + (1 - decay) * param
                    ema_param.mul_(self.decay)
                    ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=1 - self.decay)

        self.optimization_step += 1
|
||||
75
RDT/rdt-export/models/hub_mixin.py
Normal file
75
RDT/rdt-export/models/hub_mixin.py
Normal file
@ -0,0 +1,75 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Union
|
||||
|
||||
from huggingface_hub import PyTorchModelHubMixin
|
||||
from huggingface_hub.constants import (PYTORCH_WEIGHTS_NAME, SAFETENSORS_SINGLE_FILE)
|
||||
from huggingface_hub.file_download import hf_hub_download
|
||||
from huggingface_hub.utils import EntryNotFoundError, is_torch_available
|
||||
|
||||
if is_torch_available():
|
||||
import torch # type: ignore
|
||||
|
||||
|
||||
class CompatiblePyTorchModelHubMixin(PyTorchModelHubMixin):
    """Mixin class to load Pytorch models from the Hub."""

    def _save_pretrained(self, save_directory: Path) -> None:
        """Save weights from a Pytorch model to a local directory."""
        # To bypass saving into safetensor by default
        model_to_save = self.module if hasattr(self, "module") else self  # type: ignore
        torch.save(model_to_save.state_dict(), save_directory / PYTORCH_WEIGHTS_NAME)

    @classmethod
    def _from_pretrained(
        cls,
        *,
        model_id: str,
        revision: Optional[str],
        cache_dir: Optional[Union[str, Path]],
        force_download: bool,
        proxies: Optional[Dict],
        resume_download: Optional[bool],
        local_files_only: bool,
        token: Union[str, bool, None],
        map_location: str = "cpu",
        strict: bool = False,
        **model_kwargs,
    ):
        """Load Pytorch pretrained weights and return the loaded model."""
        model = cls(**model_kwargs)

        if os.path.isdir(model_id):
            print("Loading weights from local directory")
            # Prefer the safetensors checkpoint; fall back to the pickle one.
            try:
                weights_path = os.path.join(model_id, SAFETENSORS_SINGLE_FILE)
                return cls._load_as_safetensor(model, weights_path, map_location, strict)
            except FileNotFoundError:
                weights_path = os.path.join(model_id, PYTORCH_WEIGHTS_NAME)
                return cls._load_as_pickle(model, weights_path, map_location, strict)

        # Remote repo: both filename attempts share the same download options.
        download_kwargs = dict(
            repo_id=model_id,
            revision=revision,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            token=token,
            local_files_only=local_files_only,
        )
        try:
            weights_path = hf_hub_download(filename=SAFETENSORS_SINGLE_FILE, **download_kwargs)
            return cls._load_as_safetensor(model, weights_path, map_location, strict)
        except EntryNotFoundError:
            weights_path = hf_hub_download(filename=PYTORCH_WEIGHTS_NAME, **download_kwargs)
            return cls._load_as_pickle(model, weights_path, map_location, strict)
|
||||
159
RDT/rdt-export/models/multimodal_encoder/clip_encoder.py
Normal file
159
RDT/rdt-export/models/multimodal_encoder/clip_encoder.py
Normal file
@ -0,0 +1,159 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
|
||||
|
||||
|
||||
class CLIPVisionTower(nn.Module):
    """Frozen CLIP vision backbone exposing features from a selected hidden layer."""

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        # Load immediately unless deferred; a deferred tower that will be
        # fine-tuned is still loaded up front.
        if (not delay_load) or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            # Keep only the config around until load_model() is called.
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the CLIP processor and frozen model; idempotent."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden layer and optionally drop the CLS token."""
        selected = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            # Drop the leading CLS token; keep patch tokens only.
            return selected[:, 1:]
        if self.select_feature == 'cls_patch':
            return selected
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor (or a list of per-image tensors) into features."""
        if type(images) is list:
            features = []
            for single in images:
                fwd = self.vision_tower(single.to(device=self.device, dtype=self.dtype).unsqueeze(0),
                                        output_hidden_states=True)
                features.append(self.feature_select(fwd).to(single.dtype))
            return features

        fwd = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                output_hidden_states=True)
        return self.feature_select(fwd).to(images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Fall back to the cached config when weights are not loaded yet.
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2
|
||||
|
||||
|
||||
class CLIPVisionTowerS2(CLIPVisionTower):
    """
    CLIP vision tower with S2 (Scaling on Scales) multi-scale features.

    Images are encoded at several resolutions (``s2_scales``) through the
    s2wrapper package; per-scale features are combined by
    ``multiscale_forward``, so ``hidden_size`` grows by ``len(s2_scales)``.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        # NOTE(review): super().__init__ may already call load_model() (when
        # delay_load is False), so self.image_processor exists before the
        # resize adjustment at the bottom of this method — confirm ordering
        # before refactoring.
        super().__init__(vision_tower, args, delay_load)

        # Comma-separated list of scales, e.g. '336,672,1008', sorted ascending.
        self.s2_scales = getattr(args, 's2_scales', '336,672,1008')
        self.s2_scales = list(map(int, self.s2_scales.split(',')))
        self.s2_scales.sort()
        # Smallest scale is the split (tile) size; largest is the working size.
        self.s2_split_size = self.s2_scales[0]
        self.s2_image_size = self.s2_scales[-1]

        try:
            from s2wrapper import forward as multiscale_forward
        except ImportError:
            raise ImportError(
                'Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git'
            )
        self.multiscale_forward = multiscale_forward

        # change resize/crop size in preprocessing to the largest image size in s2_scale
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.image_processor.size['shortest_edge'] = self.s2_image_size
            self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

    def load_model(self, device_map=None):
        """Load processor and frozen CLIP model, then force the S2 image sizes."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        # Preprocess at the largest S2 scale.
        self.image_processor.size['shortest_edge'] = self.s2_image_size
        self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

        self.is_loaded = True

    @torch.no_grad()
    def forward_feature(self, images):
        """Single-scale feature extraction; used as the per-scale S2 callback."""
        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                               output_hidden_states=True)
        image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Multi-scale encode of a batched tensor or a list of image tensors."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_feature = self.multiscale_forward(self.forward_feature,
                                                        image.unsqueeze(0),
                                                        img_sizes=self.s2_scales,
                                                        max_split_size=self.s2_split_size)
                image_features.append(image_feature)
        else:
            image_features = self.multiscale_forward(self.forward_feature,
                                                    images,
                                                    img_sizes=self.s2_scales,
                                                    max_split_size=self.s2_split_size)

        return image_features

    @property
    def hidden_size(self):
        # Per-scale features are concatenated along the channel dimension.
        return self.config.hidden_size * len(self.s2_scales)
|
||||
87
RDT/rdt-export/models/multimodal_encoder/dinov2_encoder.py
Normal file
87
RDT/rdt-export/models/multimodal_encoder/dinov2_encoder.py
Normal file
@ -0,0 +1,87 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import AutoConfig, AutoImageProcessor, AutoModel, Dinov2Model
|
||||
|
||||
|
||||
class DinoV2VisionTower(nn.Module):
    """Frozen DINOv2 backbone returning last-hidden-state features."""

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        # Load now unless deferred; a deferred tower marked for fine-tuning
        # is still loaded eagerly.
        if (not delay_load) or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            # Defer weight loading; keep the config for shape queries.
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the image processor and frozen backbone; idempotent."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)  # FIXME:

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Return patch-only or full token features from the last hidden state."""
        hidden = image_forward_outs.last_hidden_state
        if self.select_feature == 'patch':
            return hidden[:, 1:]  # (B, 1369, 1536)
        if self.select_feature == 'cls_patch':
            return hidden  # (B, 1, 1536)
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor (or a list of per-image tensors)."""
        if type(images) is list:
            features = []
            for single in images:
                fwd = self.vision_tower(single.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                features.append(self.feature_select(fwd).to(single.dtype))
            return features

        fwd = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
        return self.feature_select(fwd).to(images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Fall back to the cached config when weights are not loaded yet.
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2
|
||||
86
RDT/rdt-export/models/multimodal_encoder/siglip_encoder.py
Normal file
86
RDT/rdt-export/models/multimodal_encoder/siglip_encoder.py
Normal file
@ -0,0 +1,86 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import AutoConfig, SiglipImageProcessor, SiglipVisionModel
|
||||
|
||||
|
||||
class SiglipVisionTower(nn.Module):
    """SigLIP vision backbone returning token or pooled features."""

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        # Load now unless deferred; a deferred tower marked for fine-tuning
        # is still loaded eagerly.
        if (not delay_load) or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            # Defer weight loading; keep the config for shape queries.
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the SigLIP processor and model (eval mode); idempotent."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.eval()

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Return token features ('patch') or the pooled output ('cls_patch')."""
        if self.select_feature == 'patch':
            return image_forward_outs.last_hidden_state  # (B, 729, 1536)
        if self.select_feature == 'cls_patch':
            return image_forward_outs.pooler_output  # (B, 1, 1536)
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor (or a list of per-image tensors)."""
        if type(images) is list:
            features = []
            for single in images:
                fwd = self.vision_tower(single.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                features.append(self.feature_select(fwd).to(single.dtype))
            return features

        fwd = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
        return self.feature_select(fwd).to(images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Fall back to the cached config when weights are not loaded yet.
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2
|
||||
111
RDT/rdt-export/models/multimodal_encoder/t5_encoder.py
Normal file
111
RDT/rdt-export/models/multimodal_encoder/t5_encoder.py
Normal file
@ -0,0 +1,111 @@
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
|
||||
|
||||
class T5Embedder:
    """Wraps a frozen T5 encoder and its tokenizer to embed raw text."""
    # available_models = ["google/t5-v1_1-xxl"]

    def __init__(
        self,
        device,
        from_pretrained=None,
        *,
        cache_dir=None,
        hf_token=None,
        use_text_preprocessing=True,
        t5_model_kwargs=None,
        torch_dtype=None,
        use_offload_folder=None,
        model_max_length=120,
        local_files_only=False,
    ):
        # from_pretrained="google/t5-v1_1-xxl" # zijian
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        self.cache_dir = cache_dir

        if t5_model_kwargs is None:
            t5_model_kwargs = {
                "low_cpu_mem_usage": True,
                "torch_dtype": self.torch_dtype,
            }

            if use_offload_folder is not None:
                # Keep the embedding layers and the first 12 encoder blocks
                # on-device; offload the remaining blocks and tail to disk.
                t5_model_kwargs["offload_folder"] = use_offload_folder
                device_map = {
                    "shared": self.device,
                    "encoder.embed_tokens": self.device,
                }
                device_map.update({f"encoder.block.{i}": self.device for i in range(12)})
                device_map.update({f"encoder.block.{i}": "disk" for i in range(12, 24)})
                device_map["encoder.final_layer_norm"] = "disk"
                device_map["encoder.dropout"] = "disk"
                t5_model_kwargs["device_map"] = device_map
            else:
                t5_model_kwargs["device_map"] = {
                    "shared": self.device,
                    "encoder": self.device,
                }

        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token

        # assert from_pretrained in self.available_models
        self.tokenizer = AutoTokenizer.from_pretrained(
            from_pretrained,
            model_max_length=model_max_length,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.model = T5EncoderModel.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            **t5_model_kwargs,
        ).eval()
        self.model_max_length = model_max_length

    def get_text_embeddings(self, texts):
        """Tokenize `texts` and return (last_hidden_state, attention_mask)."""
        tokenized = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding="longest",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        input_ids = tokenized["input_ids"].to(self.device)
        attention_mask = tokenized["attention_mask"].to(self.device)
        with torch.no_grad():
            embeddings = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )["last_hidden_state"].detach()
        return embeddings, attention_mask
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
T5Embedder(from_pretrained="google/t5-v1_1-xxl", device='cuda:7')
|
||||
304
RDT/rdt-export/models/rdt/blocks.py
Normal file
304
RDT/rdt-export/models/rdt/blocks.py
Normal file
@ -0,0 +1,304 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
# --------------------------------------------------------
|
||||
# References:
|
||||
# DiT: https://github.com/facebookresearch/DiT
|
||||
# GLIDE: https://github.com/openai/glide-text2im
|
||||
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
|
||||
# --------------------------------------------------------
|
||||
|
||||
import math
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.jit import Final
|
||||
from timm.models.vision_transformer import Attention, Mlp, RmsNorm, use_fused_attn
|
||||
|
||||
|
||||
#################################################################################
|
||||
# Embedding Layers for Timesteps and Condition Inptus #
|
||||
#################################################################################
|
||||
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=torch.bfloat16):
        super().__init__()
        # Two-layer MLP lifts the sinusoidal features to the model width.
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.dtype = dtype

    def timestep_embedding(self, t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        exponent = torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
        freqs = torch.exp(-math.log(max_period) * exponent)
        angles = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
        if dim % 2:
            # Zero-pad the last channel for odd dimensions.
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding.to(self.dtype)

    def forward(self, t):
        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
|
||||
|
||||
|
||||
#################################################################################
|
||||
# Cross Attention Layers #
|
||||
#################################################################################
|
||||
class CrossAttention(nn.Module):
    """
    A cross-attention layer with flash attention.

    Queries come from the input sequence `x`; keys/values come from the
    condition sequence `c`. Uses torch's fused scaled-dot-product attention
    when timm's `use_fused_attn()` reports it is available, otherwise a
    manual softmax-attention path.
    """
    fused_attn: Final[bool]

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0,
        proj_drop: float = 0,
        norm_layer: nn.Module = nn.LayerNorm,
    ) -> None:
        """
        dim: embedding dimension (must be divisible by num_heads).
        qk_norm: apply `norm_layer` per-head to queries and keys.
        attn_drop / proj_drop: dropout probabilities for attention weights
        and the output projection.
        """
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        # Decided once at construction time, not per forward call.
        self.fused_attn = use_fused_attn()

        # Separate projections: q from x, packed k/v from the condition c.
        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor, c: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
        """
        x: (B, N, C) query tokens; c: (B, L, C) condition tokens;
        mask: optional (B, L) boolean mask over condition tokens
        (True = attend). Returns (B, N, C).
        """
        B, N, C = x.shape
        _, L, _ = c.shape
        # (B, heads, N, head_dim) / (2, B, heads, L, head_dim)
        q = self.q(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        kv = self.kv(c).reshape(B, L, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        k, v = kv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        # Prepare attn mask (B, L) to mask the condition: broadcast to
        # (B, 1, N, L) so every query row shares the same condition mask.
        if mask is not None:
            mask = mask.reshape(B, 1, 1, L)
            mask = mask.expand(-1, -1, N, -1)

        if self.fused_attn:
            # Fused kernel applies scaling, masking and dropout internally.
            x = F.scaled_dot_product_attention(query=q,
                                               key=k,
                                               value=v,
                                               dropout_p=self.attn_drop.p if self.training else 0.,
                                               attn_mask=mask)
        else:
            # Manual path: scale queries, mask with -inf before softmax.
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            if mask is not None:
                attn = attn.masked_fill_(mask.logical_not(), float('-inf'))
            attn = attn.softmax(dim=-1)
            if self.attn_drop.p > 0:
                attn = self.attn_drop(attn)
            x = attn @ v

        # Merge heads back to (B, N, C) and project.
        x = x.permute(0, 2, 1, 3).reshape(B, N, C)
        x = self.proj(x)
        if self.proj_drop.p > 0:
            x = self.proj_drop(x)
        return x
|
||||
|
||||
|
||||
#################################################################################
|
||||
# RDT Block #
|
||||
#################################################################################
|
||||
class RDTBlock(nn.Module):
    """
    A RDT block with cross-attention conditioning.

    Pre-norm transformer block: self-attention, cross-attention over the
    condition tokens, then a feed-forward network, each wrapped in a
    residual connection.
    """

    def __init__(self, hidden_size, num_heads, **block_kwargs):
        super().__init__()
        # Self- and cross-attention share the same configuration.
        attn_cfg = dict(num_heads=num_heads,
                        qkv_bias=True,
                        qk_norm=True,
                        norm_layer=RmsNorm,
                        **block_kwargs)
        self.norm1 = RmsNorm(hidden_size, eps=1e-6)
        self.attn = Attention(dim=hidden_size, **attn_cfg)
        self.cross_attn = CrossAttention(hidden_size, **attn_cfg)

        self.norm2 = RmsNorm(hidden_size, eps=1e-6)
        self.ffn = Mlp(in_features=hidden_size,
                       hidden_features=hidden_size,
                       act_layer=lambda: nn.GELU(approximate="tanh"),
                       drop=0)
        self.norm3 = RmsNorm(hidden_size, eps=1e-6)

    def forward(self, x, c, mask=None):
        """x: input tokens; c: condition tokens; mask: optional condition mask."""
        x = x + self.attn(self.norm1(x))
        x = x + self.cross_attn(self.norm2(x), c, mask)
        x = x + self.ffn(self.norm3(x))
        return x
|
||||
|
||||
|
||||
class FinalLayer(nn.Module):
    """
    The final layer of RDT: RMSNorm followed by a GELU MLP projecting
    hidden_size -> out_channels.
    """

    def __init__(self, hidden_size, out_channels):
        super().__init__()
        self.norm_final = RmsNorm(hidden_size, eps=1e-6)
        self.ffn_final = Mlp(in_features=hidden_size,
                             hidden_features=hidden_size,
                             out_features=out_channels,
                             act_layer=lambda: nn.GELU(approximate="tanh"),
                             drop=0)

    def forward(self, x):
        return self.ffn_final(self.norm_final(x))
|
||||
|
||||
|
||||
#################################################################################
|
||||
# Sine/Cosine Positional Embedding Functions #
|
||||
#################################################################################
|
||||
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
|
||||
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position (must be even)
    pos: a list of positions to be encoded: size (M,)
    out: (M, D) array laid out as [sin | cos] halves.
    """
    assert embed_dim % 2 == 0
    # Geometric frequency ladder 1/10000^(2i/D), i = 0..D/2-1.
    omega = 1. / 10000**(np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2.))

    if not isinstance(pos, np.ndarray):
        pos = np.array(pos, dtype=np.float64)
    # Outer product of flattened positions with frequencies -> (M, D/2).
    angles = np.outer(pos.reshape(-1), omega)

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
|
||||
|
||||
|
||||
def get_nd_sincos_pos_embed_from_grid(embed_dim, grid_sizes):
    """
    embed_dim: output dimension for each position
    grid_sizes: the grid sizes in each dimension, as a tuple (K,).
        Dimensions with size <= 1 receive no positional embedding.
    out: (grid_sizes[0], ..., grid_sizes[K-1], embed_dim)
    """
    num_sizes = len(grid_sizes)
    # For grid size of 1, we do not need to add any positional embedding
    num_valid_sizes = len([x for x in grid_sizes if x > 1])
    emb = np.zeros(grid_sizes + (embed_dim, ))
    # Fix: the original divided by num_valid_sizes unconditionally, raising
    # ZeroDivisionError when every grid size is <= 1. The all-zero embedding
    # is the correct degenerate result in that case.
    if num_valid_sizes == 0:
        return emb
    # Uniformly divide the embedding dimension for each grid size
    dim_for_each_grid = embed_dim // num_valid_sizes
    # To make it even (sin/cos halves must pair up)
    if dim_for_each_grid % 2 != 0:
        dim_for_each_grid -= 1
    valid_size_idx = 0
    for size_idx in range(num_sizes):
        grid_size = grid_sizes[size_idx]
        if grid_size <= 1:
            continue
        pos = np.arange(grid_size)
        # Broadcast shape: -1 along this grid axis, 1 elsewhere.
        posemb_shape = [1] * len(grid_sizes) + [dim_for_each_grid]
        posemb_shape[size_idx] = -1
        emb[..., valid_size_idx * dim_for_each_grid:(valid_size_idx + 1) * dim_for_each_grid] += \
            get_1d_sincos_pos_embed_from_grid(dim_for_each_grid, pos).reshape(posemb_shape)
        valid_size_idx += 1
    return emb
|
||||
|
||||
|
||||
def get_multimodal_cond_pos_embed(embed_dim, mm_cond_lens: OrderedDict, embed_modality=True):
    """Generate position embeddings for multimodal conditions.

    mm_cond_lens: an OrderedDict containing
        (modality name, modality token length) pairs.
        For the `"image"` modality, the value can be a multi-dimensional
        tuple/list of grid sizes. If a length < 0, there is no position
        embedding for that modality or grid axis.
    embed_modality: whether to embed which modality each token belongs to
        (stored in the first half of the channels). Default is True.
    Returns an array of shape (total token count, embed_dim).
    """
    num_modalities = len(mm_cond_lens)
    modality_pos_embed = np.zeros((num_modalities, embed_dim))
    if embed_modality:
        # First half of the channels identifies the modality ...
        half = embed_dim // 2
        modality_pos_embed[:, :half] = get_1d_sincos_pos_embed_from_grid(half, torch.arange(num_modalities))
        # ... second half is reserved for within-modality positions.
        pos_embed_dim = half
    else:
        # All channels are used for within-modality positions.
        pos_embed_dim = embed_dim

    # Build the per-modality position embeddings, then stack them in order.
    chunks = []
    for idx, (modality, cond_len) in enumerate(mm_cond_lens.items()):
        if modality == "image" and isinstance(cond_len, (tuple, list)):
            # Multi-dimensional image grid: embed each axis separately.
            all_grid_sizes = tuple(abs(x) for x in cond_len)
            embed_grid_sizes = tuple(x if x > 0 else 1 for x in cond_len)
            grid_embed = get_nd_sincos_pos_embed_from_grid(pos_embed_dim, embed_grid_sizes)
            cond_pos_embed = np.zeros(all_grid_sizes + (embed_dim, ))
            cond_pos_embed[..., -pos_embed_dim:] += grid_embed
            cond_pos_embed = cond_pos_embed.reshape((-1, embed_dim))
        else:
            # 1-D token sequence; a non-positive length collapses to a single
            # shared embedding broadcast to all |cond_len| tokens.
            seq_embed = get_1d_sincos_pos_embed_from_grid(pos_embed_dim,
                                                          torch.arange(cond_len if cond_len > 0 else 1))
            cond_pos_embed = np.zeros((abs(cond_len), embed_dim))
            cond_pos_embed[:, -pos_embed_dim:] += seq_embed
        cond_pos_embed += modality_pos_embed[idx]
        chunks.append(cond_pos_embed)

    return np.concatenate([np.zeros((0, embed_dim))] + chunks, axis=0)
|
||||
156
RDT/rdt-export/models/rdt/model.py
Normal file
156
RDT/rdt-export/models/rdt/model.py
Normal file
@ -0,0 +1,156 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
# --------------------------------------------------------
|
||||
# References:
|
||||
# DiT: https://github.com/facebookresearch/DiT
|
||||
# GLIDE: https://github.com/openai/glide-text2im
|
||||
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
|
||||
# --------------------------------------------------------
|
||||
from collections import OrderedDict
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from pathlib import Path
|
||||
import sys, os
|
||||
# get current workspace
|
||||
current_file = Path(__file__)
|
||||
sys.path.append(str(current_file.parent.parent))
|
||||
|
||||
from rdt.blocks import (FinalLayer, RDTBlock, TimestepEmbedder, get_1d_sincos_pos_embed_from_grid,
|
||||
get_multimodal_cond_pos_embed)
|
||||
|
||||
|
||||
class RDT(nn.Module):
    """
    Class for Robotics Diffusion Transformers.

    A conditional transformer over [timestep; ctrl_freq; state; action]
    tokens; each block cross-attends to either language or image
    condition tokens (alternating by layer index).
    """

    def __init__(self,
                 output_dim=128,
                 horizon=32,
                 hidden_size=1152,
                 depth=28,
                 num_heads=16,
                 max_lang_cond_len=1024,
                 img_cond_len=4096,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        """
        output_dim: dimension of each predicted action token.
        horizon: number of action tokens in the sequence.
        hidden_size: transformer width.
        depth: number of RDTBlock layers.
        num_heads: attention heads per block.
        max_lang_cond_len: maximum number of language condition tokens.
        img_cond_len: fixed number of image condition tokens.
        lang_pos_embed_config / img_pos_embed_config: optional multimodal
            position-embedding specs (see get_multimodal_cond_pos_embed);
            when None, plain 1-D sin-cos embeddings are used.
        dtype: dtype all parameters are cast to at the end of init.
        """
        super().__init__()
        self.horizon = horizon
        self.hidden_size = hidden_size
        self.max_lang_cond_len = max_lang_cond_len
        self.img_cond_len = img_cond_len
        self.dtype = dtype
        self.lang_pos_embed_config = lang_pos_embed_config
        self.img_pos_embed_config = img_pos_embed_config

        # Separate embedders for the diffusion timestep and the control frequency.
        self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
        self.freq_embedder = TimestepEmbedder(hidden_size, dtype=dtype)

        # We will use trainable sin-cos embeddings
        # [timestep; ctrl_freq; state; action] -> horizon + 3 tokens
        self.x_pos_embed = nn.Parameter(torch.zeros(1, horizon + 3, hidden_size))
        # Language conditions
        self.lang_cond_pos_embed = nn.Parameter(torch.zeros(1, max_lang_cond_len, hidden_size))
        # Image conditions
        self.img_cond_pos_embed = nn.Parameter(torch.zeros(1, img_cond_len, hidden_size))

        self.blocks = nn.ModuleList([RDTBlock(hidden_size, num_heads) for _ in range(depth)])
        self.final_layer = FinalLayer(hidden_size, output_dim)
        self.initialize_weights()

    def initialize_weights(self):
        """Xavier-init all linear layers, fill the position-embedding
        parameters with sin-cos values, normal-init the timestep/frequency
        MLPs, zero the final projection, and cast to self.dtype."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize pos_embed by sin-cos embedding
        x_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                    mm_cond_lens=OrderedDict([
                                                        ('timestep', 1),
                                                        ('ctrl_freq', 1),
                                                        ('state', 1),
                                                        ('action', self.horizon),
                                                    ]))
        self.x_pos_embed.data.copy_(torch.from_numpy(x_pos_embed).float().unsqueeze(0))

        # Language position embedding: plain 1-D sin-cos unless a custom
        # multimodal layout is configured.
        if self.lang_pos_embed_config is None:
            lang_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size,
                                                                    torch.arange(self.max_lang_cond_len))
        else:
            lang_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                                mm_cond_lens=OrderedDict(self.lang_pos_embed_config),
                                                                embed_modality=False)
        self.lang_cond_pos_embed.data.copy_(torch.from_numpy(lang_cond_pos_embed).float().unsqueeze(0))

        # Image position embedding: same scheme as language.
        if self.img_pos_embed_config is None:
            img_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.img_cond_len))
        else:
            img_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                               mm_cond_lens=OrderedDict(self.img_pos_embed_config),
                                                               embed_modality=False)
        self.img_cond_pos_embed.data.copy_(torch.from_numpy(img_cond_pos_embed).float().unsqueeze(0))

        # Initialize timestep and control freq embedding MLP
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[2].weight, std=0.02)

        # Initialize the final layer: zero-out the final linear layer
        nn.init.constant_(self.final_layer.ffn_final.fc2.weight, 0)
        nn.init.constant_(self.final_layer.ffn_final.fc2.bias, 0)

        # Move all the params to given data type:
        self.to(self.dtype)

    def forward(self, x, freq, t, lang_c, img_c, lang_mask=None, img_mask=None):
        """
        Forward pass of RDT.

        x: (B, T, D), state + action token sequence, T = horizon + 1,
            dimension D is assumed to be the same as the hidden size.
        freq: (B,), a scalar indicating control frequency.
        t: (B,) or (1,), diffusion timesteps.
        lang_c: (B, L_lang, D) or None, language condition tokens (variable length),
            dimension D is assumed to be the same as the hidden size.
        img_c: (B, L_img, D) or None, image condition tokens (fixed length),
            dimension D is assumed to be the same as the hidden size.
        lang_mask: (B, L_lang) or None, language condition mask (True for valid).
        img_mask: (B, L_img) or None, image condition mask (True for valid).

        Returns (B, horizon, output_dim): predictions for the action tokens only.
        """
        t = self.t_embedder(t).unsqueeze(1)  # (B, 1, D) or (1, 1, D)
        freq = self.freq_embedder(freq).unsqueeze(1)  # (B, 1, D)
        # Append timestep to the input tokens
        if t.shape[0] == 1:
            # Broadcast a shared timestep embedding across the batch.
            t = t.expand(x.shape[0], -1, -1)
        x = torch.cat([t, freq, x], dim=1)  # (B, T+2, D) = (B, horizon+3, D)

        # Add multimodal position embeddings
        x = x + self.x_pos_embed
        # Note the lang is of variable length
        lang_c = lang_c + self.lang_cond_pos_embed[:, :lang_c.shape[1]]
        img_c = img_c + self.img_cond_pos_embed

        # Forward pass: even-indexed blocks attend to language, odd-indexed
        # blocks attend to image conditions.
        conds = [lang_c, img_c]
        masks = [lang_mask, img_mask]
        for i, block in enumerate(self.blocks):
            c, mask = conds[i % 2], masks[i % 2]
            x = block(x, c, mask)  # (B, T+1, D)
        # Inject the language condition at the final layer
        x = self.final_layer(x)  # (B, T+1, out_channels)

        # Only preserve the action tokens
        x = x[:, -self.horizon:]
        return x
|
||||
246
RDT/rdt-export/models/rdt_runner.py
Normal file
246
RDT/rdt-export/models/rdt_runner.py
Normal file
@ -0,0 +1,246 @@
|
||||
import re, sys, os
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
|
||||
from diffusers.schedulers.scheduling_dpmsolver_multistep import \
|
||||
DPMSolverMultistepScheduler
|
||||
|
||||
from pathlib import Path
|
||||
# get current workspace
|
||||
current_file = Path(__file__)
|
||||
sys.path.append(os.path.join(current_file.parent))
|
||||
from hub_mixin import CompatiblePyTorchModelHubMixin
|
||||
from rdt.model import RDT
|
||||
|
||||
|
||||
class RDTRunner(nn.Module,
                CompatiblePyTorchModelHubMixin,
                repo_url="https://huggingface.co/robotics-diffusion-transformer/rdt-1b"):
    """Wraps the RDT diffusion transformer with condition adapters and
    noise schedulers (DDPM for training, DPM-Solver++ multistep for
    sampling); exposes compute_loss for training and predict_action for
    inference."""

    def __init__(self,
                 *,
                 action_dim,
                 pred_horizon,
                 config,
                 lang_token_dim,
                 img_token_dim,
                 state_token_dim,
                 max_lang_cond_len,
                 img_cond_len,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        """
        action_dim: dimension of each action vector.
        pred_horizon: number of future actions predicted per call.
        config: dict with 'rdt', 'lang_adaptor', 'img_adaptor',
            'state_adaptor' and 'noise_scheduler' sections.
        lang_token_dim / img_token_dim / state_token_dim: raw widths of
            the three condition token streams before adaptation.
        max_lang_cond_len / img_cond_len: token-length limits forwarded to RDT.
        lang_pos_embed_config / img_pos_embed_config: optional position
            embedding specs forwarded to RDT.
        dtype: model parameter dtype.
        """
        super(RDTRunner, self).__init__()
        # Create diffusion model
        hidden_size = config['rdt']['hidden_size']
        self.model = RDT(
            output_dim=action_dim,
            horizon=pred_horizon,
            hidden_size=hidden_size,
            depth=config['rdt']['depth'],
            num_heads=config['rdt']['num_heads'],
            max_lang_cond_len=max_lang_cond_len,
            img_cond_len=img_cond_len,
            lang_pos_embed_config=lang_pos_embed_config,
            img_pos_embed_config=img_pos_embed_config,
            dtype=dtype,
        )

        # Create adapters for various conditional inputs
        self.lang_adaptor = self.build_condition_adapter(config['lang_adaptor'],
                                                         in_features=lang_token_dim,
                                                         out_features=hidden_size)
        self.img_adaptor = self.build_condition_adapter(config['img_adaptor'],
                                                        in_features=img_token_dim,
                                                        out_features=hidden_size)
        # A `state` refers to an action or a proprioception vector
        self.state_adaptor = self.build_condition_adapter(
            config['state_adaptor'],
            in_features=state_token_dim * 2,  # state + state mask (indicator)
            out_features=hidden_size)

        # Create the noise scheduler (DDPM schedule for training)
        noise_scheduler_config = config['noise_scheduler']
        self.noise_scheduler = DDPMScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
            clip_sample=noise_scheduler_config['clip_sample'],
        )
        # Faster multistep solver used only at inference time.
        self.noise_scheduler_sample = DPMSolverMultistepScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
        )

        self.num_train_timesteps = noise_scheduler_config['num_train_timesteps']
        self.num_inference_timesteps = noise_scheduler_config['num_inference_timesteps']
        self.prediction_type = noise_scheduler_config['prediction_type']

        self.pred_horizon = pred_horizon
        self.action_dim = action_dim

        # Report total trainable parameter count (model + all adapters).
        print("Diffusion params: %e" %
              sum([p.numel() for p in self.model.parameters()] + [p.numel() for p in self.lang_adaptor.parameters()] +
                  [p.numel()
                   for p in self.img_adaptor.parameters()] + [p.numel() for p in self.state_adaptor.parameters()]))

    def build_condition_adapter(self, projector_type, in_features, out_features):
        """Build a projection module mapping condition tokens to the hidden size.

        projector_type: 'linear' for a single linear layer, or 'mlp{N}x_gelu'
            for N linear layers with tanh-approximated GELU in between.
        Raises ValueError for unknown projector types.
        """
        projector = None
        if projector_type == 'linear':
            projector = nn.Linear(in_features, out_features)
        else:
            mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
            if mlp_gelu_match:
                mlp_depth = int(mlp_gelu_match.group(1))
                modules = [nn.Linear(in_features, out_features)]
                for _ in range(1, mlp_depth):
                    modules.append(nn.GELU(approximate="tanh"))
                    modules.append(nn.Linear(out_features, out_features))
                projector = nn.Sequential(*modules)

        if projector is None:
            raise ValueError(f'Unknown projector type: {projector_type}')

        return projector

    def adapt_conditions(self, lang_tokens, img_tokens, state_tokens):
        '''
        Project the three raw condition streams to the hidden size.

        lang_tokens: (batch_size, lang_len, lang_token_dim)
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, state_len, state_token_dim)

        return: adapted (..., hidden_size) for all input tokens
        '''
        adpated_lang = self.lang_adaptor(lang_tokens)
        adpated_img = self.img_adaptor(img_tokens)
        adpated_state = self.state_adaptor(state_tokens)

        return adpated_lang, adpated_img, adpated_state

    def conditional_sample(self, lang_cond, lang_attn_mask, img_cond, state_traj, action_mask, ctrl_freqs):
        '''
        Run the reverse diffusion process to sample an action trajectory.

        lang_cond: language conditional data, (batch_size, lang_len, hidden_size).
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_cond: image conditional data, (batch_size, img_len, hidden_size).
        state_traj: (batch_size, 1, hidden_size), state trajectory.
        action_mask: (batch_size, 1, action_dim), a 0-1 **float** tensor
            indicating the valid action dimensions.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: (batch_size, horizon, action_dim)
        '''
        device = state_traj.device
        dtype = state_traj.dtype
        # Start from pure Gaussian noise and iteratively denoise.
        noisy_action = torch.randn(size=(state_traj.shape[0], self.pred_horizon, self.action_dim),
                                   dtype=dtype,
                                   device=device)
        action_mask = action_mask.expand(-1, self.pred_horizon, -1)

        # Set step values
        self.noise_scheduler_sample.set_timesteps(self.num_inference_timesteps)

        for t in self.noise_scheduler_sample.timesteps:
            # Prepare state-action trajectory: append the mask as extra
            # channels, project to hidden size, prefix the state token.
            action_traj = torch.cat([noisy_action, action_mask], dim=2)
            action_traj = self.state_adaptor(action_traj)
            state_action_traj = torch.cat([state_traj, action_traj], dim=1)

            # Predict the model output
            model_output = self.model(state_action_traj,
                                      ctrl_freqs,
                                      t.unsqueeze(-1).to(device),
                                      lang_cond,
                                      img_cond,
                                      lang_mask=lang_attn_mask)

            # Compute previous actions: x_t -> x_t-1
            noisy_action = self.noise_scheduler_sample.step(model_output, t, noisy_action).prev_sample
            noisy_action = noisy_action.to(state_traj.dtype)

        # Finally apply the action mask to mask invalid action dimensions
        noisy_action = noisy_action * action_mask

        return noisy_action

    # ========= Train ============
    def compute_loss(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_gt, action_mask,
                     ctrl_freqs) -> torch.Tensor:
        '''
        One training step: noise the ground-truth actions, denoise with the
        model, and return the MSE against the scheduler's target.

        lang_tokens: (batch_size, lang_len, lang_token_dim)
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, 1, state_token_dim)
        action_gt: (batch_size, horizon, state_token_dim), ground-truth actions for supervision
        action_mask: (batch_size, 1, state_token_dim), a 0-1 **float** tensor.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: loss_value, a scalar tensor
        '''
        batch_size = lang_tokens.shape[0]
        device = lang_tokens.device
        # Sample noise that we'll add to the actions
        noise = torch.randn(action_gt.shape, dtype=action_gt.dtype, device=device)
        # Sample random diffusion timesteps
        timesteps = torch.randint(0, self.num_train_timesteps, (batch_size, ), device=device).long()
        # Add noise to the clean actions according to the noise magnitude at each timestep
        # (this is the forward diffusion process)
        noisy_action = self.noise_scheduler.add_noise(action_gt, noise, timesteps)

        # Concatenate the state and action tokens to form the input sequence
        state_action_traj = torch.cat([state_tokens, noisy_action], dim=1)
        # Append the action mask to the input sequence
        action_mask = action_mask.expand(-1, state_action_traj.shape[1], -1)
        state_action_traj = torch.cat([state_action_traj, action_mask], dim=2)
        # Align the dimension with the hidden size
        lang_cond, img_cond, state_action_traj = self.adapt_conditions(lang_tokens, img_tokens, state_action_traj)
        # Predict the denoised result
        pred = self.model(state_action_traj, ctrl_freqs, timesteps, lang_cond, img_cond, lang_mask=lang_attn_mask)

        # The regression target depends on the scheduler's prediction type.
        pred_type = self.prediction_type
        if pred_type == 'epsilon':
            target = noise
        elif pred_type == 'sample':
            target = action_gt
        else:
            raise ValueError(f"Unsupported prediction type {pred_type}")
        loss = F.mse_loss(pred, target)
        return loss

    # ========= Inference ============
    def predict_action(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_mask, ctrl_freqs):
        '''
        Sample an action sequence conditioned on language, images and state.

        lang_tokens: (batch_size, lang_len, lang_token_dim)
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, 1, state_token_dim)
        action_mask: (batch_size, 1, action_dim),
            which should be a 0-1 **float** tensor.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: (batch_size, horizon, action_dim), predicted action sequence
        '''
        # Prepare the state and conditions
        state_tokens = torch.cat([state_tokens, action_mask], dim=2)
        lang_cond, img_cond, state_traj = self.adapt_conditions(lang_tokens, img_tokens, state_tokens)

        # Run sampling
        action_pred = self.conditional_sample(
            lang_cond,
            lang_attn_mask,
            img_cond,
            state_traj,
            action_mask,
            ctrl_freqs,
        )

        return action_pred

    def forward(self, *args, **kwargs) -> torch.Tensor:
        """Alias for compute_loss so the module can be called directly during training."""
        return self.compute_loss(*args, **kwargs)
|
||||
35
RDT/rdt-export/requirements.txt
Normal file
35
RDT/rdt-export/requirements.txt
Normal file
@ -0,0 +1,35 @@
|
||||
numpy<2.0
|
||||
packaging==24.0
|
||||
wandb==0.17.0
|
||||
deepspeed==0.14.2
|
||||
accelerate==0.30.1
|
||||
diffusers==0.27.2
|
||||
timm==1.0.3
|
||||
transformers==4.41.0
|
||||
sentencepiece==0.2.0
|
||||
h5py==3.11.0
|
||||
opencv-python==4.9.0.80
|
||||
imgaug==0.4.0
|
||||
pytz==2022.1
|
||||
huggingface_hub==0.23.0
|
||||
|
||||
torch==2.1.0
|
||||
torchvision==0.16.0
|
||||
pyzmq
|
||||
msgpack
|
||||
msgpack_numpy
|
||||
zstandard
|
||||
onnx
|
||||
onnxruntime
|
||||
onnxsim
|
||||
|
||||
# requirements_data.txt
|
||||
# tfds-nightly==4.9.4.dev202402070044
|
||||
gsutil==5.27
|
||||
tensorflow==2.15.0.post1
|
||||
pillow==10.2.0
|
||||
pyyaml==6.0.1
|
||||
tensorflow-graphics==2021.12.3
|
||||
imageio==2.34.0
|
||||
imageio-ffmpeg==0.4.9
|
||||
941
RDT/rdt-export/scripts/agilex_inference.py
Normal file
941
RDT/rdt-export/scripts/agilex_inference.py
Normal file
@ -0,0 +1,941 @@
|
||||
#!/home/lin/software/miniconda3/envs/aloha/bin/python
|
||||
# -- coding: UTF-8
|
||||
"""
|
||||
#!/usr/bin/python3
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import yaml
|
||||
from collections import deque
|
||||
|
||||
import numpy as np
|
||||
import rospy
|
||||
import torch
|
||||
from cv_bridge import CvBridge
|
||||
from geometry_msgs.msg import Twist
|
||||
from nav_msgs.msg import Odometry
|
||||
from PIL import Image as PImage
|
||||
from sensor_msgs.msg import Image, JointState
|
||||
from std_msgs.msg import Header
|
||||
import cv2
|
||||
|
||||
from scripts.agilex_model import create_model
|
||||
|
||||
# sys.path.append("./")
|
||||
|
||||
# Camera topics in the order the policy expects: [front, right wrist, left wrist].
CAMERA_NAMES = ["cam_high", "cam_right_wrist", "cam_left_wrist"]

# Sliding window of the two most recent observations (created lazily in
# update_observation_window).
observation_window = None

# Pre-computed language instruction embeddings (loaded in model_inference).
lang_embeddings = None

# debug
preload_images = None
|
||||
|
||||
|
||||
# Initialize the model
|
||||
# Initialize the model
def make_policy(args):
    """Load the YAML config and instantiate the RDT policy.

    args must provide config_path, pretrained_model_name_or_path and
    ctrl_freq; the parsed config dict is stored back onto args.config
    as a side effect.
    """
    with open(args.config_path, "r") as fp:
        args.config = yaml.safe_load(fp)

    # pretrained_text_encoder_name_or_path = "google/t5-v1_1-xxl"
    vision_encoder_path = "google/siglip-so400m-patch14-384"
    return create_model(
        args=args.config,
        dtype=torch.bfloat16,
        pretrained=args.pretrained_model_name_or_path,
        # pretrained_text_encoder_name_or_path=pretrained_text_encoder_name_or_path,
        pretrained_vision_encoder_name_or_path=vision_encoder_path,
        control_frequency=args.ctrl_freq,
    )
|
||||
|
||||
|
||||
def set_seed(seed):
    """Seed the numpy and torch RNGs so rollouts are reproducible."""
    np.random.seed(seed)
    torch.manual_seed(seed)
|
||||
|
||||
|
||||
# Interpolate the actions to make the robot move smoothly
|
||||
# Interpolate the actions to make the robot move smoothly
def interpolate_action(args, prev_action, cur_action):
    """Linearly interpolate between two actions so that no joint moves
    more than args.arm_steps_length in a single published step.

    Returns an array of shape (k, action_dim) ending at cur_action;
    k == 1 when the move is already within the per-step limits.
    """
    # Per-joint step limits, duplicated for the left and right arm.
    limits = np.tile(np.array(args.arm_steps_length), 2)
    # Number of sub-steps required by the most-constrained joint.
    num_steps = np.max(np.ceil(np.abs(cur_action - prev_action) / limits).astype(int))
    if num_steps <= 1:
        return cur_action[np.newaxis, :]
    # Drop the first waypoint (== prev_action, already executed).
    return np.linspace(prev_action, cur_action, num_steps + 1)[1:]
|
||||
|
||||
|
||||
def get_config(args):
    """Assemble the runtime config dict from CLI args and module constants."""
    return {
        "episode_len": args.max_publish_step,
        # 14 = 7 joints per arm, two arms.
        "state_dim": 14,
        "chunk_size": args.chunk_size,
        "camera_names": CAMERA_NAMES,
    }
|
||||
|
||||
|
||||
# Get the observation from the ROS topic
|
||||
# Get the observation from the ROS topic
def get_ros_observation(args, ros_operator):
    """Block until a synchronized frame is available and return
    (img_front, img_left, img_right, puppet_arm_left, puppet_arm_right).

    Depth images and the robot base state are fetched but discarded.
    """
    rate = rospy.Rate(args.publish_rate)
    print_flag = True

    while not rospy.is_shutdown():
        result = ros_operator.get_frame()
        if result:
            # print(f"sync success when get_ros_observation")
            (img_front, img_left, img_right, _front_depth, _left_depth,
             _right_depth, puppet_arm_left, puppet_arm_right, _robot_base) = result
            return (img_front, img_left, img_right, puppet_arm_left, puppet_arm_right)
        # Only report the first of a run of failures to keep the log quiet.
        if print_flag:
            print("syn fail when get_ros_observation")
            print_flag = False
        rate.sleep()
|
||||
|
||||
|
||||
# Update the observation window buffer
|
||||
# Update the observation window buffer
def update_observation_window(args, config, ros_operator):
    """Fetch the latest synchronized frame from ROS and push it into the
    global two-slot observation window (deque of the two most recent
    observations)."""

    # JPEG transformation
    # Align with training: re-encode/decode so inference images carry the
    # same JPEG compression artifacts seen during training.
    def jpeg_mapping(img):
        img = cv2.imencode(".jpg", img)[1].tobytes()
        img = cv2.imdecode(np.frombuffer(img, np.uint8), cv2.IMREAD_COLOR)
        return img

    global observation_window
    if observation_window is None:
        observation_window = deque(maxlen=2)

        # Append the first dummy image so the window always holds two entries.
        observation_window.append({
            "qpos": None,
            "images": {
                config["camera_names"][0]: None,
                config["camera_names"][1]: None,
                config["camera_names"][2]: None,
            },
        })

    img_front, img_left, img_right, puppet_arm_left, puppet_arm_right = (get_ros_observation(args, ros_operator))
    img_front = jpeg_mapping(img_front)
    img_left = jpeg_mapping(img_left)
    img_right = jpeg_mapping(img_right)

    # Joint positions of both arms concatenated: [left(7), right(7)].
    qpos = np.concatenate(
        (np.array(puppet_arm_left.position), np.array(puppet_arm_right.position)),
        axis=0,
    )
    qpos = torch.from_numpy(qpos).float().cuda()
    # Images keyed by camera name: [0]=front, [1]=right wrist, [2]=left wrist.
    observation_window.append({
        "qpos": qpos,
        "images": {
            config["camera_names"][0]: img_front,
            config["camera_names"][1]: img_right,
            config["camera_names"][2]: img_left,
        },
    })
|
||||
|
||||
|
||||
# RDT inference
|
||||
# RDT inference
def inference_fn(args, config, policy, t):
    """Run one policy inference step on the current observation window and
    return the predicted action chunk as a numpy array.

    Reads the global observation_window (two most recent observations)
    and lang_embeddings (set in model_inference).
    """
    global observation_window
    global lang_embeddings

    # print(f"Start inference_thread_fn: t={t}")
    while True and not rospy.is_shutdown():
        time1 = time.time()

        # fetch images in sequence [front, right, left]: previous frame
        # first (window[-2]), then the current frame (window[-1]).
        image_arrs = [
            observation_window[-2]["images"][config["camera_names"][0]],
            observation_window[-2]["images"][config["camera_names"][1]],
            observation_window[-2]["images"][config["camera_names"][2]],
            observation_window[-1]["images"][config["camera_names"][0]],
            observation_window[-1]["images"][config["camera_names"][1]],
            observation_window[-1]["images"][config["camera_names"][2]],
        ]

        # fetch debug images in sequence [front, right, left]
        # image_arrs = [
        #     preload_images[config['camera_names'][0]][max(t - 1, 0)],
        #     preload_images[config['camera_names'][2]][max(t - 1, 0)],
        #     preload_images[config['camera_names'][1]][max(t - 1, 0)],
        #     preload_images[config['camera_names'][0]][t],
        #     preload_images[config['camera_names'][2]][t],
        #     preload_images[config['camera_names'][1]][t]
        # ]
        # # encode the images
        # for i in range(len(image_arrs)):
        #     image_arrs[i] = cv2.imdecode(np.frombuffer(image_arrs[i], np.uint8), cv2.IMREAD_COLOR)
        # proprio = torch.from_numpy(preload_images['qpos'][t]).float().cuda()

        # The very first window slot holds None placeholders; keep them as None.
        images = [PImage.fromarray(arr) if arr is not None else None for arr in image_arrs]

        # for i, pos in enumerate(['f', 'r', 'l'] * 2):
        #     images[i].save(f'{t}-{i}-{pos}.png')

        # get last qpos in shape [14, ]
        proprio = observation_window[-1]["qpos"]
        # unsqueeze to [1, 14]
        proprio = proprio.unsqueeze(0)

        # actions shaped as [1, 64, 14] in format [left, right]
        actions = (policy.step(proprio=proprio, images=images, text_embeds=lang_embeddings).squeeze(0).cpu().numpy())
        # print(f"inference_actions: {actions.squeeze()}")

        # print(f"Model inference time: {time.time() - time1} s")

        # print(f"Finish inference_thread_fn: t={t}")
        return actions
|
||||
|
||||
|
||||
# Main loop for the manipulation task
|
||||
# Main loop for the manipulation task
def model_inference(args, config, ros_operator):
    """Run the full closed-loop manipulation session: load the policy and
    language embeddings, home both puppet arms, then repeatedly infer
    action chunks and publish them at args.publish_rate."""
    global lang_embeddings

    # Load rdt model
    policy = make_policy(args)

    lang_dict = torch.load(args.lang_embeddings_path)
    print(f"Running with instruction: \"{lang_dict['instruction']}\" from \"{lang_dict['name']}\"")
    lang_embeddings = lang_dict["embeddings"]

    max_publish_step = config["episode_len"]
    chunk_size = config["chunk_size"]

    # Initialize position of the puppet arm
    # Two-stage homing: *0 poses have the gripper open (last joint ~3.56),
    # *1 poses close it (~-0.34); joint values recorded from the real robot.
    left0 = [
        -0.00133514404296875,
        0.00209808349609375,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        3.557830810546875,
    ]
    right0 = [
        -0.00133514404296875,
        0.00438690185546875,
        0.034523963928222656,
        -0.053597450256347656,
        -0.00476837158203125,
        -0.00209808349609375,
        3.557830810546875,
    ]
    left1 = [
        -0.00133514404296875,
        0.00209808349609375,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3393220901489258,
    ]
    right1 = [
        -0.00133514404296875,
        0.00247955322265625,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3397035598754883,
    ]
    ros_operator.puppet_arm_publish_continuous(left0, right0)
    input("Press enter to continue")
    ros_operator.puppet_arm_publish_continuous(left1, right1)
    # Initialize the previous action to be the initial robot state
    pre_action = np.zeros(config["state_dim"])
    pre_action[:14] = np.array([
        -0.00133514404296875,
        0.00209808349609375,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3393220901489258,
    ] + [
        -0.00133514404296875,
        0.00247955322265625,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3397035598754883,
    ])
    action = None
    # Inference loop
    with torch.inference_mode():
        while True and not rospy.is_shutdown():
            # The current time step
            t = 0
            rate = rospy.Rate(args.publish_rate)

            action_buffer = np.zeros([chunk_size, config["state_dim"]])

            while t < max_publish_step and not rospy.is_shutdown():
                # Update observation window
                update_observation_window(args, config, ros_operator)

                # When coming to the end of the action chunk, re-run inference
                if t % chunk_size == 0:
                    # Start inference
                    action_buffer = inference_fn(args, config, policy, t).copy()

                raw_action = action_buffer[t % chunk_size]
                action = raw_action
                # Interpolate the original action sequence
                if args.use_actions_interpolation:
                    # print(f"Time {t}, pre {pre_action}, act {action}")
                    interp_actions = interpolate_action(args, pre_action, action)
                else:
                    interp_actions = action[np.newaxis, :]
                # Execute the interpolated actions one by one
                for act in interp_actions:
                    # Action layout: [left arm (7), right arm (7), base vel (2)].
                    left_action = act[:7]
                    right_action = act[7:14]

                    if not args.disable_puppet_arm:
                        ros_operator.puppet_arm_publish(left_action,
                                                        right_action)  # puppet_arm_publish_continuous_thread

                    if args.use_robot_base:
                        vel_action = act[14:16]
                        ros_operator.robot_base_publish(vel_action)
                    rate.sleep()
                    # print(f"doing action: {act}")
                t += 1

                print("Published Step", t)
                pre_action = action.copy()
|
||||
|
||||
|
||||
# ROS operator class
|
||||
class RosOperator:
|
||||
|
||||
    def __init__(self, args):
        """Declare all ROS-related attributes, then build the buffers
        (init) and set up publishers/subscribers (init_ros, defined later
        in this class)."""
        # Per-topic message buffers (created as deques in init()).
        self.robot_base_deque = None
        self.puppet_arm_right_deque = None
        self.puppet_arm_left_deque = None
        self.img_front_deque = None
        self.img_right_deque = None
        self.img_left_deque = None
        self.img_front_depth_deque = None
        self.img_right_depth_deque = None
        self.img_left_depth_deque = None
        # OpenCV <-> ROS image converter.
        self.bridge = None
        # Publishers for arm joint targets and base velocity.
        self.puppet_arm_left_publisher = None
        self.puppet_arm_right_publisher = None
        self.robot_base_publisher = None
        # Background publishing thread and its coordination lock.
        self.puppet_arm_publish_thread = None
        self.puppet_arm_publish_lock = None
        self.args = args
        self.init()
        self.init_ros()
|
||||
|
||||
def init(self):
|
||||
self.bridge = CvBridge()
|
||||
self.img_left_deque = deque()
|
||||
self.img_right_deque = deque()
|
||||
self.img_front_deque = deque()
|
||||
self.img_left_depth_deque = deque()
|
||||
self.img_right_depth_deque = deque()
|
||||
self.img_front_depth_deque = deque()
|
||||
self.puppet_arm_left_deque = deque()
|
||||
self.puppet_arm_right_deque = deque()
|
||||
self.robot_base_deque = deque()
|
||||
self.puppet_arm_publish_lock = threading.Lock()
|
||||
self.puppet_arm_publish_lock.acquire()
|
||||
|
||||
def puppet_arm_publish(self, left, right):
|
||||
joint_state_msg = JointState()
|
||||
joint_state_msg.header = Header()
|
||||
joint_state_msg.header.stamp = rospy.Time.now() # Set timestep
|
||||
joint_state_msg.name = [
|
||||
"joint0",
|
||||
"joint1",
|
||||
"joint2",
|
||||
"joint3",
|
||||
"joint4",
|
||||
"joint5",
|
||||
"joint6",
|
||||
] # 设置关节名称
|
||||
joint_state_msg.position = left
|
||||
self.puppet_arm_left_publisher.publish(joint_state_msg)
|
||||
joint_state_msg.position = right
|
||||
self.puppet_arm_right_publisher.publish(joint_state_msg)
|
||||
|
||||
def robot_base_publish(self, vel):
|
||||
vel_msg = Twist()
|
||||
vel_msg.linear.x = vel[0]
|
||||
vel_msg.linear.y = 0
|
||||
vel_msg.linear.z = 0
|
||||
vel_msg.angular.x = 0
|
||||
vel_msg.angular.y = 0
|
||||
vel_msg.angular.z = vel[1]
|
||||
self.robot_base_publisher.publish(vel_msg)
|
||||
|
||||
def puppet_arm_publish_continuous(self, left, right):
|
||||
rate = rospy.Rate(self.args.publish_rate)
|
||||
left_arm = None
|
||||
right_arm = None
|
||||
while True and not rospy.is_shutdown():
|
||||
if len(self.puppet_arm_left_deque) != 0:
|
||||
left_arm = list(self.puppet_arm_left_deque[-1].position)
|
||||
if len(self.puppet_arm_right_deque) != 0:
|
||||
right_arm = list(self.puppet_arm_right_deque[-1].position)
|
||||
if left_arm is None or right_arm is None:
|
||||
rate.sleep()
|
||||
continue
|
||||
else:
|
||||
break
|
||||
left_symbol = [1 if left[i] - left_arm[i] > 0 else -1 for i in range(len(left))]
|
||||
right_symbol = [1 if right[i] - right_arm[i] > 0 else -1 for i in range(len(right))]
|
||||
flag = True
|
||||
step = 0
|
||||
while flag and not rospy.is_shutdown():
|
||||
if self.puppet_arm_publish_lock.acquire(False):
|
||||
return
|
||||
left_diff = [abs(left[i] - left_arm[i]) for i in range(len(left))]
|
||||
right_diff = [abs(right[i] - right_arm[i]) for i in range(len(right))]
|
||||
flag = False
|
||||
for i in range(len(left)):
|
||||
if left_diff[i] < self.args.arm_steps_length[i]:
|
||||
left_arm[i] = left[i]
|
||||
else:
|
||||
left_arm[i] += left_symbol[i] * self.args.arm_steps_length[i]
|
||||
flag = True
|
||||
for i in range(len(right)):
|
||||
if right_diff[i] < self.args.arm_steps_length[i]:
|
||||
right_arm[i] = right[i]
|
||||
else:
|
||||
right_arm[i] += right_symbol[i] * self.args.arm_steps_length[i]
|
||||
flag = True
|
||||
joint_state_msg = JointState()
|
||||
joint_state_msg.header = Header()
|
||||
joint_state_msg.header.stamp = rospy.Time.now() # Set the timestep
|
||||
joint_state_msg.name = [
|
||||
"joint0",
|
||||
"joint1",
|
||||
"joint2",
|
||||
"joint3",
|
||||
"joint4",
|
||||
"joint5",
|
||||
"joint6",
|
||||
] # 设置关节名称
|
||||
joint_state_msg.position = left_arm
|
||||
self.puppet_arm_left_publisher.publish(joint_state_msg)
|
||||
joint_state_msg.position = right_arm
|
||||
self.puppet_arm_right_publisher.publish(joint_state_msg)
|
||||
step += 1
|
||||
print("puppet_arm_publish_continuous:", step)
|
||||
rate.sleep()
|
||||
|
||||
def puppet_arm_publish_linear(self, left, right):
|
||||
num_step = 100
|
||||
rate = rospy.Rate(200)
|
||||
|
||||
left_arm = None
|
||||
right_arm = None
|
||||
|
||||
while True and not rospy.is_shutdown():
|
||||
if len(self.puppet_arm_left_deque) != 0:
|
||||
left_arm = list(self.puppet_arm_left_deque[-1].position)
|
||||
if len(self.puppet_arm_right_deque) != 0:
|
||||
right_arm = list(self.puppet_arm_right_deque[-1].position)
|
||||
if left_arm is None or right_arm is None:
|
||||
rate.sleep()
|
||||
continue
|
||||
else:
|
||||
break
|
||||
|
||||
traj_left_list = np.linspace(left_arm, left, num_step)
|
||||
traj_right_list = np.linspace(right_arm, right, num_step)
|
||||
|
||||
for i in range(len(traj_left_list)):
|
||||
traj_left = traj_left_list[i]
|
||||
traj_right = traj_right_list[i]
|
||||
traj_left[-1] = left[-1]
|
||||
traj_right[-1] = right[-1]
|
||||
joint_state_msg = JointState()
|
||||
joint_state_msg.header = Header()
|
||||
joint_state_msg.header.stamp = rospy.Time.now() # 设置时间戳
|
||||
joint_state_msg.name = [
|
||||
"joint0",
|
||||
"joint1",
|
||||
"joint2",
|
||||
"joint3",
|
||||
"joint4",
|
||||
"joint5",
|
||||
"joint6",
|
||||
] # 设置关节名称
|
||||
joint_state_msg.position = traj_left
|
||||
self.puppet_arm_left_publisher.publish(joint_state_msg)
|
||||
joint_state_msg.position = traj_right
|
||||
self.puppet_arm_right_publisher.publish(joint_state_msg)
|
||||
rate.sleep()
|
||||
|
||||
def puppet_arm_publish_continuous_thread(self, left, right):
|
||||
if self.puppet_arm_publish_thread is not None:
|
||||
self.puppet_arm_publish_lock.release()
|
||||
self.puppet_arm_publish_thread.join()
|
||||
self.puppet_arm_publish_lock.acquire(False)
|
||||
self.puppet_arm_publish_thread = None
|
||||
self.puppet_arm_publish_thread = threading.Thread(target=self.puppet_arm_publish_continuous, args=(left, right))
|
||||
self.puppet_arm_publish_thread.start()
|
||||
|
||||
def get_frame(self):
|
||||
if (len(self.img_left_deque) == 0 or len(self.img_right_deque) == 0 or len(self.img_front_deque) == 0 or
|
||||
(self.args.use_depth_image and (len(self.img_left_depth_deque) == 0 or len(self.img_right_depth_deque) == 0
|
||||
or len(self.img_front_depth_deque) == 0))):
|
||||
return False
|
||||
if self.args.use_depth_image:
|
||||
frame_time = min([
|
||||
self.img_left_deque[-1].header.stamp.to_sec(),
|
||||
self.img_right_deque[-1].header.stamp.to_sec(),
|
||||
self.img_front_deque[-1].header.stamp.to_sec(),
|
||||
self.img_left_depth_deque[-1].header.stamp.to_sec(),
|
||||
self.img_right_depth_deque[-1].header.stamp.to_sec(),
|
||||
self.img_front_depth_deque[-1].header.stamp.to_sec(),
|
||||
])
|
||||
else:
|
||||
frame_time = min([
|
||||
self.img_left_deque[-1].header.stamp.to_sec(),
|
||||
self.img_right_deque[-1].header.stamp.to_sec(),
|
||||
self.img_front_deque[-1].header.stamp.to_sec(),
|
||||
])
|
||||
|
||||
if (len(self.img_left_deque) == 0 or self.img_left_deque[-1].header.stamp.to_sec() < frame_time):
|
||||
return False
|
||||
if (len(self.img_right_deque) == 0 or self.img_right_deque[-1].header.stamp.to_sec() < frame_time):
|
||||
return False
|
||||
if (len(self.img_front_deque) == 0 or self.img_front_deque[-1].header.stamp.to_sec() < frame_time):
|
||||
return False
|
||||
if (len(self.puppet_arm_left_deque) == 0 or self.puppet_arm_left_deque[-1].header.stamp.to_sec() < frame_time):
|
||||
return False
|
||||
if (len(self.puppet_arm_right_deque) == 0
|
||||
or self.puppet_arm_right_deque[-1].header.stamp.to_sec() < frame_time):
|
||||
return False
|
||||
if self.args.use_depth_image and (len(self.img_left_depth_deque) == 0
|
||||
or self.img_left_depth_deque[-1].header.stamp.to_sec() < frame_time):
|
||||
return False
|
||||
if self.args.use_depth_image and (len(self.img_right_depth_deque) == 0
|
||||
or self.img_right_depth_deque[-1].header.stamp.to_sec() < frame_time):
|
||||
return False
|
||||
if self.args.use_depth_image and (len(self.img_front_depth_deque) == 0
|
||||
or self.img_front_depth_deque[-1].header.stamp.to_sec() < frame_time):
|
||||
return False
|
||||
if self.args.use_robot_base and (len(self.robot_base_deque) == 0
|
||||
or self.robot_base_deque[-1].header.stamp.to_sec() < frame_time):
|
||||
return False
|
||||
|
||||
while self.img_left_deque[0].header.stamp.to_sec() < frame_time:
|
||||
self.img_left_deque.popleft()
|
||||
img_left = self.bridge.imgmsg_to_cv2(self.img_left_deque.popleft(), "passthrough")
|
||||
|
||||
while self.img_right_deque[0].header.stamp.to_sec() < frame_time:
|
||||
self.img_right_deque.popleft()
|
||||
img_right = self.bridge.imgmsg_to_cv2(self.img_right_deque.popleft(), "passthrough")
|
||||
|
||||
while self.img_front_deque[0].header.stamp.to_sec() < frame_time:
|
||||
self.img_front_deque.popleft()
|
||||
img_front = self.bridge.imgmsg_to_cv2(self.img_front_deque.popleft(), "passthrough")
|
||||
|
||||
while self.puppet_arm_left_deque[0].header.stamp.to_sec() < frame_time:
|
||||
self.puppet_arm_left_deque.popleft()
|
||||
puppet_arm_left = self.puppet_arm_left_deque.popleft()
|
||||
|
||||
while self.puppet_arm_right_deque[0].header.stamp.to_sec() < frame_time:
|
||||
self.puppet_arm_right_deque.popleft()
|
||||
puppet_arm_right = self.puppet_arm_right_deque.popleft()
|
||||
|
||||
img_left_depth = None
|
||||
if self.args.use_depth_image:
|
||||
while self.img_left_depth_deque[0].header.stamp.to_sec() < frame_time:
|
||||
self.img_left_depth_deque.popleft()
|
||||
img_left_depth = self.bridge.imgmsg_to_cv2(self.img_left_depth_deque.popleft(), "passthrough")
|
||||
|
||||
img_right_depth = None
|
||||
if self.args.use_depth_image:
|
||||
while self.img_right_depth_deque[0].header.stamp.to_sec() < frame_time:
|
||||
self.img_right_depth_deque.popleft()
|
||||
img_right_depth = self.bridge.imgmsg_to_cv2(self.img_right_depth_deque.popleft(), "passthrough")
|
||||
|
||||
img_front_depth = None
|
||||
if self.args.use_depth_image:
|
||||
while self.img_front_depth_deque[0].header.stamp.to_sec() < frame_time:
|
||||
self.img_front_depth_deque.popleft()
|
||||
img_front_depth = self.bridge.imgmsg_to_cv2(self.img_front_depth_deque.popleft(), "passthrough")
|
||||
|
||||
robot_base = None
|
||||
if self.args.use_robot_base:
|
||||
while self.robot_base_deque[0].header.stamp.to_sec() < frame_time:
|
||||
self.robot_base_deque.popleft()
|
||||
robot_base = self.robot_base_deque.popleft()
|
||||
|
||||
return (
|
||||
img_front,
|
||||
img_left,
|
||||
img_right,
|
||||
img_front_depth,
|
||||
img_left_depth,
|
||||
img_right_depth,
|
||||
puppet_arm_left,
|
||||
puppet_arm_right,
|
||||
robot_base,
|
||||
)
|
||||
|
||||
def img_left_callback(self, msg):
|
||||
if len(self.img_left_deque) >= 2000:
|
||||
self.img_left_deque.popleft()
|
||||
self.img_left_deque.append(msg)
|
||||
|
||||
def img_right_callback(self, msg):
|
||||
if len(self.img_right_deque) >= 2000:
|
||||
self.img_right_deque.popleft()
|
||||
self.img_right_deque.append(msg)
|
||||
|
||||
def img_front_callback(self, msg):
|
||||
if len(self.img_front_deque) >= 2000:
|
||||
self.img_front_deque.popleft()
|
||||
self.img_front_deque.append(msg)
|
||||
|
||||
def img_left_depth_callback(self, msg):
|
||||
if len(self.img_left_depth_deque) >= 2000:
|
||||
self.img_left_depth_deque.popleft()
|
||||
self.img_left_depth_deque.append(msg)
|
||||
|
||||
def img_right_depth_callback(self, msg):
|
||||
if len(self.img_right_depth_deque) >= 2000:
|
||||
self.img_right_depth_deque.popleft()
|
||||
self.img_right_depth_deque.append(msg)
|
||||
|
||||
def img_front_depth_callback(self, msg):
|
||||
if len(self.img_front_depth_deque) >= 2000:
|
||||
self.img_front_depth_deque.popleft()
|
||||
self.img_front_depth_deque.append(msg)
|
||||
|
||||
def puppet_arm_left_callback(self, msg):
|
||||
if len(self.puppet_arm_left_deque) >= 2000:
|
||||
self.puppet_arm_left_deque.popleft()
|
||||
self.puppet_arm_left_deque.append(msg)
|
||||
|
||||
def puppet_arm_right_callback(self, msg):
|
||||
if len(self.puppet_arm_right_deque) >= 2000:
|
||||
self.puppet_arm_right_deque.popleft()
|
||||
self.puppet_arm_right_deque.append(msg)
|
||||
|
||||
def robot_base_callback(self, msg):
|
||||
if len(self.robot_base_deque) >= 2000:
|
||||
self.robot_base_deque.popleft()
|
||||
self.robot_base_deque.append(msg)
|
||||
|
||||
def init_ros(self):
|
||||
rospy.init_node("joint_state_publisher", anonymous=True)
|
||||
rospy.Subscriber(
|
||||
self.args.img_left_topic,
|
||||
Image,
|
||||
self.img_left_callback,
|
||||
queue_size=1000,
|
||||
tcp_nodelay=True,
|
||||
)
|
||||
rospy.Subscriber(
|
||||
self.args.img_right_topic,
|
||||
Image,
|
||||
self.img_right_callback,
|
||||
queue_size=1000,
|
||||
tcp_nodelay=True,
|
||||
)
|
||||
rospy.Subscriber(
|
||||
self.args.img_front_topic,
|
||||
Image,
|
||||
self.img_front_callback,
|
||||
queue_size=1000,
|
||||
tcp_nodelay=True,
|
||||
)
|
||||
if self.args.use_depth_image:
|
||||
rospy.Subscriber(
|
||||
self.args.img_left_depth_topic,
|
||||
Image,
|
||||
self.img_left_depth_callback,
|
||||
queue_size=1000,
|
||||
tcp_nodelay=True,
|
||||
)
|
||||
rospy.Subscriber(
|
||||
self.args.img_right_depth_topic,
|
||||
Image,
|
||||
self.img_right_depth_callback,
|
||||
queue_size=1000,
|
||||
tcp_nodelay=True,
|
||||
)
|
||||
rospy.Subscriber(
|
||||
self.args.img_front_depth_topic,
|
||||
Image,
|
||||
self.img_front_depth_callback,
|
||||
queue_size=1000,
|
||||
tcp_nodelay=True,
|
||||
)
|
||||
rospy.Subscriber(
|
||||
self.args.puppet_arm_left_topic,
|
||||
JointState,
|
||||
self.puppet_arm_left_callback,
|
||||
queue_size=1000,
|
||||
tcp_nodelay=True,
|
||||
)
|
||||
rospy.Subscriber(
|
||||
self.args.puppet_arm_right_topic,
|
||||
JointState,
|
||||
self.puppet_arm_right_callback,
|
||||
queue_size=1000,
|
||||
tcp_nodelay=True,
|
||||
)
|
||||
rospy.Subscriber(
|
||||
self.args.robot_base_topic,
|
||||
Odometry,
|
||||
self.robot_base_callback,
|
||||
queue_size=1000,
|
||||
tcp_nodelay=True,
|
||||
)
|
||||
self.puppet_arm_left_publisher = rospy.Publisher(self.args.puppet_arm_left_cmd_topic, JointState, queue_size=10)
|
||||
self.puppet_arm_right_publisher = rospy.Publisher(self.args.puppet_arm_right_cmd_topic,
|
||||
JointState,
|
||||
queue_size=10)
|
||||
self.robot_base_publisher = rospy.Publisher(self.args.robot_base_cmd_topic, Twist, queue_size=10)
|
||||
|
||||
|
||||
def get_arguments():
    """Parse command-line arguments for the RDT inference deployment script.

    Returns:
        argparse.Namespace carrying ROS topic names, publish/control rates,
        model paths, and debugging flags.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--max_publish_step",
        action="store",
        type=int,
        help="Maximum number of action publishing steps",
        default=10000,
        required=False,
    )
    parser.add_argument(
        "--seed",
        action="store",
        type=int,
        help="Random seed",
        default=None,
        required=False,
    )

    # Camera RGB topics.
    parser.add_argument(
        "--img_front_topic",
        action="store",
        type=str,
        help="img_front_topic",
        default="/camera_f/color/image_raw",
        required=False,
    )
    parser.add_argument(
        "--img_left_topic",
        action="store",
        type=str,
        help="img_left_topic",
        default="/camera_l/color/image_raw",
        required=False,
    )
    parser.add_argument(
        "--img_right_topic",
        action="store",
        type=str,
        help="img_right_topic",
        default="/camera_r/color/image_raw",
        required=False,
    )

    # Camera depth topics (used only with --use_depth_image).
    parser.add_argument(
        "--img_front_depth_topic",
        action="store",
        type=str,
        help="img_front_depth_topic",
        default="/camera_f/depth/image_raw",
        required=False,
    )
    parser.add_argument(
        "--img_left_depth_topic",
        action="store",
        type=str,
        help="img_left_depth_topic",
        default="/camera_l/depth/image_raw",
        required=False,
    )
    parser.add_argument(
        "--img_right_depth_topic",
        action="store",
        type=str,
        help="img_right_depth_topic",
        default="/camera_r/depth/image_raw",
        required=False,
    )

    # Arm command and state topics.
    parser.add_argument(
        "--puppet_arm_left_cmd_topic",
        action="store",
        type=str,
        help="puppet_arm_left_cmd_topic",
        default="/master/joint_left",
        required=False,
    )
    parser.add_argument(
        "--puppet_arm_right_cmd_topic",
        action="store",
        type=str,
        help="puppet_arm_right_cmd_topic",
        default="/master/joint_right",
        required=False,
    )
    parser.add_argument(
        "--puppet_arm_left_topic",
        action="store",
        type=str,
        help="puppet_arm_left_topic",
        default="/puppet/joint_left",
        required=False,
    )
    parser.add_argument(
        "--puppet_arm_right_topic",
        action="store",
        type=str,
        help="puppet_arm_right_topic",
        default="/puppet/joint_right",
        required=False,
    )

    # Mobile-base topics.
    parser.add_argument(
        "--robot_base_topic",
        action="store",
        type=str,
        help="robot_base_topic",
        default="/odom_raw",
        required=False,
    )
    parser.add_argument(
        "--robot_base_cmd_topic",
        action="store",
        type=str,
        # Fixed copy-paste error: help previously said "robot_base_topic".
        help="robot_base_cmd_topic",
        default="/cmd_vel",
        required=False,
    )
    parser.add_argument(
        "--use_robot_base",
        action="store_true",
        help="Whether to use the robot base to move around",
        default=False,
        required=False,
    )
    parser.add_argument(
        "--publish_rate",
        action="store",
        type=int,
        help="The rate at which to publish the actions",
        default=30,
        required=False,
    )
    parser.add_argument(
        "--ctrl_freq",
        action="store",
        type=int,
        help="The control frequency of the robot",
        default=25,
        required=False,
    )

    parser.add_argument(
        "--chunk_size",
        action="store",
        type=int,
        help="Action chunk size",
        default=64,
        required=False,
    )
    parser.add_argument(
        "--arm_steps_length",
        action="store",
        type=float,
        # nargs="+" so the CLI yields a per-joint list. Without it,
        # passing the flag produced a single float that replaced the
        # 7-element default and broke per-joint indexing downstream.
        nargs="+",
        help="The maximum change allowed for each joint per timestep",
        default=[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.2],
        required=False,
    )

    parser.add_argument(
        "--use_actions_interpolation",
        action="store_true",
        help="Whether to interpolate the actions if the difference is too large",
        default=False,
        required=False,
    )
    parser.add_argument(
        "--use_depth_image",
        action="store_true",
        help="Whether to use depth images",
        default=False,
        required=False,
    )

    parser.add_argument(
        "--disable_puppet_arm",
        action="store_true",
        help="Whether to disable the puppet arm. This is useful for safely debugging",
        default=False,
    )

    parser.add_argument(
        "--config_path",
        type=str,
        default="configs/base.yaml",
        help="Path to the config file",
    )
    # parser.add_argument('--cfg_scale', type=float, default=2.0,
    #                     help='the scaling factor used to modify the magnitude of the control features during denoising')
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        required=True,
        help="Name or path to the pretrained model",
    )

    parser.add_argument(
        "--lang_embeddings_path",
        type=str,
        required=True,
        help="Path to the pre-encoded language instruction embeddings",
    )

    args = parser.parse_args()
    return args
|
||||
|
||||
|
||||
def main():
    """Entry point: parse CLI flags, bring up ROS, then run inference."""
    cli_args = get_arguments()
    operator = RosOperator(cli_args)
    if cli_args.seed is not None:
        set_seed(cli_args.seed)
    config = get_config(cli_args)
    model_inference(cli_args, config, operator)


if __name__ == "__main__":
    main()
|
||||
315
RDT/rdt-export/scripts/agilex_model.py
Normal file
315
RDT/rdt-export/scripts/agilex_model.py
Normal file
@ -0,0 +1,315 @@
|
||||
import os, sys
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from torchvision import transforms
|
||||
|
||||
from configs.state_vec import STATE_VEC_IDX_MAPPING
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
# get current workspace
|
||||
current_file = Path(__file__)
|
||||
sys.path.append(os.path.join(current_file.parent.parent, "models"))
|
||||
sys.path.append(os.path.join(current_file.parent.parent, "models"))
|
||||
|
||||
from multimodal_encoder.siglip_encoder import SiglipVisionTower
|
||||
from multimodal_encoder.t5_encoder import T5Embedder
|
||||
from rdt_runner import RDTRunner
|
||||
|
||||
# Indices into the unified RDT state vector used by this robot's state/action.
# NOTE(review): only six right-arm joint positions are mapped here, while
# docstrings below describe a 14-dim qpos — confirm this restriction to six
# right-arm joints is intentional for this deployment.
AGILEX_STATE_INDICES = [
    STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"] for i in range(6)
]
|
||||
|
||||
# Create the RDT model
def create_model(args, **kwargs):
    """Build a RoboticDiffusionTransformerModel and, if ``pretrained`` in
    *kwargs* names an existing checkpoint file, load its weights."""
    wrapper = RoboticDiffusionTransformerModel(args, **kwargs)
    checkpoint = kwargs.get("pretrained", None)
    # Directory-style checkpoints are handled inside the wrapper itself;
    # only a plain file needs an explicit weight load here.
    if checkpoint is not None and os.path.isfile(checkpoint):
        wrapper.load_pretrained_weights(checkpoint)
    return wrapper
|
||||
|
||||
|
||||
class RoboticDiffusionTransformerModel(object):
    """A wrapper for the RDT model, which handles
    1. Model initialization
    2. Encodings of instructions
    3. Model inference
    """

    def __init__(
        self,
        args,
        device="cuda",
        dtype=torch.bfloat16,
        image_size=None,
        control_frequency=25,
        pretrained=None,
        pretrained_vision_encoder_name_or_path=None,
    ):
        # args: configuration dict with "common", "model" and "dataset"
        # sections (see uses below).
        self.args = args
        self.dtype = dtype
        self.image_size = image_size
        self.device = device
        self.control_frequency = control_frequency
        # We do not use the text encoder due to limited GPU memory
        # self.text_tokenizer, self.text_model = self.get_text_encoder(pretrained_text_encoder_name_or_path)
        self.image_processor, self.vision_model = self.get_vision_encoder(pretrained_vision_encoder_name_or_path)
        self.policy = self.get_policy(pretrained)

        self.reset()

    def get_policy(self, pretrained):
        """Initialize the model.

        When ``pretrained`` is None or a plain checkpoint file, a fresh
        RDTRunner is built from the config (weights loaded separately via
        load_pretrained_weights); otherwise ``pretrained`` is treated as a
        from_pretrained-style directory/name.
        """
        # Initialize model with arguments
        if pretrained is None or os.path.isfile(pretrained):
            # Image-condition length: history x cameras x ViT patches.
            img_cond_len = (self.args["common"]["img_history_size"] * self.args["common"]["num_cameras"] *
                            self.vision_model.num_patches)

            _model = RDTRunner(
                action_dim=self.args["common"]["state_dim"],
                pred_horizon=self.args["common"]["action_chunk_size"],
                config=self.args["model"],
                lang_token_dim=self.args["model"]["lang_token_dim"],
                img_token_dim=self.args["model"]["img_token_dim"],
                state_token_dim=self.args["model"]["state_token_dim"],
                max_lang_cond_len=self.args["dataset"]["tokenizer_max_length"],
                img_cond_len=img_cond_len,
                img_pos_embed_config=[
                    # No initial pos embed in the last grid size
                    # since we've already done in ViT
                    (
                        "image",
                        (
                            self.args["common"]["img_history_size"],
                            self.args["common"]["num_cameras"],
                            # Negative size presumably flags "no initial
                            # pos embed" per the note above — confirm
                            # against RDTRunner's embedding config docs.
                            -self.vision_model.num_patches,
                        ),
                    ),
                ],
                lang_pos_embed_config=[
                    # Similarly, no initial pos embed for language
                    ("lang", -self.args["dataset"]["tokenizer_max_length"]),
                ],
                dtype=self.dtype,
            )
        else:
            _model = RDTRunner.from_pretrained(pretrained)

        return _model

    def get_text_encoder(self, pretrained_text_encoder_name_or_path):
        """Build the T5 tokenizer/encoder pair (unused by default — see
        the commented-out call in __init__)."""
        text_embedder = T5Embedder(
            from_pretrained=pretrained_text_encoder_name_or_path,
            model_max_length=self.args["dataset"]["tokenizer_max_length"],
            device=self.device,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
        return tokenizer, text_encoder

    def get_vision_encoder(self, pretrained_vision_encoder_name_or_path):
        """Build the SigLIP vision tower and its image processor."""
        vision_encoder = SiglipVisionTower(vision_tower=pretrained_vision_encoder_name_or_path, args=None)
        image_processor = vision_encoder.image_processor
        return image_processor, vision_encoder

    def reset(self):
        """Set model to evaluation mode and move weights to the target
        device/dtype."""
        device = self.device
        weight_dtype = self.dtype
        self.policy.eval()
        # self.text_model.eval()
        self.vision_model.eval()

        self.policy = self.policy.to(device, dtype=weight_dtype)
        # self.text_model = self.text_model.to(device, dtype=weight_dtype)
        self.vision_model = self.vision_model.to(device, dtype=weight_dtype)

    def load_pretrained_weights(self, pretrained=None):
        """Load policy weights from a .pt (DeepSpeed-style "module" key)
        or .safetensors checkpoint; no-op when pretrained is None.

        Raises:
            NotImplementedError: for any other file extension.
        """
        if pretrained is None:
            return
        print(f"Loading weights from {pretrained}")
        filename = os.path.basename(pretrained)
        if filename.endswith(".pt"):
            checkpoint = torch.load(pretrained)
            self.policy.load_state_dict(checkpoint["module"])
        elif filename.endswith(".safetensors"):
            from safetensors.torch import load_model

            load_model(self.policy, pretrained)
        else:
            raise NotImplementedError(f"Unknown checkpoint format: {pretrained}")

    def encode_instruction(self, instruction, device="cuda"):
        """Encode string instruction to latent embeddings.

        NOTE(review): this method reads self.text_tokenizer/self.text_model,
        which are never set because the text-encoder line in __init__ is
        commented out — calling it as-is would raise AttributeError. Confirm
        whether it is meant to be used only after wiring get_text_encoder().

        Args:
            instruction: a string of instruction
            device: a string of device

        Returns:
            pred: a tensor of latent embeddings of shape (text_max_length, 512)
        """
        tokens = self.text_tokenizer(instruction, return_tensors="pt", padding="longest",
                                     truncation=True)["input_ids"].to(device)

        tokens = tokens.view(1, -1)
        with torch.no_grad():
            pred = self.text_model(tokens).last_hidden_state.detach()

        return pred

    def _format_joint_to_state(self, joints):
        """
        Format the joint proprioception into the unified action vector.

        NOTE(review): the docstring below says qpos is [B, N, 14], but the
        rescale tensor and AGILEX_STATE_INDICES both cover only 6 joints —
        confirm which is correct for this deployment.

        Args:
            joints (torch.Tensor): The joint proprioception to be formatted.
                qpos ([B, N, 14]).

        Returns:
            state (torch.Tensor): The formatted vector for RDT ([B, N, 128]).
        """
        # Rescale the gripper to the range of [0, 1]
        joints = joints / torch.tensor(
            [[[180, 180, 180, 180, 180, 180]]],
            device=joints.device,
            dtype=joints.dtype,
        )

        B, N, _ = joints.shape
        state = torch.zeros(
            (B, N, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        # Fill into the unified state vector
        state[:, :, AGILEX_STATE_INDICES] = joints
        # Assemble the mask indicating each dimension's availability
        state_elem_mask = torch.zeros(
            (B, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        state_elem_mask[:, AGILEX_STATE_INDICES] = 1
        return state, state_elem_mask

    def _unformat_action_to_joint(self, action):
        """
        Unformat the unified action vector into the joint action to be executed.

        Args:
            action (torch.Tensor): The unified action vector to be unformatted.
                ([B, N, 128])

        Returns:
            joints (torch.Tensor): The unformatted robot joint action.
                qpos ([B, N, 14]).
        """
        action_indices = AGILEX_STATE_INDICES
        joints = action[:, :, action_indices]

        # Rescale the gripper back to the action range
        # Note that the action range and proprioception range are different
        # for Mobile ALOHA robot
        joints = joints * torch.tensor(
            [[[180, 180, 180, 180, 180, 180]]],
            device=joints.device,
            dtype=joints.dtype,
        )

        return joints

    @torch.no_grad()
    def step(self, proprio, images, text_embeds):
        """
        Predict the next action chunk given the
        proprioceptive states, images, and instruction embeddings.

        Args:
            proprio: proprioceptive states
            images: RGB images, the order should be
                [ext_{t-1}, right_wrist_{t-1}, left_wrist_{t-1},
                ext_{t}, right_wrist_{t}, left_wrist_{t}]
            text_embeds: instruction embeddings

        Returns:
            action: predicted action
        """
        device = self.device
        dtype = self.dtype

        # The background image used for padding
        background_color = np.array([int(x * 255) for x in self.image_processor.image_mean],
                                    dtype=np.uint8).reshape(1, 1, 3)
        background_image = (np.ones(
            (
                self.image_processor.size["height"],
                self.image_processor.size["width"],
                3,
            ),
            dtype=np.uint8,
        ) * background_color)

        # Preprocess the images by order and encode them
        image_tensor_list = []
        for image in images:
            if image is None:
                # Replace it with the background image
                image = Image.fromarray(background_image)

            if self.image_size is not None:
                # NOTE(review): self.data_args is never defined on this
                # class — reaching this branch raises AttributeError.
                # Likely should be self.image_size; confirm and fix.
                image = transforms.Resize(self.data_args.image_size)(image)

            if self.args["dataset"].get("auto_adjust_image_brightness", False):
                pixel_values = list(image.getdata())
                average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
                # Brighten very dark frames.
                if average_brightness <= 0.15:
                    image = transforms.ColorJitter(brightness=(1.75, 1.75))(image)

            if self.args["dataset"].get("image_aspect_ratio", "pad") == "pad":

                def expand2square(pil_img, background_color):
                    # Pad the shorter side so the image becomes square,
                    # centering the original content.
                    width, height = pil_img.size
                    if width == height:
                        return pil_img
                    elif width > height:
                        result = Image.new(pil_img.mode, (width, width), background_color)
                        result.paste(pil_img, (0, (width - height) // 2))
                        return result
                    else:
                        result = Image.new(pil_img.mode, (height, height), background_color)
                        result.paste(pil_img, ((height - width) // 2, 0))
                        return result

                image = expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean))
            image = self.image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
            image_tensor_list.append(image)

        image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype)

        image_embeds = self.vision_model(image_tensor).detach()
        image_embeds = image_embeds.reshape(-1, self.vision_model.hidden_size).unsqueeze(0)

        # Prepare the proprioception states and the control frequency
        joints = proprio.to(device).unsqueeze(0)  # (1, 1, 14)
        states, state_elem_mask = self._format_joint_to_state(joints)  # (1, 1, 128), (1, 128)
        states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to(device, dtype=dtype)
        states = states[:, -1:, :]  # (1, 1, 128)
        ctrl_freqs = torch.tensor([self.control_frequency]).to(device)

        text_embeds = text_embeds.to(device, dtype=dtype)

        # Predict the next action chunk given the inputs
        trajectory = self.policy.predict_action(
            lang_tokens=text_embeds,
            lang_attn_mask=torch.ones(text_embeds.shape[:2], dtype=torch.bool, device=text_embeds.device),
            img_tokens=image_embeds,
            state_tokens=states,
            action_mask=state_elem_mask.unsqueeze(1),
            ctrl_freqs=ctrl_freqs,
        )
        trajectory = self._unformat_action_to_joint(trajectory).to(torch.float32)

        return trajectory
|
||||
53
RDT/rdt-export/scripts/encode_lang.py
Normal file
53
RDT/rdt-export/scripts/encode_lang.py
Normal file
@ -0,0 +1,53 @@
|
||||
import os
|
||||
|
||||
import torch
|
||||
import yaml
|
||||
|
||||
from models.multimodal_encoder.t5_encoder import T5Embedder
|
||||
|
||||
# Script configuration: GPU index, T5 model, config file, and output dir.
GPU = 0
MODEL_PATH = "google/t5-v1_1-xxl"
CONFIG_PATH = "configs/base.yaml"
SAVE_DIR = "outs/"

# Modify this to your task name and instruction
# NOTE(review): TASK_NAME ("handover_pan") does not obviously match the
# INSTRUCTION text (marker into a box) — confirm the pairing is intended.
TASK_NAME = "handover_pan"
INSTRUCTION = "Pick up the black marker on the right and put it into the packaging box on the left."

# Note: if your GPU VRAM is less than 24GB,
# it is recommended to enable offloading by specifying an offload directory.
OFFLOAD_DIR = (
    None  # Specify your offload directory here, ensuring the directory exists.
)
|
||||
|
||||
|
||||
def main():
    """Encode INSTRUCTION with the T5 text encoder and save the embeddings.

    Reads the tokenizer length limit from CONFIG_PATH, runs the frozen T5
    encoder on GPU `GPU`, and stores a dict holding the task name, the raw
    instruction, and the (1, seq_len, hidden) embedding tensor at
    SAVE_DIR/<TASK_NAME>.pt.
    """
    with open(CONFIG_PATH, "r") as fp:
        config = yaml.safe_load(fp)

    device = torch.device(f"cuda:{GPU}")
    text_embedder = T5Embedder(
        from_pretrained=MODEL_PATH,
        model_max_length=config["dataset"]["tokenizer_max_length"],
        device=device,
        use_offload_folder=OFFLOAD_DIR,
    )
    tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model

    tokens = tokenizer(INSTRUCTION, return_tensors="pt", padding="longest", truncation=True)["input_ids"].to(device)

    tokens = tokens.view(1, -1)
    with torch.no_grad():
        pred = text_encoder(tokens).last_hidden_state.detach().cpu()

    # Fix: SAVE_DIR may not exist on a fresh checkout — torch.save would fail.
    os.makedirs(SAVE_DIR, exist_ok=True)
    save_path = os.path.join(SAVE_DIR, f"{TASK_NAME}.pt")
    # We save the embeddings in a dictionary format
    torch.save({"name": TASK_NAME, "instruction": INSTRUCTION, "embeddings": pred}, save_path)

    print(
        f'"{INSTRUCTION}" from "{TASK_NAME}" is encoded by "{MODEL_PATH}" into shape {pred.shape} and saved to "{save_path}"'
    )


if __name__ == "__main__":
    main()
|
||||
57
RDT/rdt-export/scripts/encode_lang_batch_once.py
Normal file
57
RDT/rdt-export/scripts/encode_lang_batch_once.py
Normal file
@ -0,0 +1,57 @@
|
||||
import os
|
||||
import json
|
||||
import argparse
|
||||
import torch
|
||||
import yaml
|
||||
from tqdm import tqdm
|
||||
|
||||
from models.multimodal_encoder.t5_encoder import T5Embedder
|
||||
|
||||
|
||||
def encode_lang(
    DATA_FILE_PATH,
    TARGET_DIR,
    GPU,
    desc_type="seen",
    tokenizer=None,
    text_encoder=None,
):
    """Encode one episode's language instructions with T5 and save them.

    Args:
        DATA_FILE_PATH: JSON file with instruction lists keyed by description type.
        TARGET_DIR: episode directory; embeddings go to <TARGET_DIR>/instructions/.
        GPU: CUDA device index to run the encoder on.
        desc_type: which instruction list to encode (e.g. "seen").
        tokenizer, text_encoder: optionally reuse an already-loaded T5 pair so
            repeated calls skip reloading the (very large) model.

    Returns:
        (tokenizer, text_encoder) so callers can pass them back in next call.
    """
    current_dir = os.path.dirname(__file__)
    device = torch.device(f"cuda:{GPU}")

    if tokenizer is None or text_encoder is None:
        # Fix: only read the config and build the embedder when we actually
        # need them — callers that pass the encoder in no longer require
        # ../configs/base.yaml to exist.
        with open(os.path.join(current_dir, "../configs/base.yaml"), "r") as fp:
            config = yaml.safe_load(fp)
        text_embedder = T5Embedder(
            from_pretrained=os.path.join(current_dir, "../../weights/RDT/t5-v1_1-xxl"),
            model_max_length=config["dataset"]["tokenizer_max_length"],
            device=device,
            use_offload_folder=None,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model

    with open(DATA_FILE_PATH, "r") as f_instr:
        instruction_dict = json.load(f_instr)

    instructions = instruction_dict[desc_type]

    # Encode the instructions as a single padded batch
    tokenized_res = tokenizer(instructions, return_tensors="pt", padding="longest", truncation=True)
    tokens = tokenized_res["input_ids"].to(device)
    attn_mask = tokenized_res["attention_mask"].to(device)

    with torch.no_grad():
        text_embeds = (text_encoder(input_ids=tokens, attention_mask=attn_mask)["last_hidden_state"].detach().cpu())

    attn_mask = attn_mask.cpu().bool()
    # exist_ok avoids the check-then-create race of the original code.
    os.makedirs(f"{TARGET_DIR}/instructions", exist_ok=True)
    # Save the embeddings for training use, one file per instruction,
    # with padding positions stripped via the attention mask.
    for i in range(len(instructions)):
        text_embed = text_embeds[i][attn_mask[i]]
        save_path = os.path.join(TARGET_DIR, f"instructions/lang_embed_{i}.pt")
        torch.save(text_embed, save_path)

    return tokenizer, text_encoder
|
||||
84
RDT/rdt-export/scripts/generate_output_json.py
Normal file
84
RDT/rdt-export/scripts/generate_output_json.py
Normal file
@ -0,0 +1,84 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
|
||||
def extract_metrics_from_log(log_file_path):
    """Scan a training log for metric dicts and return the best of each metric.

    Each matching line contributes a 4-tuple
    (agilex_sample_mse, agilex_sample_l2err,
     overall_avg_sample_mse, overall_avg_sample_l2err);
    the column-wise minima are returned. Returns (None, None, None, None)
    when the file cannot be read or contains no matching lines.
    """
    metric_re = re.compile(
        r"\{'agilex_sample_mse':\s*([0-9.eE+-]+),\s*'agilex_sample_l2err':\s*([0-9.eE+-]+),\s*'overall_avg_sample_mse':\s*([0-9.eE+-]+),\s*'overall_avg_sample_l2err':\s*([0-9.eE+-]+)\}"
    )
    all_metrics = []
    try:
        with open(log_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                match = metric_re.search(line)
                if match is None:
                    continue
                metrics = tuple(float(match.group(g)) for g in range(1, 5))
                all_metrics.append(metrics)
                print(f"Find Metrics: agilex_sample_mse={metrics[0]}, agilex_sample_l2err={metrics[1]}, "
                      f"overall_avg_sample_mse={metrics[2]}, overall_avg_sample_l2err={metrics[3]}")
    except Exception as e:
        print(f"Failed to read log: {e}")
        return (None, None, None, None)

    if not all_metrics:
        print("No metrics found in the log file")
        return (None, None, None, None)

    print(f"\nTotal {len(all_metrics)} metrics found in the log file")

    # Column-wise minimum over every parsed tuple.
    best = tuple(min(column) for column in zip(*all_metrics))

    print(f"\nBest metrics:")
    print(f"  agilex_sample_mse: {best[0]}")
    print(f"  agilex_sample_l2err: {best[1]}")
    print(f"  overall_avg_sample_mse: {best[2]}")
    print(f"  overall_avg_sample_l2err: {best[3]}")

    return best
||||
|
||||
def generate_output_json(input_config_file, output_dir, runtime):
    """Write <output_dir>/output.json summarizing a finished training run.

    Combines the input task config with the best metrics parsed from the
    run's output.log; missing metrics are serialized as null.
    """
    with open(input_config_file, 'r') as f:
        config = json.load(f)

    log_file = os.path.join(output_dir, 'output.log')
    metrics = extract_metrics_from_log(log_file)
    agilex_sample_mse, agilex_sample_l2err, overall_avg_sample_mse, overall_avg_sample_l2err = metrics

    if None in metrics:
        print("Warning: Some metrics are missing in the log file.")

    # Prefer a top-level "model_name"; otherwise fall back to train.model.
    if "model_name" in config:
        model_name = config.get("model_name")
    else:
        model_name = config.get("train", {}).get("model")

    output_json = {
        "task_id": config.get("task_id"),
        "model_type": "RDT-170M",
        "model_name": model_name,
        "gpu_id": config.get("gpu_id"),
        "runtime": runtime,
        "log_path": log_file,
        "output_dir": output_dir,
        "model_path": os.path.join(output_dir, 'pytorch_model.bin'),
        "metrics": {
            "agilex_sample_mse": agilex_sample_mse,
            "agilex_sample_l2err": agilex_sample_l2err,
            "overall_avg_sample_mse": overall_avg_sample_mse,
            "overall_avg_sample_l2err": overall_avg_sample_l2err
        }
    }

    # Write output.json pretty-printed; ensure_ascii=False keeps any
    # non-ASCII text readable, and json.dump renders missing values as null.
    output_json_path = os.path.join(output_dir, 'output.json')
    with open(output_json_path, 'w') as f:
        json.dump(output_json, f, indent=4, ensure_ascii=False)
||||
if __name__ == "__main__":
    # CLI entry point: python generate_output_json.py <input_config_file> <output_dir> <runtime>
    if len(sys.argv) != 4:
        print("Usage: python generate_output_json.py <input_config_file> <output_dir> <runtime>")
        sys.exit(1)
    generate_output_json(sys.argv[1], sys.argv[2], sys.argv[3])
|
||||
325
RDT/rdt-export/scripts/maniskill_model.py
Normal file
325
RDT/rdt-export/scripts/maniskill_model.py
Normal file
@ -0,0 +1,325 @@
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from torchvision import transforms
|
||||
|
||||
from configs.state_vec import STATE_VEC_IDX_MAPPING
|
||||
from models.multimodal_encoder.siglip_encoder import SiglipVisionTower
|
||||
from models.multimodal_encoder.t5_encoder import T5Embedder
|
||||
from models.rdt_runner import RDTRunner
|
||||
|
||||
# Indices into the 128-dim unified RDT state vector used for ManiSkill:
# the 7 right-arm joint positions followed by the right-gripper open degree.
MANISKILL_INDICES = [STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"]
                     for i in range(7)] + [STATE_VEC_IDX_MAPPING[f"right_gripper_open"]]
|
||||
|
||||
def create_model(args, pretrained, **kwargs):
    """Build a RoboticDiffusionTransformerModel; optionally load a checkpoint."""
    wrapper = RoboticDiffusionTransformerModel(args, **kwargs)
    if pretrained is not None:
        wrapper.load_pretrained_weights(pretrained)
    return wrapper
||||
|
||||
|
||||
# Per-dimension min/max statistics of the 8-dim ManiSkill state/action
# (7 arm joints + gripper), used to min-max normalize values into [-1, 1]
# before the policy and to denormalize its predictions afterwards.
DATA_STAT = {
    "state_min": [
        -0.7463043928146362,
        -0.0801204964518547,
        -0.4976441562175751,
        -2.657780647277832,
        -0.5742632150650024,
        1.8309762477874756,
        -2.2423808574676514,
        0.0,
    ],
    "state_max": [
        0.7645499110221863,
        1.4967026710510254,
        0.4650936424732208,
        -0.3866899907588959,
        0.5505855679512024,
        3.2900545597076416,
        2.5737812519073486,
        0.03999999910593033,
    ],
    "action_min": [
        -0.7472005486488342,
        -0.08631071448326111,
        -0.4995281398296356,
        -2.658363103866577,
        -0.5751323103904724,
        1.8290787935256958,
        -2.245187997817993,
        -1.0,
    ],
    "action_max": [
        0.7654682397842407,
        1.4984270334243774,
        0.46786263585090637,
        -0.38181185722351074,
        0.5517147779464722,
        3.291581630706787,
        2.575840711593628,
        1.0,
    ],
}
||||
|
||||
|
||||
class RoboticDiffusionTransformerModel(object):
    """A wrapper for the RDT model, which handles
    1. Model initialization
    2. Encodings of instructions
    3. Model inference
    """

    def __init__(
        self,
        args,
        device="cuda",
        dtype=torch.bfloat16,
        image_size=None,
        control_frequency=25,
        pretrained_text_encoder_name_or_path=None,
        pretrained_vision_encoder_name_or_path=None,
    ):
        # args is the parsed base.yaml config (common/model/dataset sections).
        self.args = args
        self.dtype = dtype
        self.image_size = image_size
        self.device = device
        self.control_frequency = control_frequency
        self.text_tokenizer, self.text_model = self.get_text_encoder(pretrained_text_encoder_name_or_path)
        self.image_processor, self.vision_model = self.get_vision_encoder(pretrained_vision_encoder_name_or_path)
        self.policy = self.get_policy()

        # Dataset statistics used to min-max normalize states/actions to [-1, 1].
        self.state_min = torch.tensor(DATA_STAT["state_min"]).to(device)
        self.state_max = torch.tensor(DATA_STAT["state_max"]).to(device)
        self.action_min = torch.tensor(DATA_STAT["action_min"]).to(device)
        self.action_max = torch.tensor(DATA_STAT["action_max"]).to(device)

        self.reset()

    def get_policy(self):
        """Initialize the RDT diffusion-policy network."""
        # Total image-condition tokens: history size x cameras x ViT patches.
        img_cond_len = (self.args["common"]["img_history_size"] * self.args["common"]["num_cameras"] *
                        self.vision_model.num_patches)

        _model = RDTRunner(
            action_dim=self.args["common"]["state_dim"],
            pred_horizon=self.args["common"]["action_chunk_size"],
            config=self.args["model"],
            lang_token_dim=self.args["model"]["lang_token_dim"],
            img_token_dim=self.args["model"]["img_token_dim"],
            state_token_dim=self.args["model"]["state_token_dim"],
            max_lang_cond_len=self.args["dataset"]["tokenizer_max_length"],
            img_cond_len=img_cond_len,
            img_pos_embed_config=[
                # No initial pos embed in the last grid size
                # since we've already done in ViT
                (
                    "image",
                    (
                        self.args["common"]["img_history_size"],
                        self.args["common"]["num_cameras"],
                        -self.vision_model.num_patches,
                    ),
                ),
            ],
            lang_pos_embed_config=[
                # Similarly, no initial pos embed for language
                ("lang", -self.args["dataset"]["tokenizer_max_length"]),
            ],
            dtype=self.dtype,
        )

        return _model

    def get_text_encoder(self, pretrained_text_encoder_name_or_path):
        """Load the T5 tokenizer/encoder pair used for language conditioning."""
        text_embedder = T5Embedder(
            from_pretrained=pretrained_text_encoder_name_or_path,
            model_max_length=self.args["dataset"]["tokenizer_max_length"],
            device=self.device,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
        return tokenizer, text_encoder

    def get_vision_encoder(self, pretrained_vision_encoder_name_or_path):
        """Load the SigLIP vision tower and its image preprocessor."""
        vision_encoder = SiglipVisionTower(vision_tower=pretrained_vision_encoder_name_or_path, args=None)
        image_processor = vision_encoder.image_processor
        return image_processor, vision_encoder

    def reset(self):
        """Set model to evaluation mode and move submodules to device/dtype."""
        device = self.device
        weight_dtype = self.dtype
        self.policy.eval()
        self.text_model.eval()
        self.vision_model.eval()

        self.policy = self.policy.to(device, dtype=weight_dtype)
        self.text_model = self.text_model.to(device, dtype=weight_dtype)
        self.vision_model = self.vision_model.to(device, dtype=weight_dtype)

    def load_pretrained_weights(self, pretrained=None):
        """Load policy weights from a .pt (DeepSpeed-style dict with a
        'module' key) or a .safetensors checkpoint."""
        if pretrained is None:
            return
        print(f"Loading weights from {pretrained}")
        filename = os.path.basename(pretrained)
        if filename.endswith(".pt"):
            checkpoint = torch.load(pretrained)
            self.policy.load_state_dict(checkpoint["module"])
        elif filename.endswith(".safetensors"):
            from safetensors.torch import load_model

            load_model(self.policy, pretrained)
        else:
            raise NotImplementedError(f"Unknown checkpoint format: {pretrained}")

    def encode_instruction(self, instruction, device="cuda"):
        """Encode string instruction to latent embeddings.

        Args:
            instruction: a string of instruction
            device: a string of device

        Returns:
            pred: a tensor of latent embeddings of shape
                (1, seq_len, hidden) — hidden size is determined by the
                loaded T5 model (TODO confirm; the original doc said 512).
        """
        tokens = self.text_tokenizer(instruction, return_tensors="pt", padding="longest",
                                     truncation=True)["input_ids"].to(device)

        tokens = tokens.view(1, -1)
        with torch.no_grad():
            pred = self.text_model(tokens).last_hidden_state.detach()

        return pred

    def _format_joint_to_state(self, joints):
        """
        Format the robot joint state into the unified state vector.

        Args:
            joints (torch.Tensor): The joint state to be formatted,
                shape [B, N, 8] for ManiSkill (7 arm joints + gripper;
                the original docstring said 14, which matches the bimanual
                agilex layout, not MANISKILL_INDICES).

        Returns:
            state (torch.Tensor): The formatted state for RDT ([B, N, 128]).
            state_elem_mask (torch.Tensor): [B, 128] mask marking which of
                the 128 unified dims are populated.
        """
        # normalize to [-1, 1] with the dataset statistics
        joints = (joints - self.state_min) / (self.state_max - self.state_min) * 2 - 1
        B, N, _ = joints.shape
        state = torch.zeros(
            (B, N, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        # assemble the unified state vector
        state[:, :, MANISKILL_INDICES] = joints
        state_elem_mask = torch.zeros(
            (B, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        state_elem_mask[:, MANISKILL_INDICES] = 1
        return state, state_elem_mask

    def _unformat_action_to_joint(self, action):
        """Extract the ManiSkill dims from the 128-dim action and denormalize."""
        action_indices = MANISKILL_INDICES
        joints = action[:, :, action_indices]

        # denormalize from [-1, 1] back to the action space
        joints = (joints + 1) / 2 * (self.action_max - self.action_min) + self.action_min

        return joints

    @torch.no_grad()
    def step(self, proprio, images, text_embeds):
        """
        Args:
            proprio: proprioceptive states — assumed (1, 8) so that
                unsqueeze yields (1, 1, 8); TODO confirm caller shape
            images: RGB images (None entries are replaced by a background image)
            text_embeds: instruction embeddings, shape (1, L, lang_token_dim)

        Returns:
            action: predicted action chunk in joint space, float32
        """
        device = self.device
        dtype = self.dtype

        # Solid background in the processor's mean color for missing cameras.
        background_color = np.array([int(x * 255) for x in self.image_processor.image_mean],
                                    dtype=np.uint8).reshape(1, 1, 3)
        background_image = (np.ones(
            (
                self.image_processor.size["height"],
                self.image_processor.size["width"],
                3,
            ),
            dtype=np.uint8,
        ) * background_color)

        image_tensor_list = []
        for image in images:
            if image is None:
                # Replace it with the background image
                image = Image.fromarray(background_image)

            if self.image_size is not None:
                # Bug fix: the original read self.data_args.image_size, but
                # this wrapper never defines data_args — the configured size
                # lives on self.image_size.
                image = transforms.Resize(self.image_size)(image)

            if self.args["dataset"].get("auto_adjust_image_brightness", False):
                pixel_values = list(image.getdata())
                average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
                if average_brightness <= 0.15:
                    image = transforms.ColorJitter(brightness=(1.75, 1.75))(image)

            if self.args["dataset"].get("image_aspect_ratio", "pad") == "pad":

                def expand2square(pil_img, background_color):
                    # Pad the shorter side with the background color to square.
                    width, height = pil_img.size
                    if width == height:
                        return pil_img
                    elif width > height:
                        result = Image.new(pil_img.mode, (width, width), background_color)
                        result.paste(pil_img, (0, (width - height) // 2))
                        return result
                    else:
                        result = Image.new(pil_img.mode, (height, height), background_color)
                        result.paste(pil_img, ((height - width) // 2, 0))
                        return result

                image = expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean))
            image = self.image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
            image_tensor_list.append(image)

        image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype)

        image_embeds = self.vision_model(image_tensor).detach()
        image_embeds = image_embeds.reshape(-1, self.vision_model.hidden_size).unsqueeze(0)

        # Prepare the proprioception states and the control frequency;
        # keep only the most recent state token.
        joints = proprio.to(device).unsqueeze(0)
        states, state_elem_mask = self._format_joint_to_state(joints)  # (1, N, 128), (1, 128)
        states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to(device, dtype=dtype)
        states = states[:, -1:, :]  # (1, 1, 128)
        ctrl_freqs = torch.tensor([self.control_frequency]).to(device)

        text_embeds = text_embeds.to(device, dtype=dtype)

        # Predict the next action chunk given the inputs
        trajectory = self.policy.predict_action(
            lang_tokens=text_embeds,
            lang_attn_mask=torch.ones(text_embeds.shape[:2], dtype=torch.bool, device=text_embeds.device),
            img_tokens=image_embeds,
            state_tokens=states,
            action_mask=state_elem_mask.unsqueeze(1),
            ctrl_freqs=ctrl_freqs,
        )
        trajectory = self._unformat_action_to_joint(trajectory).to(torch.float32)

        return trajectory
||||
169
RDT/rdt-export/scripts/process_data.py
Normal file
169
RDT/rdt-export/scripts/process_data.py
Normal file
@ -0,0 +1,169 @@
|
||||
import sys
|
||||
|
||||
sys.path.append("./")
|
||||
|
||||
import os
|
||||
import h5py
|
||||
import numpy as np
|
||||
import pickle
|
||||
import cv2
|
||||
import argparse
|
||||
import yaml
|
||||
from scripts.encode_lang_batch_once import encode_lang
|
||||
|
||||
|
||||
def load_hdf5(dataset_path):
    """Load one raw episode from an HDF5 file.

    Returns:
        (left_gripper, left_arm, right_gripper, right_arm, image_dict)
        where image_dict maps camera name -> encoded RGB frames.
    """
    if not os.path.isfile(dataset_path):
        print(f"Dataset does not exist at \n{dataset_path}\n")
        # Fix: exit with a non-zero status so shell pipelines/CI notice the
        # failure (the original bare exit() exited with status 0).
        sys.exit(1)

    with h5py.File(dataset_path, "r") as root:
        left_gripper, left_arm = (
            root["/joint_action/left_gripper"][()],
            root["/joint_action/left_arm"][()],
        )
        right_gripper, right_arm = (
            root["/joint_action/right_gripper"][()],
            root["/joint_action/right_arm"][()],
        )
        image_dict = dict()
        for cam_name in root["/observation/"].keys():
            image_dict[cam_name] = root[f"/observation/{cam_name}/rgb"][()]

    return left_gripper, left_arm, right_gripper, right_arm, image_dict
||||
|
||||
|
||||
def images_encoding(imgs):
    """JPEG-encode a list of images and null-pad them to a common length.

    Args:
        imgs: list of HxWx3 uint8 arrays.

    Returns:
        (padded_data, max_len): list of bytes objects, each exactly max_len
        long, suitable for a fixed-width HDF5 dataset of dtype S{max_len}.
    """
    encode_data = []
    max_len = 0
    for img in imgs:
        success, encoded_image = cv2.imencode(".jpg", img)
        if not success:
            # Fix: the original ignored the success flag and would have
            # crashed later with an opaque AttributeError.
            raise ValueError("cv2.imencode failed to encode an image")
        jpeg_data = encoded_image.tobytes()
        encode_data.append(jpeg_data)
        max_len = max(max_len, len(jpeg_data))
    # padding: every frame padded with NULs to the longest frame
    padded_data = [buf.ljust(max_len, b"\0") for buf in encode_data]
    # Bug fix: the original computed padded_data but returned the unpadded
    # encode_data, leaving the padding loop dead code.
    return padded_data, max_len
||||
|
||||
|
||||
def get_task_config(task_name):
    """Load ./task_config/<task_name>.yml and return it as a dict."""
    with open(f"./task_config/{task_name}.yml", "r", encoding="utf-8") as f:
        # safe_load streams the file handle directly and refuses arbitrary
        # Python object construction (yaml.load + FullLoader equivalent for
        # plain config files, but the recommended idiom).
        args = yaml.safe_load(f)
    return args
||||
|
||||
|
||||
def data_transform(path, episode_num, save_path):
    """Convert the first `episode_num` raw episodes under `path` into the
    RDT training layout under `save_path`.

    For each episode: rebuild the 16-dim joint state (left arm + left
    gripper + right arm + right gripper — assuming 7-dof arms; TODO confirm),
    pair frame j's state (qpos) with frame j+1's state (action), resize the
    three camera streams to 640x480, and write everything to
    <save_path>/episode_<i>/episode_<i>.hdf5.

    Returns the number of episodes processed.
    """
    begin = 0
    # NOTE(review): "floders" is a typo for "folders"; kept as-is here since
    # this is a documentation-only pass.
    floders = os.listdir(path)
    assert episode_num <= len(floders), "data num not enough"

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    for i in range(episode_num):
        left_gripper_all, left_arm_all, right_gripper_all, right_arm_all, image_dict = (load_hdf5(
            os.path.join(path, f"episode{i}.hdf5")))
        qpos = []
        actions = []
        cam_high = []
        cam_right_wrist = []
        cam_left_wrist = []
        left_arm_dim = []
        right_arm_dim = []

        # NOTE(review): last_state is never used below.
        last_state = None
        for j in range(0, left_gripper_all.shape[0]):

            left_gripper, left_arm, right_gripper, right_arm = (
                left_gripper_all[j],
                left_arm_all[j],
                right_gripper_all[j],
                right_arm_all[j],
            )

            # Full joint vector: left arm, left gripper, right arm, right gripper.
            state = np.concatenate((left_arm, [left_gripper], right_arm, [right_gripper]), axis=0)  # joint
            state = state.astype(np.float32)

            # All frames except the last contribute an observation (qpos + images).
            if j != left_gripper_all.shape[0] - 1:
                qpos.append(state)

                camera_high_bits = image_dict["head_camera"][j]
                camera_high = cv2.imdecode(np.frombuffer(camera_high_bits, np.uint8), cv2.IMREAD_COLOR)
                camera_high_resized = cv2.resize(camera_high, (640, 480))
                cam_high.append(camera_high_resized)

                camera_right_wrist_bits = image_dict["right_camera"][j]
                camera_right_wrist = cv2.imdecode(np.frombuffer(camera_right_wrist_bits, np.uint8), cv2.IMREAD_COLOR)
                camera_right_wrist_resized = cv2.resize(camera_right_wrist, (640, 480))
                cam_right_wrist.append(camera_right_wrist_resized)

                camera_left_wrist_bits = image_dict["left_camera"][j]
                camera_left_wrist = cv2.imdecode(np.frombuffer(camera_left_wrist_bits, np.uint8), cv2.IMREAD_COLOR)
                camera_left_wrist_resized = cv2.resize(camera_left_wrist, (640, 480))
                cam_left_wrist.append(camera_left_wrist_resized)

            # All frames except the first contribute an action: frame j's
            # state serves as the action label for observation j-1.
            if j != 0:
                action = state
                actions.append(action)
                left_arm_dim.append(left_arm.shape[0])
                right_arm_dim.append(right_arm.shape[0])

        if not os.path.exists(os.path.join(save_path, f"episode_{i}")):
            os.makedirs(os.path.join(save_path, f"episode_{i}"))
        hdf5path = os.path.join(save_path, f"episode_{i}/episode_{i}.hdf5")

        with h5py.File(hdf5path, "w") as f:
            f.create_dataset("action", data=np.array(actions))
            obs = f.create_group("observations")
            obs.create_dataset("qpos", data=np.array(qpos))
            obs.create_dataset("left_arm_dim", data=np.array(left_arm_dim))
            obs.create_dataset("right_arm_dim", data=np.array(right_arm_dim))
            image = obs.create_group("images")
            # Images are stored as JPEG bytes in fixed-width string datasets.
            cam_high_enc, len_high = images_encoding(cam_high)
            cam_right_wrist_enc, len_right = images_encoding(cam_right_wrist)
            cam_left_wrist_enc, len_left = images_encoding(cam_left_wrist)
            image.create_dataset("cam_high", data=cam_high_enc, dtype=f"S{len_high}")
            image.create_dataset("cam_right_wrist", data=cam_right_wrist_enc, dtype=f"S{len_right}")
            image.create_dataset("cam_left_wrist", data=cam_left_wrist_enc, dtype=f"S{len_left}")

        begin += 1
        print(f"proccess {i} success!")

    return begin
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI: python process_data.py <task_name> <task_config> <expert_data_num>
    parser = argparse.ArgumentParser(description="Process some episodes.")
    parser.add_argument("task_name", type=str)
    parser.add_argument("task_config", type=str)
    parser.add_argument("expert_data_num", type=int)
    args = parser.parse_args()

    task_name = args.task_name
    task_config = args.task_config
    expert_data_num = args.expert_data_num

    # Raw episodes recorded by the simulator live under
    # ../../data/<task>/<config>/data relative to this script's CWD.
    load_dir = os.path.join("../../data", str(task_name), str(task_config), "data")

    print(f"read data from path: {load_dir}")
    begin = data_transform(
        load_dir,
        expert_data_num,
        f"./processed_data/{task_name}-{task_config}-{expert_data_num}",
    )
    # The T5 tokenizer/encoder are loaded once on the first encode_lang call
    # and threaded back in on subsequent iterations to avoid reloading.
    tokenizer, text_encoder = None, None
    for idx in range(expert_data_num):
        print(f"Processing Language: {idx}", end="\r")
        data_file_path = (f"../../data/{task_name}/{task_config}/instructions/episode{idx}.json")
        target_dir = (f"processed_data/{task_name}-{task_config}-{expert_data_num}/episode_{idx}")
        tokenizer, text_encoder = encode_lang(
            DATA_FILE_PATH=data_file_path,
            TARGET_DIR=target_dir,
            GPU=0,
            desc_type="seen",
            tokenizer=tokenizer,
            text_encoder=text_encoder,
        )
||||
42
RDT/rdt-export/scripts/read_config.py
Normal file
42
RDT/rdt-export/scripts/read_config.py
Normal file
@ -0,0 +1,42 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
def read_config(config_file, key_path):
    """
    Read a value from JSON config file.

    Args:
        config_file: Path to JSON config file
        key_path: Dot-separated path to the key (e.g., "evaluation.checkpoint_path")

    Returns:
        The value at the specified key path, or None when a segment is
        missing or a non-dict node is reached before the path is exhausted.
    """
    with open(config_file, 'r') as f:
        data = json.load(f)

    # Walk the dotted path one segment at a time.
    node = data
    for segment in key_path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(segment)

    return node
||||
|
||||
if __name__ == "__main__":
    # CLI: print the value at <key_path> inside <config_file>; exit 1 when
    # the key is absent so shell callers can branch on the status code.
    if len(sys.argv) < 3:
        print("Usage: python read_config.py <config_file> <key_path>", file=sys.stderr)
        sys.exit(1)

    config_file = sys.argv[1]
    key_path = sys.argv[2]

    value = read_config(config_file, key_path)
    if value is not None:
        print(value)
    else:
        # Empty stderr line + failing exit code signal "missing key".
        print("", file=sys.stderr)
        sys.exit(1)
||||
22
RDT/rdt-export/scripts/read_yaml.py
Normal file
22
RDT/rdt-export/scripts/read_yaml.py
Normal file
@ -0,0 +1,22 @@
|
||||
import sys
|
||||
import yaml
|
||||
|
||||
|
||||
def read_yaml_value(file_path, key):
    """Print the top-level value stored under *key* in a YAML file.

    Prints a "not found" message instead of raising when the key is absent.
    """
    with open(file_path, "r") as handle:
        content = yaml.safe_load(handle)
    found = content.get(key)
    if found is None:
        print(f"Key '{key}' not found in {file_path}")
    else:
        print(found)
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI: python read_yaml.py <file_path> <key>
    if len(sys.argv) != 3:
        print("Usage: python read_yaml.py <file_path> <key>")
        sys.exit(1)

    file_path = sys.argv[1]
    key = sys.argv[2]
    read_yaml_value(file_path, key)
|
||||
2
RDT/rdt-quant/.dockerignore
Normal file
2
RDT/rdt-quant/.dockerignore
Normal file
@ -0,0 +1,2 @@
|
||||
input/
|
||||
output/
|
||||
14
RDT/rdt-quant/Dockerfile
Normal file
14
RDT/rdt-quant/Dockerfile
Normal file
@ -0,0 +1,14 @@
|
||||
# Build image for the RDT quantization/compilation step (rdt-quant).
# Based on the Horizon AI toolchain image; the container's entrypoint runs
# the full PTQ compile pipeline in convert.sh.
FROM ai_toolchain_ubuntu_22_s100_gpu:v3.2.0

WORKDIR /app

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai

# Point apt at the Tsinghua mirror for faster package installs in CN regions.
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
    sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list

# input/ and output/ are excluded via .dockerignore; they are mounted at run time.
COPY . /app/

ENTRYPOINT ["bash", "convert.sh"]
|
||||
30
RDT/rdt-quant/convert.sh
Normal file
30
RDT/rdt-quant/convert.sh
Normal file
@ -0,0 +1,30 @@
|
||||
# Compile the two quantized RDT sub-models (Img Adaptor + DiT policy) with
# the Horizon hb_compile toolchain, timing each stage.
CONFIG=input/config.json
# Per-task output directory, named after task_id from the input config.
OUTPUT=/app/output/$(python3 read_json.py $CONFIG task_id)
# Expand the task config into per-model PTQ YAML files under $OUTPUT.
python3 load_config.py $CONFIG
# Fix: message typo ("Haved been Prepared").
echo "Convert PTQ YAML has been prepared"




######### Img Adaptor
cd $OUTPUT/Img_Adaptor
BEGIN_IMG_ADAPTOR_TIME=$(date +%s)
echo -e "\033[44;37m===== Start Compiling Img Adaptor =====\033[0m"
hb_compile --config $OUTPUT/img_adaptor.yaml
echo -e "\033[44;37m===== End Compiling Img Adaptor =====\033[0m"
END_IMG_ADAPTOR_TIME=$(date +%s)
IMG_ADAPTOR_TIME=$((END_IMG_ADAPTOR_TIME - BEGIN_IMG_ADAPTOR_TIME))
echo -e "\033[44;37m===== Img Adaptor Time =====\033[0m"
echo -e "\033[44;37m===== $IMG_ADAPTOR_TIME seconds =====\033[0m"

########## DiT
cd $OUTPUT/DiT_Policy
BEGIN_DIT_TIME=$(date +%s)
echo -e "\033[44;37m===== Start Compiling DiT =====\033[0m"
hb_compile --config $OUTPUT/dit.yaml
echo -e "\033[44;37m===== End Compiling DiT =====\033[0m"
END_DIT_TIME=$(date +%s)
DIT_TIME=$((END_DIT_TIME - BEGIN_DIT_TIME))
echo -e "\033[44;37m===== DiT Time =====\033[0m"
echo -e "\033[44;37m===== $DIT_TIME seconds =====\033[0m"
|
||||
|
||||
88
RDT/rdt-quant/load_config.py
Normal file
88
RDT/rdt-quant/load_config.py
Normal file
@ -0,0 +1,88 @@
|
||||
import json
|
||||
import yaml
|
||||
import sys
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
# Working-directory names hb_compile writes its artifacts into.
DIT = "DiT_Policy"
IMG_ADAPTOR = "Img_Adaptor"


@dataclass
class QuantConfig:
    # Flat view of the relevant fields from the input JSON config.
    # NOTE(review): all fields are effectively Optional[str]; the `str = None`
    # annotations are kept as-is to avoid behavior/annotation changes here.
    task_id: str = None
    gpu_id: str = None
    march: str = None                    # BPU march/architecture for hb_compile
    model_type: str = None               # selects the ptq_yaml/<model_type>/ templates
    output_path: str = None              # <quant.output_path>/<task_id>
    DiT_Policy_ONNX: str = None          # path to the DiT ONNX export
    DiT_Policy_CALIBRATION: str = None   # DiT calibration data directory
    Img_Adaptor_ONNX: str = None         # path to the image-adaptor ONNX export
    Img_Adaptor_CALIBRATION: str = None  # image-adaptor calibration data directory
||||
|
||||
|
||||
|
||||
def load_config(config_path):
    """Read the task JSON config and materialize the two PTQ YAML files.

    Fills the ptq_yaml/<model_type>/ templates for the image adaptor and the
    DiT policy with paths from the config and writes img_adaptor.yaml /
    dit.yaml into the task's output directory.

    Raises:
        KeyError: when the config has no "quant" section.
        ValueError: when a {dit_cal_name} placeholder cannot be substituted.
    """
    with open(config_path, "r") as file:
        config = json.load(file)

    # Fix: the original only assigned dit_policy/img_adaptor inside `if`
    # blocks, so a missing section crashed later with a NameError.
    if "quant" not in config:
        raise KeyError("config is missing the required 'quant' section")
    quant_info = config["quant"]
    dit_policy = quant_info.get("DiT_Policy", {})
    img_adaptor = quant_info.get("Img_Adaptor", {})

    opt = QuantConfig(
        task_id=config.get("task_id"),
        gpu_id=config.get("gpu_id"),
        march=quant_info.get("march"),
        model_type=quant_info.get("model_type"),
        output_path=os.path.join(quant_info.get("output_path"), config.get("task_id")),
        DiT_Policy_ONNX=dit_policy.get("onnx_model"),
        DiT_Policy_CALIBRATION=dit_policy.get("calibration_data"),
        Img_Adaptor_ONNX=img_adaptor.get("onnx_model"),
        Img_Adaptor_CALIBRATION=img_adaptor.get("calibration_data")
    )
    os.makedirs(opt.output_path, exist_ok=True)

    # Prepare the Img Adaptor conversion YAML from its template.
    with open(f"ptq_yaml/{opt.model_type}/img_adaptor.yaml", "r") as file:
        img_adaptor_yaml = yaml.safe_load(file)
    img_adaptor_yaml["model_parameters"]["onnx_model"] = opt.Img_Adaptor_ONNX
    img_adaptor_yaml["model_parameters"]["march"] = opt.march
    img_adaptor_yaml["model_parameters"]["output_model_file_prefix"] = "rdt_img_adaptor"
    img_adaptor_yaml["calibration_parameters"]["cal_data_dir"] = opt.Img_Adaptor_CALIBRATION
    img_adaptor_yaml["model_parameters"]["working_dir"] = IMG_ADAPTOR
    img_adaptor_yaml_path = os.path.join(opt.output_path, "img_adaptor.yaml")
    with open(img_adaptor_yaml_path, 'w') as f:
        yaml.safe_dump(img_adaptor_yaml, f, default_flow_style=False, allow_unicode=True)

    # Prepare the DiT conversion YAML: substitute the calibration-dir
    # placeholder in every string value of calibration_parameters.
    with open(f"ptq_yaml/{opt.model_type}/dit.yaml", "r") as file:
        dit_yaml = yaml.safe_load(file)
    for k, v in dit_yaml.get("calibration_parameters", {}).items():
        if isinstance(v, str) and "{dit_cal_name}" in v:
            if opt.DiT_Policy_CALIBRATION is not None:
                dit_yaml["calibration_parameters"][k] = v.replace("{dit_cal_name}", opt.DiT_Policy_CALIBRATION)
            else:
                raise ValueError(f"DiT_Policy_CALIBRATION is None, cannot replace {{dit_cal_name}} in {k}")
    dit_yaml["model_parameters"]["onnx_model"] = opt.DiT_Policy_ONNX
    dit_yaml["model_parameters"]["march"] = opt.march
    dit_yaml["model_parameters"]["working_dir"] = DIT

    # NOTE: kept for reference — exporting the ONNX directory was disabled.
    # dit_onnx_dir = os.path.dirname(opt.DiT_Policy_ONNX) if opt.DiT_Policy_ONNX else ""
    # os.environ["DIT_ONNX_DIR"] = dit_onnx_dir

    # Inline the per-op quantization config into the DiT YAML.
    with open(f"ptq_yaml/{opt.model_type}/dit_op_config.json", "r") as file:
        dit_json = json.load(file)
    dit_yaml["calibration_parameters"]["quant_config"] = dit_json

    dit_yaml_path = os.path.join(opt.output_path, "dit.yaml")
    with open(dit_yaml_path, 'w') as f:
        yaml.safe_dump(dit_yaml, f, default_flow_style=False, allow_unicode=True)
|
||||
|
||||
if __name__ == "__main__":
    # Usage: python load_config.py <config.json>
    config_path = sys.argv[1]
    # NOTE(review): load_config returns None; the assignment is kept for parity.
    config = load_config(config_path)
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 31 KiB |
29
RDT/rdt-quant/ptq_yaml/170M/dit.yaml
Normal file
29
RDT/rdt-quant/ptq_yaml/170M/dit.yaml
Normal file
@ -0,0 +1,29 @@
|
||||
calibration_parameters:
|
||||
cal_data_dir: '{dit_cal_name}/x/;{dit_cal_name}/freq/;{dit_cal_name}/t/;{dit_cal_name}/lang_c/;{dit_cal_name}/img_c/;{dit_cal_name}/lang_mask/;'
|
||||
quant_config: dit_json_name
|
||||
run_on_cpu: '/t_embedder/Cos;/t_embedder/Sin;/freq_embedder/Cos;/freq_embedder/Sin'
|
||||
compiler_parameters:
|
||||
compile_mode: latency
|
||||
core_num: 1
|
||||
debug: true
|
||||
jobs: 8
|
||||
max_time_per_fc: 0
|
||||
optimize_level: O2
|
||||
advice: 1
|
||||
input_parameters:
|
||||
input_layout_rt: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
|
||||
input_layout_train: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
|
||||
input_name: x;freq;t;lang_c;img_c;lang_mask;
|
||||
input_shape: 1x65x1024;1;1;1x64x1024;1x4374x1024;1x64
|
||||
input_space_and_range: ''
|
||||
input_type_rt: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
|
||||
input_type_train: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
|
||||
norm_type: no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess
|
||||
model_parameters:
|
||||
layer_out_dump: false
|
||||
debug_mode: "dump_calibration_data"
|
||||
enable_vpu: True
|
||||
march: {opt.march}
|
||||
onnx_model: {dit_name}
|
||||
output_model_file_prefix: rdt_dit
|
||||
working_dir: bpu_output
|
||||
251
RDT/rdt-quant/ptq_yaml/170M/dit_op_config.json
Normal file
251
RDT/rdt-quant/ptq_yaml/170M/dit_op_config.json
Normal file
@ -0,0 +1,251 @@
|
||||
{
|
||||
"model_config": {
|
||||
"all_node_type": "int16",
|
||||
"model_output_type": "float32",
|
||||
"activation": {
|
||||
"calibration_type": ["max"],
|
||||
"num_bin": [1024, 2048, 4096],
|
||||
"max_num_bin": 16384,
|
||||
"max_percentile": 1.0,
|
||||
"per_channel": true,
|
||||
"asymmetric": [true]
|
||||
},
|
||||
"weight": {
|
||||
"bias_correction": {
|
||||
"metric": "mae"
|
||||
}
|
||||
},
|
||||
"modelwise_search": {
|
||||
"metric": "mae"
|
||||
}
|
||||
},
|
||||
"op_config": {
|
||||
"ReduceMean": {"qtype": "int16"},
|
||||
"Sub": {"qtype": "int16"},
|
||||
"Softmax": {"qtype": "int16"}
|
||||
},
|
||||
"node_config": {
|
||||
"/t_embedder/Mul": {"qtype": "float32"},
|
||||
"/t_embedder/Cos": {"qtype": "float32"},
|
||||
"/t_embedder/Sin": {"qtype": "float32"},
|
||||
"/t_embedder/Concat": {"qtype": "float32"},
|
||||
"/freq_embedder/Mul": {"qtype": "float32"},
|
||||
"/freq_embedder/Cos": {"qtype": "float32"},
|
||||
"/freq_embedder/Sin": {"qtype": "float32"},
|
||||
"/freq_embedder/Concat": {"qtype": "float32"},
|
||||
"/blocks.0/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.0/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.0/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.0/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.0/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.0/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.0/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.0/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.0/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.0/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.0/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.0/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.0/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.0/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.0/Add": {"qtype": "int16"},
|
||||
"/blocks.1/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.1/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.1/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.1/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.1/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.1/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.1/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.1/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.1/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.1/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.1/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.1/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.1/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.1/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.1/Add": {"qtype": "int16"},
|
||||
"/blocks.2/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.2/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.2/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.2/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.2/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.2/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.2/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.2/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.2/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.2/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.2/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.2/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.2/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.2/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.2/Add": {"qtype": "int16"},
|
||||
"/blocks.3/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.3/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.3/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.3/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.3/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.3/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.3/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.3/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.3/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.3/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.3/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.3/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.3/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.3/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.3/Add": {"qtype": "int16"},
|
||||
"/blocks.4/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.4/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.4/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.4/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.4/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.4/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.4/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.4/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.4/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.4/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.4/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.4/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.4/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.4/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.4/Add": {"qtype": "int16"},
|
||||
"/blocks.5/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.5/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.5/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.5/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.5/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.5/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.5/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.5/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.5/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.5/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.5/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.5/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.5/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.5/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.5/Add": {"qtype": "int16"},
|
||||
"/blocks.6/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.6/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.6/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.6/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.6/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.6/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.6/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.6/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.6/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.6/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.6/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.6/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.6/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.6/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.6/Add": {"qtype": "int16"},
|
||||
"/blocks.7/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.7/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.7/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.7/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.7/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.7/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.7/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.7/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.7/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.7/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.7/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.7/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.7/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.7/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.7/Add": {"qtype": "int16"},
|
||||
"/blocks.8/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.8/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.8/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.8/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.8/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.8/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.8/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.8/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.8/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.8/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.8/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.8/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.8/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.8/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.8/Add": {"qtype": "int16"},
|
||||
"/blocks.9/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.9/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.9/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.9/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.9/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.9/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.9/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.9/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.9/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.9/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.9/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.9/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.9/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.9/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.9/Add": {"qtype": "int16"},
|
||||
"/blocks.10/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.10/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.10/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.10/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.10/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.10/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.10/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.10/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.10/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.10/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.10/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.10/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.10/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.10/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.10/Add": {"qtype": "int16"},
|
||||
"/blocks.11/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.11/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.11/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.11/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.11/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.11/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.11/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.11/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.11/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.11/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.11/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.11/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.11/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.11/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.11/Add": {"qtype": "int16"},
|
||||
"/blocks.12/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.12/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.12/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.12/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.12/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.12/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.12/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.12/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.12/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.12/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.12/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.12/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.12/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.12/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.12/Add": {"qtype": "int16"},
|
||||
"/blocks.13/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.13/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.13/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.13/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
|
||||
"/blocks.13/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.13/ffn/fc1/MatMul": {"qtype": "int16"},
|
||||
"/blocks.13/ffn/act/Mul": {"qtype": "int16"},
|
||||
"/blocks.13/ffn/act/Mul_1": {"qtype": "int16"},
|
||||
"/blocks.13/ffn/act/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.13/ffn/act/Add": {"qtype": "int16"},
|
||||
"/blocks.13/ffn/act/Mul_3": {"qtype": "int16"},
|
||||
"/blocks.13/ffn/act/Tanh": {"qtype": "int16"},
|
||||
"/blocks.13/norm1/Mul_2": {"qtype": "int16"},
|
||||
"/blocks.13/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/blocks.13/Add": {"qtype": "int16"},
|
||||
"/blocks.13/norm3/Div_1_reciprocal": {"qtype": "int16"},
|
||||
"/final_layer/ffn_final/act/Mul_1": {"qtype": "int16"},
|
||||
"/final_layer/ffn_final/act/Mul_2 ": {"qtype": "int16"},
|
||||
"/final_layer/norm_final/Div_1_reciprocal": {"qtype": "float32"}
|
||||
}
|
||||
}
|
||||
33
RDT/rdt-quant/ptq_yaml/170M/img_adaptor.yaml
Normal file
33
RDT/rdt-quant/ptq_yaml/170M/img_adaptor.yaml
Normal file
@ -0,0 +1,33 @@
|
||||
model_parameters:
|
||||
onnx_model: '{img_adaptor_name}'
|
||||
march: {opt.march}
|
||||
layer_out_dump: False
|
||||
working_dir: bpu_output
|
||||
output_model_file_prefix: rdt_img_adaptor
|
||||
enable_vpu: True
|
||||
input_parameters:
|
||||
input_name: ''
|
||||
input_type_rt: 'featuremap;'
|
||||
input_layout_rt: 'NCHW;'
|
||||
input_type_train: 'featuremap;'
|
||||
input_layout_train: 'NCHW;'
|
||||
norm_type: 'no_preprocess;'
|
||||
calibration_parameters:
|
||||
cal_data_dir: '{img_adaptor_cal_name}'
|
||||
cal_data_type: 'float32'
|
||||
calibration_type: 'default'
|
||||
quant_config:
|
||||
model_config:
|
||||
all_node_type: int16
|
||||
model_output_type: int16
|
||||
|
||||
compiler_parameters:
|
||||
extra_params:
|
||||
input_no_padding: true
|
||||
output_no_padding: true
|
||||
jobs: 8
|
||||
compile_mode: 'latency'
|
||||
debug: True
|
||||
advice: 1
|
||||
optimize_level: 'O2'
|
||||
core_num: 2
|
||||
29
RDT/rdt-quant/ptq_yaml/1B/dit.yaml
Normal file
29
RDT/rdt-quant/ptq_yaml/1B/dit.yaml
Normal file
@ -0,0 +1,29 @@
|
||||
calibration_parameters:
|
||||
cal_data_dir: '{dit_cal_name}/x/;{dit_cal_name}/freq/;{dit_cal_name}/t/;{dit_cal_name}/lang_c/;{dit_cal_name}/img_c/;{dit_cal_name}/lang_mask/;'
|
||||
quant_config: dit_json_name
|
||||
run_on_cpu: '/t_embedder/Cos;/t_embedder/Sin;/freq_embedder/Cos;/freq_embedder/Sin'
|
||||
compiler_parameters:
|
||||
compile_mode: latency
|
||||
core_num: 1
|
||||
debug: true
|
||||
jobs: 8
|
||||
max_time_per_fc: 0
|
||||
optimize_level: O2
|
||||
advice: 1
|
||||
input_parameters:
|
||||
input_layout_rt: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
|
||||
input_layout_train: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
|
||||
input_name: x;freq;t;lang_c;img_c;lang_mask;
|
||||
input_shape: 1x65x2048;1;1;1x64x2048;1x4374x2048;1x64
|
||||
input_space_and_range: ''
|
||||
input_type_rt: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
|
||||
input_type_train: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
|
||||
norm_type: no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess
|
||||
model_parameters:
|
||||
layer_out_dump: false
|
||||
debug_mode: "dump_calibration_data"
|
||||
enable_vpu: True
|
||||
march: {opt.march}
|
||||
onnx_model: {dit_name}
|
||||
output_model_file_prefix: rdt_dit
|
||||
working_dir: bpu_output
|
||||
6988
RDT/rdt-quant/ptq_yaml/1B/dit_op_config.json
Normal file
6988
RDT/rdt-quant/ptq_yaml/1B/dit_op_config.json
Normal file
File diff suppressed because it is too large
Load Diff
32
RDT/rdt-quant/ptq_yaml/1B/img_adaptor.yaml
Normal file
32
RDT/rdt-quant/ptq_yaml/1B/img_adaptor.yaml
Normal file
@ -0,0 +1,32 @@
|
||||
model_parameters:
|
||||
onnx_model: '{img_adaptor_name}'
|
||||
march: {opt.march}
|
||||
layer_out_dump: False
|
||||
working_dir: bpu_output
|
||||
output_model_file_prefix: rdt_img_adaptor
|
||||
enable_vpu: True
|
||||
input_parameters:
|
||||
input_name: ''
|
||||
input_type_rt: 'featuremap;'
|
||||
input_layout_rt: 'NCHW;'
|
||||
input_type_train: 'featuremap;'
|
||||
input_layout_train: 'NCHW;'
|
||||
norm_type: 'no_preprocess;'
|
||||
calibration_parameters:
|
||||
cal_data_dir: '{img_adaptor_cal_name}'
|
||||
cal_data_type: 'float32'
|
||||
calibration_type: 'default'
|
||||
quant_config:
|
||||
model_config:
|
||||
all_node_type: int16
|
||||
model_output_type: int16
|
||||
compiler_parameters:
|
||||
extra_params:
|
||||
input_no_padding: true
|
||||
output_no_padding: true
|
||||
jobs: 8
|
||||
compile_mode: 'latency'
|
||||
debug: True
|
||||
advice: 1
|
||||
optimize_level: 'O2'
|
||||
core_num: 2
|
||||
42
RDT/rdt-quant/read_json.py
Normal file
42
RDT/rdt-quant/read_json.py
Normal file
@ -0,0 +1,42 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
def read_config(config_file, key_path):
    """Look up a dot-separated key path inside a JSON config file.

    Args:
        config_file: Path to the JSON config file.
        key_path: Dot-separated path to the key
            (e.g., "evaluation.checkpoint_path").

    Returns:
        The value stored at the key path, or None when any path segment
        is missing or a non-dict value is reached before the path is
        fully consumed.
    """
    with open(config_file, 'r') as f:
        node = json.load(f)

    # Walk the nested dicts one path segment at a time.
    for segment in key_path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(segment)

    return node
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: python read_config.py <config_file> <key_path>", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
config_file = sys.argv[1]
|
||||
key_path = sys.argv[2]
|
||||
|
||||
value = read_config(config_file, key_path)
|
||||
if value is not None:
|
||||
print(value)
|
||||
else:
|
||||
print("", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
2
RDT/rdt170m-run/.dockerignore
Normal file
2
RDT/rdt170m-run/.dockerignore
Normal file
@ -0,0 +1,2 @@
|
||||
input/*
|
||||
output/*
|
||||
7
RDT/rdt170m-run/.gitignore
vendored
Normal file
7
RDT/rdt170m-run/.gitignore
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
processed_data/
|
||||
training_data/
|
||||
checkpoints/
|
||||
model_config/*.yml
|
||||
wandb/*
|
||||
!models/
|
||||
!data/
|
||||
48
RDT/rdt170m-run/Dockerfile
Normal file
48
RDT/rdt170m-run/Dockerfile
Normal file
@ -0,0 +1,48 @@
|
||||
|
||||
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
|
||||
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
|
||||
WORKDIR /app
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
ENV TZ=Asia/Shanghai
|
||||
|
||||
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
|
||||
sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
|
||||
|
||||
RUN apt-get update --allow-unauthenticated && apt-get install -y \
|
||||
software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y \
|
||||
python3.10 \
|
||||
python3.10-dev \
|
||||
python3-pip \
|
||||
python3.10-distutils \
|
||||
libgl1-mesa-glx \
|
||||
libglib2.0-0 \
|
||||
wget \
|
||||
ffmpeg \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
|
||||
|
||||
COPY . /app/
|
||||
|
||||
RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
# RUN pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
|
||||
# RUN pip install torch==2.1.0 torchvision==0.16.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
RUN pip install packaging==24.0
|
||||
|
||||
RUN pip install tfds-nightly==4.9.4.dev202402070044
|
||||
|
||||
RUN pip install flash_attn-2.7.2.post1+cu12torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||
|
||||
# RUN mkdir -p /app/dataset/input /app/dataset/output
|
||||
|
||||
ENTRYPOINT ["bash", "deploy.sh"]
|
||||
1
RDT/rdt170m-run/__init__.py
Normal file
1
RDT/rdt170m-run/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from .deploy_policy import *
|
||||
BIN
RDT/rdt170m-run/assets/head.png
Normal file
BIN
RDT/rdt170m-run/assets/head.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 726 KiB |
300
RDT/rdt170m-run/client.py
Normal file
300
RDT/rdt170m-run/client.py
Normal file
@ -0,0 +1,300 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
RDT 推理服务器测试客户端
|
||||
使用模拟数据测试 get_actions 接口
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import logging
|
||||
import argparse
|
||||
import time
|
||||
from cloud_helper import Client
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def create_mock_observation(
    state_dim=6,
    img_history_size=2,
    img_height=480,
    img_width=640,
    num_cameras=3
):
    """Build a mock observation dict for exercising the inference server.

    Args:
        state_dim: Dimension of the robot state vector (number of joints).
        img_history_size: Number of past frames kept per camera.
        img_height: Image height in pixels.
        img_width: Image width in pixels.
        num_cameras: Number of cameras to emit (at most the 3 names
            listed below are available).

    Returns:
        observation: Dict containing a "state" float32 array of shape
            (state_dim,) plus one "images.<cam_name>" uint8 array of shape
            (img_history_size, img_height, img_width, 3) per camera.
    """
    observation = {}

    # 1. Mock robot state (joint angles etc.), sampled uniformly
    #    in the [-180, 180] degree range.
    state = np.random.uniform(-180, 180, size=(state_dim,)).astype(np.float32)
    observation["state"] = state

    # 2. Mock camera images.
    # NOTE: msgpack_numpy handles numpy-array serialization automatically.
    camera_names = ["cam_high", "cam_left_wrist", "cam_right_wrist"]

    for i, cam_name in enumerate(camera_names[:num_cameras]):
        # Use color-gradient images as stand-in frame data.
        images = []
        for t in range(img_history_size):
            # Give each timestep a visibly different color.
            img = np.zeros((img_height, img_width, 3), dtype=np.uint8)

            # Gradient offset depends on both timestep and camera index.
            color_shift = (t * 50 + i * 100) % 255
            img[:, :, 0] = np.linspace(color_shift, 255, img_width, dtype=np.uint8)  # R
            img[:, :, 1] = np.linspace(0, 255 - color_shift, img_height, dtype=np.uint8)[:, None]  # G
            img[:, :, 2] = 128  # B

            images.append(img)

        # Stack frames into (IMG_HISTORY_SIZE, H, W, 3) layout.
        observation[f"images.{cam_name}"] = np.stack(images, axis=0)

    return observation
|
||||
|
||||
|
||||
def create_test_batch(
    observation,
    instruction="pick up the bottle and place it in the box",
    use_instruction_index=False
):
    """Assemble a full request payload for the inference server.

    Args:
        observation: Observation dict (robot state + camera images).
        instruction: Task instruction text; ignored when
            use_instruction_index is True.
        use_instruction_index: When True, send instruction index 0
            instead of the instruction string.

    Returns:
        Request dict with "observation" and "instruction" keys.
    """
    if use_instruction_index:
        payload_instruction = 0
    else:
        payload_instruction = instruction

    return {
        "observation": observation,
        "instruction": payload_instruction,
    }
|
||||
|
||||
|
||||
def test_single_request(client, args):
    """Send one get_actions request with mock data and log the result.

    Args:
        client: Connected RPC client exposing call_endpoint().
        args: Parsed CLI namespace (state_dim, img sizes, instruction, ...).

    Returns:
        True when the request succeeds, False otherwise.
    """
    logger.info("=" * 60)
    logger.info("开始单次请求测试")
    logger.info("=" * 60)

    # Build mock observation data.
    observation = create_mock_observation(
        state_dim=args.state_dim,
        img_history_size=args.img_history_size,
        img_height=args.img_height,
        img_width=args.img_width,
        num_cameras=args.num_cameras
    )

    logger.info(f"模拟观测数据:")
    logger.info(f"  - state shape: {observation['state'].shape}")
    for key in observation.keys():
        if key.startswith("images."):
            logger.info(f"  - {key} shape: {observation[key].shape}")

    # Assemble the request batch.
    batch = create_test_batch(
        observation,
        instruction=args.instruction,
        use_instruction_index=args.use_index
    )

    # Send the request and time the round trip.
    logger.info(f"发送指令: {batch['instruction']}")
    start_time = time.time()

    try:
        action = client.call_endpoint("get_actions", batch)
        elapsed_time = time.time() - start_time

        logger.info(f"✓ 请求成功! 耗时: {elapsed_time*1000:.2f} ms")
        logger.info(f"  - action shape: {action.shape}")
        logger.info(f"  - action dtype: {action.dtype}")
        logger.info(f"  - action range: [{action.min():.3f}, {action.max():.3f}]")
        logger.info(f"  - action preview (前3个时间步的前3个维度):")
        # Preview at most 3 timesteps x 3 dims; clamp to the actual shape.
        preview_steps = min(3, action.shape[0])
        preview_dims = min(3, action.shape[1])
        for t in range(preview_steps):
            logger.info(f"      t={t}: {action[t, :preview_dims]}")

        return True

    except Exception as e:
        logger.error(f"✗ 请求失败: {e}")
        return False
|
||||
|
||||
|
||||
def test_multiple_requests(client, args):
    """Fire args.num_requests identical requests and report latency stats.

    Args:
        client: Connected RPC client exposing call_endpoint().
        args: Parsed CLI namespace (num_requests, state_dim, img sizes, ...).
    """
    logger.info("=" * 60)
    logger.info(f"开始连续请求测试 (共 {args.num_requests} 次)")
    logger.info("=" * 60)

    # Build the observation once up front; every request reuses it.
    observation = create_mock_observation(
        state_dim=args.state_dim,
        img_history_size=args.img_history_size,
        img_height=args.img_height,
        img_width=args.img_width,
        num_cameras=args.num_cameras
    )

    batch = create_test_batch(
        observation,
        instruction=args.instruction,
        use_instruction_index=args.use_index
    )

    success_count = 0
    total_time = 0
    latencies = []

    for i in range(args.num_requests):
        try:
            start_time = time.time()
            action = client.call_endpoint("get_actions", batch)
            elapsed_time = time.time() - start_time

            success_count += 1
            total_time += elapsed_time
            latencies.append(elapsed_time)

            # Progress log every 10 completed requests.
            if (i + 1) % 10 == 0:
                logger.info(f"已完成 {i + 1}/{args.num_requests} 次请求")

        except Exception as e:
            logger.error(f"第 {i+1} 次请求失败: {e}")

    # Summarize the run.
    logger.info("=" * 60)
    logger.info("性能统计:")
    logger.info(f"  - 总请求数: {args.num_requests}")
    logger.info(f"  - 成功数: {success_count}")
    logger.info(f"  - 失败数: {args.num_requests - success_count}")
    logger.info(f"  - 成功率: {success_count/args.num_requests*100:.1f}%")

    if latencies:
        latencies = np.array(latencies)
        logger.info(f"  - 平均延迟: {np.mean(latencies)*1000:.2f} ms")
        logger.info(f"  - 中位数延迟: {np.median(latencies)*1000:.2f} ms")
        logger.info(f"  - 最小延迟: {np.min(latencies)*1000:.2f} ms")
        logger.info(f"  - 最大延迟: {np.max(latencies)*1000:.2f} ms")
        logger.info(f"  - 吞吐量: {success_count/total_time:.2f} requests/s")
|
||||
|
||||
|
||||
def test_different_instructions(client, args):
    """Send one request per instruction in a fixed list and log each outcome.

    Args:
        client: Connected RPC client exposing call_endpoint().
        args: Parsed CLI namespace (state_dim, img sizes, num_cameras, ...).
    """
    logger.info("=" * 60)
    logger.info("测试不同指令")
    logger.info("=" * 60)

    instructions = [
        "pick up the red cube",
        "place the bottle on the table",
        "move to the left",
        "grasp the bottle",
        "open the drawer"
    ]

    # The same mock observation is reused for every instruction.
    observation = create_mock_observation(
        state_dim=args.state_dim,
        img_history_size=args.img_history_size,
        img_height=args.img_height,
        img_width=args.img_width,
        num_cameras=args.num_cameras
    )

    for i, instruction in enumerate(instructions):
        logger.info(f"\n测试指令 {i+1}/{len(instructions)}: '{instruction}'")
        batch = create_test_batch(observation, instruction=instruction)

        try:
            start_time = time.time()
            action = client.call_endpoint("get_actions", batch)
            elapsed_time = time.time() - start_time

            logger.info(f"  ✓ 成功 | 耗时: {elapsed_time*1000:.2f} ms | action shape: {action.shape}")

        except Exception as e:
            logger.error(f"  ✗ 失败: {e}")
|
||||
|
||||
|
||||
def _build_arg_parser():
    """Construct the CLI argument parser for the test client."""
    parser = argparse.ArgumentParser(description="RDT 推理服务器测试客户端")

    # Connection parameters
    parser.add_argument("--host", type=str, default="localhost", help="服务器地址")
    parser.add_argument("--port", type=int, default=8005, help="服务器端口")

    # Test mode
    parser.add_argument("--mode", type=str, default="single",
                        choices=["single", "multiple", "instructions"],
                        help="测试模式: single(单次), multiple(多次), instructions(不同指令)")
    parser.add_argument("--num-requests", type=int, default=50,
                        help="多次测试的请求数量")

    # Data parameters
    parser.add_argument("--state-dim", type=int, default=6, help="状态向量维度")
    parser.add_argument("--img-history-size", type=int, default=2, help="图像历史长度")
    parser.add_argument("--img-height", type=int, default=480, help="图像高度")
    parser.add_argument("--img-width", type=int, default=640, help="图像宽度")
    parser.add_argument("--num-cameras", type=int, default=3, help="相机数量 (与服务器配置一致)")

    # Instruction parameters
    parser.add_argument("--instruction", type=str,
                        default="pick up the bottle and place it in the box",
                        help="测试指令")
    parser.add_argument("--use-index", action="store_true",
                        help="使用指令索引而非字符串")
    return parser


def main():
    """CLI entry point: parse arguments, connect to the inference server,
    and dispatch to the selected test routine."""
    args = _build_arg_parser().parse_args()

    # Connect to the server; bail out early if that fails.
    logger.info(f"正在连接到 {args.host}:{args.port} ...")
    try:
        client = Client(host=args.host, port=args.port)
        logger.info("✓ 连接成功!")
    except Exception as e:
        logger.error(f"✗ 连接失败: {e}")
        return

    # Dispatch table keyed by --mode (choices are validated by argparse,
    # so a KeyError here cannot occur).
    runners = {
        "single": test_single_request,
        "multiple": test_multiple_requests,
        "instructions": test_different_instructions,
    }

    try:
        runners[args.mode](client, args)
    except KeyboardInterrupt:
        logger.info("\n测试被用户中断")
    except Exception as e:
        logger.error(f"测试过程中发生错误: {e}")
        import traceback
        traceback.print_exc()
||||
|
||||
# Run the test client only when executed as a script, not on import.
if __name__ == "__main__":
    main()
||||
|
||||
162
RDT/rdt170m-run/cloud_helper.py
Normal file
162
RDT/rdt170m-run/cloud_helper.py
Normal file
@ -0,0 +1,162 @@
|
||||
import zmq
|
||||
import msgpack
|
||||
import msgpack_numpy as m
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
from typing import Any, Callable
|
||||
import zstandard as zstd
|
||||
|
||||
|
||||
# Module-level logger; handlers/level are configured by the embedding app.
logger = logging.getLogger(__name__)


# Shared zstd codec objects, reused for every pack/unpack call.
# Level 12 trades CPU time for noticeably smaller payloads on the wire.
compresser = zstd.ZstdCompressor(level=12)
decompresser = zstd.ZstdDecompressor()
||||
|
||||
def _pack(data: Any) -> bytes:
    """Serialize *data* with numpy-aware msgpack, then zstd-compress it."""
    packed = msgpack.packb(data, default=m.encode, use_bin_type=True)
    return compresser.compress(packed)
||||
|
||||
def _unpack(data: bytes) -> Any:
    """Inverse of :func:`_pack`: zstd-decompress, then msgpack-decode."""
    raw = decompresser.decompress(data)
    return msgpack.unpackb(raw, object_hook=m.decode, raw=False)
|
||||
class Server:
    """ZMQ REP server that dispatches command strings to registered handlers.

    Protocol (one request -> exactly one reply):
        Request:  {"command": str, "data": Any}
        Reply:    {"status": "ok" | "error", "data": Any | error message}
    """

    def __init__(self, host: str = "*", port: int = 5555):
        self.host = host
        self.port = port

        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REP)
        self.socket.bind(f"tcp://{self.host}:{self.port}")
        logger.info(f"Server started at tcp://{self.host}:{self.port}")

        # command name -> handler callable
        self.endpoints: dict[str, Callable[[Any], Any]] = {}

    def register_endpoint(self, command: str, func: Callable[[Any], Any]):
        """Register *func* as the handler for *command* (replaces any previous one)."""
        self.endpoints[command] = func
        logger.info(f"Registered endpoint: {command} -> {func}")

    def return_error(self, message: str) -> None:
        """Send an error reply for the current request."""
        self.socket.send(_pack({"status": "error", "data": message}))

    def return_ok(self, data: Any) -> None:
        """Send a success reply carrying *data* for the current request."""
        self.socket.send(_pack({"status": "ok", "data": data}))

    def handle_once(self) -> None:
        """Block for one request, dispatch it, and send exactly one reply."""
        request = _unpack(self.socket.recv())

        cmd = request.get("command")
        data = request.get("data")

        logger.info("Received Command: %s", cmd)

        handler = self.endpoints.get(cmd)
        if handler is None:
            logger.warning(f"Unknown command: {cmd}")
            self.return_error(f"Unknown command: {cmd}")
            return

        try:
            # A missing/None payload means a zero-argument handler.
            response = handler() if data is None else handler(data)
            self.return_ok(response)
        except Exception as e:
            logger.error(f"Error handling command {cmd}: {e}")
            self.return_error(str(e))

    def loop_forever(self):
        """Serve requests until Ctrl-C, then release the socket and context."""
        try:
            while True:
                self.handle_once()
        except KeyboardInterrupt:
            logger.info("Server shutting down...")
        finally:
            self.socket.close()
            self.context.term()
|
||||
|
||||
class Client:
    """REQ-side counterpart of :class:`Server`.

    Sends ``{"command", "data"}`` requests and returns the reply payload,
    raising when the server reports an error.
    """

    def __init__(self, host: str = "localhost", port: int = 5555):
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REQ)
        self.socket.connect(f"tcp://{host}:{port}")
        logger.info(f"Client connected to tcp://{host}:{port}")

    def call_endpoint(self, command: str, data=None):
        """Issue one request and block for the reply.

        Returns the reply's ``data`` field on success; raises ``Exception``
        with the server-provided message otherwise.
        """
        self.socket.send(_pack({"command": command, "data": data}))
        reply = _unpack(self.socket.recv())

        if reply.get("status") != "ok":
            logger.error(f"Error from server: {reply.get('data')}")
            raise Exception(f"Error from server: {reply.get('data')}")
        return reply.get("data")
||||
|
||||
|
||||
def freq_control(freq: int = 25):
    """Decorator factory that rate-limits the wrapped function.

    After each call, sleeps for whatever remains of one period
    (``1/freq`` seconds), so repeated calls run at most ``freq`` times
    per second. The wrapped function's return value passes through
    unchanged.

    Args:
        freq: Target maximum call frequency in Hz.
    """
    from functools import wraps  # local import keeps the helper self-contained

    def decorator(func):
        @wraps(func)  # preserve __name__/__doc__ of the wrapped function
        def wrapper(*args, **kwargs):
            # perf_counter is monotonic; time.time() can jump with wall-clock
            # adjustments and would then over- or under-sleep.
            start = time.perf_counter()
            result = func(*args, **kwargs)
            elapsed = time.perf_counter() - start
            # Sleep out the remainder of the period, if any time is left.
            time.sleep(max(0.0, (1.0 / freq) - elapsed))
            return result

        return wrapper

    return decorator
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys
    from time import sleep

    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )

    ## Protocol:
    # Request: { "command": str, "data": Any }
    # Response: { "status": "ok" | "error", "data": Any if status=="ok" else str (ErrorMsg) }

    # Validate CLI usage explicitly: `assert` is stripped under `python -O`,
    # so it must not be used for input validation.
    if len(sys.argv) != 2 or sys.argv[1] not in ("server", "client"):
        print("Usage: python service.py [server|client]", file=sys.stderr)
        sys.exit(1)
    mode = sys.argv[1]

    if mode == "server":
        # Demo server with three trivial endpoints.
        server = Server()
        server.register_endpoint("ping", lambda: "pong")
        server.register_endpoint("echo", lambda x: x)
        server.register_endpoint("add", lambda data: data["a"] + data["b"])
        server.loop_forever()

    elif mode == "client":
        # Demo client: exercise each endpoint in a loop until an error occurs.
        client = Client()
        while True:
            try:
                response = client.call_endpoint("ping")
                print(f"Response from server: {response}")
                response = client.call_endpoint("echo", "Hello, World!")
                print(f"Response from server: {response}")
                response = client.call_endpoint("add", {"a": 5, "b": 10})
                print(f"Response from server: {response}")

                sleep(0.2)

            except Exception as e:
                print(f"Error: {e}")
                break
||||
71
RDT/rdt170m-run/configs/base.yaml
Normal file
71
RDT/rdt170m-run/configs/base.yaml
Normal file
@ -0,0 +1,71 @@
|
||||
common:
|
||||
# The number of historical images
|
||||
img_history_size: 2
|
||||
# The number of future actions to predict
|
||||
action_chunk_size: 64
|
||||
# The number of cameras to be used in the model
|
||||
num_cameras: 3
|
||||
# Dimension for state/action, we use the same space for both state and action
|
||||
# This MUST be equal to configs/state_vec.py
|
||||
state_dim: 128
|
||||
|
||||
|
||||
dataset:
|
||||
# We will extract the data from raw dataset
|
||||
# and store them in the disk buffer by producer
|
||||
# When training, we will read the data
|
||||
# randomly from the buffer by consumer
|
||||
# The producer will replace the data which has been
|
||||
# read by the consumer with new data
|
||||
|
||||
# The path to the buffer (at least 400GB)
|
||||
buf_path: /path/to/buffer
|
||||
# The number of chunks in the buffer
|
||||
buf_num_chunks: 512
|
||||
# The number of samples (step rather than episode) in each chunk
|
||||
buf_chunk_size: 512
|
||||
|
||||
# We will filter the episodes with length less than `epsd_len_thresh_low`
|
||||
epsd_len_thresh_low: 32
|
||||
# For those more than `epsd_len_thresh_high`,
|
||||
# we will randomly sample `epsd_len_thresh_high` steps each time we load the episode
|
||||
# to better balance the training datasets
|
||||
epsd_len_thresh_high: 2048
|
||||
# How to fit the image size
|
||||
image_aspect_ratio: pad
|
||||
# Maximum number of language tokens
|
||||
tokenizer_max_length: 1024
|
||||
|
||||
model:
|
||||
# Config for condition adaptors
|
||||
lang_adaptor: mlp2x_gelu
|
||||
img_adaptor: mlp2x_gelu
|
||||
state_adaptor: mlp3x_gelu
|
||||
lang_token_dim: 4096
|
||||
img_token_dim: 1152
|
||||
# Dim of action or proprioception vector
|
||||
# A `state` refers to an action or a proprioception vector
|
||||
state_token_dim: 128
|
||||
# Config for RDT structure
|
||||
rdt:
|
||||
# 1B: num_head 32 hidden_size 2048
|
||||
hidden_size: 2048
|
||||
depth: 28
|
||||
num_heads: 32
|
||||
cond_pos_embed_type: multimodal
|
||||
# For noise scheduler
|
||||
noise_scheduler:
|
||||
type: ddpm
|
||||
num_train_timesteps: 1000
|
||||
num_inference_timesteps: 5
|
||||
beta_schedule: squaredcos_cap_v2 # Critical choice
|
||||
prediction_type: sample
|
||||
clip_sample: False
|
||||
# For EMA (params averaging)
|
||||
# We do not use EMA currently
|
||||
ema:
|
||||
update_after_step: 0
|
||||
inv_gamma: 1.0
|
||||
power: 0.75
|
||||
min_value: 0.0
|
||||
max_value: 0.9999
|
||||
@ -0,0 +1,50 @@
|
||||
{
|
||||
"A": [
|
||||
[
|
||||
-0.2691913843154907,
|
||||
-0.21995729207992554,
|
||||
-0.182277649641037
|
||||
],
|
||||
[
|
||||
0.35127854347229004,
|
||||
0.2769763469696045,
|
||||
0.17159393429756165
|
||||
]
|
||||
],
|
||||
"B": [
|
||||
[
|
||||
-0.2576896846294403,
|
||||
-0.22244493663311005,
|
||||
-0.20557966828346252
|
||||
],
|
||||
[
|
||||
0.32854634523391724,
|
||||
0.2922680974006653,
|
||||
0.17373555898666382
|
||||
]
|
||||
],
|
||||
"C": [
|
||||
[
|
||||
-0.29205888509750366,
|
||||
-0.24688798189163208,
|
||||
-0.17577645182609558
|
||||
],
|
||||
[
|
||||
0.25053921341896057,
|
||||
0.3277084231376648,
|
||||
0.16431939601898193
|
||||
]
|
||||
],
|
||||
"D": [
|
||||
[
|
||||
-0.25131964683532715,
|
||||
-0.15233077108860016,
|
||||
-0.13294968008995056
|
||||
],
|
||||
[
|
||||
0.19209328293800354,
|
||||
0.19344553351402283,
|
||||
0.1370421051979065
|
||||
]
|
||||
]
|
||||
}
|
||||
65
RDT/rdt170m-run/configs/dataset_control_freq.json
Normal file
65
RDT/rdt170m-run/configs/dataset_control_freq.json
Normal file
@ -0,0 +1,65 @@
|
||||
{
|
||||
"fractal20220817_data": 3,
|
||||
"taco_play": 15,
|
||||
"jaco_play": 10,
|
||||
"berkeley_cable_routing": 10,
|
||||
"nyu_door_opening_surprising_effectiveness": 3,
|
||||
"viola": 20,
|
||||
"berkeley_autolab_ur5": 5,
|
||||
"toto": 30,
|
||||
"kuka": 10,
|
||||
"language_table": 10,
|
||||
"columbia_cairlab_pusht_real": 10,
|
||||
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 20,
|
||||
"nyu_rot_dataset_converted_externally_to_rlds":3,
|
||||
"stanford_hydra_dataset_converted_externally_to_rlds": 10,
|
||||
"austin_buds_dataset_converted_externally_to_rlds": 20,
|
||||
"nyu_franka_play_dataset_converted_externally_to_rlds": 3,
|
||||
"maniskill_dataset_converted_externally_to_rlds": 20,
|
||||
"furniture_bench_dataset_converted_externally_to_rlds": 10,
|
||||
"ucsd_kitchen_dataset_converted_externally_to_rlds": 2,
|
||||
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 3,
|
||||
"austin_sailor_dataset_converted_externally_to_rlds": 20,
|
||||
"austin_sirius_dataset_converted_externally_to_rlds": 20,
|
||||
"bc_z": 10,
|
||||
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 10,
|
||||
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 10,
|
||||
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
|
||||
"utokyo_xarm_bimanual_converted_externally_to_rlds": 10,
|
||||
"berkeley_mvp_converted_externally_to_rlds": 5,
|
||||
"berkeley_rpt_converted_externally_to_rlds": 30,
|
||||
"kaist_nonprehensile_converted_externally_to_rlds": 10,
|
||||
"stanford_mask_vit_converted_externally_to_rlds": 0,
|
||||
"tokyo_u_lsmo_converted_externally_to_rlds": 10,
|
||||
"dlr_sara_pour_converted_externally_to_rlds": 10,
|
||||
"dlr_sara_grid_clamp_converted_externally_to_rlds": 10,
|
||||
"dlr_edan_shared_control_converted_externally_to_rlds": 5,
|
||||
"asu_table_top_converted_externally_to_rlds": 12.5,
|
||||
"stanford_robocook_converted_externally_to_rlds": 5,
|
||||
"eth_agent_affordances": 66.6,
|
||||
"imperialcollege_sawyer_wrist_cam": 10,
|
||||
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 20,
|
||||
"uiuc_d3field": 1,
|
||||
"utaustin_mutex": 20,
|
||||
"berkeley_fanuc_manipulation": 10,
|
||||
"cmu_play_fusion": 5,
|
||||
"cmu_stretch": 10,
|
||||
"berkeley_gnm_recon": 3,
|
||||
"berkeley_gnm_cory_hall": 5,
|
||||
"berkeley_gnm_sac_son": 10,
|
||||
"robo_net": 1,
|
||||
"roboturk_real_towercreation": 10,
|
||||
"roboturk_real_laundrylayout": 10,
|
||||
"roboturk_real_objectsearch": 10,
|
||||
"aloha_mobile": 50,
|
||||
"aloha_static": 50,
|
||||
"roboset": 5,
|
||||
"droid": 15,
|
||||
"fmb": 10,
|
||||
"dobbe": 30,
|
||||
"qut_dexterous_manpulation": 30,
|
||||
"agilex": 25,
|
||||
"rh20t": 10,
|
||||
"calvin": 30,
|
||||
"bridgev2": 5
|
||||
}
|
||||
575
RDT/rdt170m-run/configs/dataset_img_keys.json
Normal file
575
RDT/rdt170m-run/configs/dataset_img_keys.json
Normal file
@ -0,0 +1,575 @@
|
||||
{
|
||||
"fractal20220817_data": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[
|
||||
1,0,0,0
|
||||
]
|
||||
},
|
||||
"taco_play": {
|
||||
"image_keys": [
|
||||
"rgb_static",
|
||||
"rgb_gripper",
|
||||
"rgb_static",
|
||||
"rgb_static"
|
||||
],
|
||||
"image_mask":[
|
||||
1,1,0,0
|
||||
]
|
||||
},
|
||||
"jaco_play": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image_wrist",
|
||||
"image_wrist",
|
||||
"image_wrist"
|
||||
],
|
||||
"image_mask":[
|
||||
1,1,0,0
|
||||
]
|
||||
},
|
||||
"berkeley_cable_routing": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist45_image",
|
||||
"wrist225_image",
|
||||
"top_image"
|
||||
],
|
||||
"image_mask":[1,1,0,1]
|
||||
},
|
||||
"nyu_door_opening_surprising_effectiveness": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"viola": {
|
||||
"image_keys": [
|
||||
"agentview_rgb",
|
||||
"eye_in_hand_rgb",
|
||||
"eye_in_hand_rgb",
|
||||
"eye_in_hand_rgb"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"berkeley_autolab_ur5": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"hand_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"toto": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"kuka": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"language_table": {
|
||||
"image_keys": [
|
||||
"rgb",
|
||||
"rgb",
|
||||
"rgb",
|
||||
"rgb"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"columbia_cairlab_pusht_real": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"nyu_rot_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"stanford_hydra_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"austin_buds_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"nyu_franka_play_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image_additional_view",
|
||||
"image_additional_view",
|
||||
"image_additional_view"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"maniskill_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"furniture_bench_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"ucsd_kitchen_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"austin_sailor_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"austin_sirius_dataset_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"bc_z": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"image2"
|
||||
],
|
||||
"image_mask":[1,1,0,1]
|
||||
},
|
||||
"utokyo_xarm_bimanual_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"berkeley_mvp_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"hand_image"
|
||||
],
|
||||
"image_mask":[0,1,0,0]
|
||||
},
|
||||
"berkeley_rpt_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"hand_image",
|
||||
"hand_image"
|
||||
],
|
||||
"image_mask":[0,1,0,0]
|
||||
},
|
||||
"kaist_nonprehensile_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"stanford_mask_vit_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"tokyo_u_lsmo_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"dlr_sara_pour_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"dlr_sara_grid_clamp_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"dlr_edan_shared_control_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"asu_table_top_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"stanford_robocook_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image_2",
|
||||
"image_1",
|
||||
"image_3",
|
||||
"image_4"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"eth_agent_affordances": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"imperialcollege_sawyer_wrist_cam": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[0,1,0,0]
|
||||
},
|
||||
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"uiuc_d3field": {
|
||||
"image_keys": [
|
||||
"image_1",
|
||||
"image_2",
|
||||
"image_3",
|
||||
"image_4"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"utaustin_mutex": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"berkeley_fanuc_manipulation": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"cmu_play_fusion": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"cmu_stretch": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"berkeley_gnm_recon": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"berkeley_gnm_cory_hall": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"berkeley_gnm_sac_son": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"robo_net": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image1",
|
||||
"image2",
|
||||
"image2"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"roboturk_real_towercreation": {
|
||||
"image_keys": [
|
||||
"top_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"roboturk_real_laundrylayout": {
|
||||
"image_keys": [
|
||||
"top_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"roboturk_real_objectsearch": {
|
||||
"image_keys": [
|
||||
"top_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame",
|
||||
"front_rgb_frame"
|
||||
],
|
||||
"image_mask":[1,0,0,1]
|
||||
},
|
||||
"aloha_mobile": {
|
||||
"image_keys": [
|
||||
"cam_high",
|
||||
"cam_right_wrist",
|
||||
"cam_left_wrist",
|
||||
"cam_right_wrist"
|
||||
],
|
||||
"image_mask":[1,1,1,0]
|
||||
},
|
||||
"aloha_static": {
|
||||
"image_keys": [
|
||||
"cam_high",
|
||||
"cam_right_wrist",
|
||||
"cam_left_wrist",
|
||||
"cam_low"
|
||||
],
|
||||
"image_mask":[1,1,1,1]
|
||||
},
|
||||
"roboset": {
|
||||
"image_keys": [
|
||||
"rgb_top",
|
||||
"rgb_right",
|
||||
"rgb_left",
|
||||
"rgb_right"
|
||||
],
|
||||
"image_mask":[1,1,1,0]
|
||||
},
|
||||
"droid": {
|
||||
"image_keys": [
|
||||
"exterior_image_1_left",
|
||||
"wrist_image_left",
|
||||
"wrist_image_left",
|
||||
"exterior_image_2_left"
|
||||
],
|
||||
"image_mask":[1,1,0,1]
|
||||
},
|
||||
"fmb": {
|
||||
"image_keys": [
|
||||
"image_side_1",
|
||||
"image_wrist_1",
|
||||
"image_wrist_1",
|
||||
"image_side_2"
|
||||
],
|
||||
"image_mask":[1,1,0,1]
|
||||
},
|
||||
"dobbe": {
|
||||
"image_keys": [
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[0,1,0,0]
|
||||
},
|
||||
"qut_dexterous_manpulation": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"wrist_image",
|
||||
"wrist_image",
|
||||
"wrist_image"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"agilex": {
|
||||
"image_keys": [
|
||||
"cam_high",
|
||||
"cam_right_wrist",
|
||||
"cam_left_wrist",
|
||||
"cam_right_wrist"
|
||||
],
|
||||
"image_mask":[1,1,1,0]
|
||||
},
|
||||
"rh20t": {
|
||||
"image_keys": [
|
||||
"image",
|
||||
"image",
|
||||
"image",
|
||||
"image"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
},
|
||||
"calvin": {
|
||||
"image_keys": [
|
||||
"rgb_static",
|
||||
"rgb_gripper",
|
||||
"rgb_gripper",
|
||||
"rgb_gripper"
|
||||
],
|
||||
"image_mask":[1,1,0,0]
|
||||
},
|
||||
"bridgev2": {
|
||||
"image_keys": [
|
||||
"images0",
|
||||
"images0",
|
||||
"images0",
|
||||
"images0"
|
||||
],
|
||||
"image_mask":[1,0,0,0]
|
||||
}
|
||||
}
|
||||
525
RDT/rdt170m-run/configs/dataset_stat.json
Normal file
525
RDT/rdt170m-run/configs/dataset_stat.json
Normal file
@ -0,0 +1,525 @@
|
||||
{
|
||||
"agilex": {
|
||||
"dataset_name": "agilex",
|
||||
"state_mean": [
|
||||
-0.0036545392947090432,
|
||||
-0.2773659935760079,
|
||||
0.3147616748061523,
|
||||
0.3813313179910183,
|
||||
0.04028575944090457,
|
||||
0.034888520819083294,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0
|
||||
],
|
||||
"state_std": [
|
||||
0.05763674563578847,
|
||||
0.2580181064167735,
|
||||
0.19785840483767897,
|
||||
0.05020347749331385,
|
||||
0.054529239104671424,
|
||||
0.05020521339363586,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0
|
||||
],
|
||||
"state_min": [
|
||||
-0.17447535196940103,
|
||||
-0.5522612677680121,
|
||||
-0.3340397516886393,
|
||||
0.21861712137858072,
|
||||
-0.09725829230414497,
|
||||
0.003396739231215583,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0
|
||||
],
|
||||
"state_max": [
|
||||
0.21961932712131077,
|
||||
0.30613206227620443,
|
||||
0.5444545321994357,
|
||||
0.4866888682047526,
|
||||
0.31486290825737845,
|
||||
0.3355223337809245,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0
|
||||
]
|
||||
}
|
||||
}
|
||||
3
RDT/rdt170m-run/configs/finetune_datasets.json
Normal file
3
RDT/rdt170m-run/configs/finetune_datasets.json
Normal file
@ -0,0 +1,3 @@
|
||||
[
|
||||
"agilex"
|
||||
]
|
||||
3
RDT/rdt170m-run/configs/finetune_sample_weights.json
Normal file
3
RDT/rdt170m-run/configs/finetune_sample_weights.json
Normal file
@ -0,0 +1,3 @@
|
||||
{
|
||||
"agilex": 100
|
||||
}
|
||||
48
RDT/rdt170m-run/configs/pretrain_datasets.json
Normal file
48
RDT/rdt170m-run/configs/pretrain_datasets.json
Normal file
@ -0,0 +1,48 @@
|
||||
[
|
||||
"fractal20220817_data",
|
||||
"jaco_play",
|
||||
"taco_play",
|
||||
"berkeley_cable_routing",
|
||||
"viola",
|
||||
"berkeley_autolab_ur5",
|
||||
"toto",
|
||||
"nyu_door_opening_surprising_effectiveness",
|
||||
"columbia_cairlab_pusht_real",
|
||||
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds",
|
||||
"austin_buds_dataset_converted_externally_to_rlds",
|
||||
"kuka",
|
||||
"utokyo_xarm_bimanual_converted_externally_to_rlds",
|
||||
"stanford_hydra_dataset_converted_externally_to_rlds",
|
||||
"maniskill_dataset_converted_externally_to_rlds",
|
||||
"ucsd_kitchen_dataset_converted_externally_to_rlds",
|
||||
"ucsd_pick_and_place_dataset_converted_externally_to_rlds",
|
||||
"austin_sailor_dataset_converted_externally_to_rlds",
|
||||
"austin_sirius_dataset_converted_externally_to_rlds",
|
||||
"bc_z",
|
||||
"utokyo_pr2_opening_fridge_converted_externally_to_rlds",
|
||||
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds",
|
||||
"utokyo_xarm_pick_and_place_converted_externally_to_rlds",
|
||||
"berkeley_mvp_converted_externally_to_rlds",
|
||||
"berkeley_rpt_converted_externally_to_rlds",
|
||||
"kaist_nonprehensile_converted_externally_to_rlds",
|
||||
"tokyo_u_lsmo_converted_externally_to_rlds",
|
||||
"dlr_sara_grid_clamp_converted_externally_to_rlds",
|
||||
"stanford_robocook_converted_externally_to_rlds",
|
||||
"imperialcollege_sawyer_wrist_cam",
|
||||
"iamlab_cmu_pickup_insert_converted_externally_to_rlds",
|
||||
"utaustin_mutex",
|
||||
"berkeley_fanuc_manipulation",
|
||||
"cmu_play_fusion",
|
||||
"language_table",
|
||||
"furniture_bench_dataset_converted_externally_to_rlds",
|
||||
"droid",
|
||||
"fmb",
|
||||
"dobbe",
|
||||
"qut_dexterous_manpulation",
|
||||
"aloha_mobile",
|
||||
"aloha_static",
|
||||
"roboset",
|
||||
"rh20t",
|
||||
"calvin",
|
||||
"bridgev2"
|
||||
]
|
||||
48
RDT/rdt170m-run/configs/pretrain_sample_weights.json
Normal file
48
RDT/rdt170m-run/configs/pretrain_sample_weights.json
Normal file
@ -0,0 +1,48 @@
|
||||
{
|
||||
"fractal20220817_data": 271,
|
||||
"taco_play": 60,
|
||||
"jaco_play": 33,
|
||||
"berkeley_cable_routing": 8,
|
||||
"nyu_door_opening_surprising_effectiveness": 10,
|
||||
"viola": 12,
|
||||
"berkeley_autolab_ur5": 32,
|
||||
"toto": 32,
|
||||
"kuka": 50,
|
||||
"language_table": 100,
|
||||
"columbia_cairlab_pusht_real": 12,
|
||||
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 55,
|
||||
"stanford_hydra_dataset_converted_externally_to_rlds": 24,
|
||||
"austin_buds_dataset_converted_externally_to_rlds": 7,
|
||||
"maniskill_dataset_converted_externally_to_rlds": 174,
|
||||
"furniture_bench_dataset_converted_externally_to_rlds": 71,
|
||||
"ucsd_kitchen_dataset_converted_externally_to_rlds": 12,
|
||||
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 37,
|
||||
"austin_sailor_dataset_converted_externally_to_rlds": 15,
|
||||
"austin_sirius_dataset_converted_externally_to_rlds": 24,
|
||||
"bc_z": 208,
|
||||
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 9,
|
||||
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 15,
|
||||
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
|
||||
"utokyo_xarm_bimanual_converted_externally_to_rlds": 1,
|
||||
"berkeley_mvp_converted_externally_to_rlds": 22,
|
||||
"berkeley_rpt_converted_externally_to_rlds": 30,
|
||||
"kaist_nonprehensile_converted_externally_to_rlds": 14,
|
||||
"tokyo_u_lsmo_converted_externally_to_rlds": 7,
|
||||
"dlr_sara_grid_clamp_converted_externally_to_rlds": 1,
|
||||
"stanford_robocook_converted_externally_to_rlds": 50,
|
||||
"imperialcollege_sawyer_wrist_cam": 13,
|
||||
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 25,
|
||||
"utaustin_mutex": 39,
|
||||
"berkeley_fanuc_manipulation": 20,
|
||||
"cmu_play_fusion": 24,
|
||||
"droid": 303,
|
||||
"fmb": 42,
|
||||
"dobbe": 36,
|
||||
"qut_dexterous_manpulation": 14,
|
||||
"aloha_mobile": 150,
|
||||
"aloha_static": 150,
|
||||
"roboset": 135,
|
||||
"rh20t": 331,
|
||||
"calvin": 100,
|
||||
"bridgev2": 224
|
||||
}
|
||||
126
RDT/rdt170m-run/configs/state_vec.py
Normal file
126
RDT/rdt170m-run/configs/state_vec.py
Normal file
@ -0,0 +1,126 @@
|
||||
STATE_VEC_IDX_MAPPING = {
|
||||
# [0, 10): right arm joint positions
|
||||
**{
|
||||
"arm_joint_{}_pos".format(i): i
|
||||
for i in range(10)
|
||||
},
|
||||
**{
|
||||
"right_arm_joint_{}_pos".format(i): i
|
||||
for i in range(10)
|
||||
},
|
||||
# [10, 15): right gripper joint positions
|
||||
**{
|
||||
"gripper_joint_{}_pos".format(i): i + 10
|
||||
for i in range(5)
|
||||
},
|
||||
**{
|
||||
"right_gripper_joint_{}_pos".format(i): i + 10
|
||||
for i in range(5)
|
||||
},
|
||||
"gripper_open": 10, # alias of right_gripper_joint_0_pos
|
||||
"right_gripper_open": 10,
|
||||
# [15, 25): right arm joint velocities
|
||||
**{
|
||||
"arm_joint_{}_vel".format(i): i + 15
|
||||
for i in range(10)
|
||||
},
|
||||
**{
|
||||
"right_arm_joint_{}_vel".format(i): i + 15
|
||||
for i in range(10)
|
||||
},
|
||||
# [25, 30): right gripper joint velocities
|
||||
**{
|
||||
"gripper_joint_{}_vel".format(i): i + 25
|
||||
for i in range(5)
|
||||
},
|
||||
**{
|
||||
"right_gripper_joint_{}_vel".format(i): i + 25
|
||||
for i in range(5)
|
||||
},
|
||||
"gripper_open_vel": 25, # alias of right_gripper_joint_0_vel
|
||||
"right_gripper_open_vel": 25,
|
||||
# [30, 33): right end effector positions
|
||||
"eef_pos_x": 30,
|
||||
"right_eef_pos_x": 30,
|
||||
"eef_pos_y": 31,
|
||||
"right_eef_pos_y": 31,
|
||||
"eef_pos_z": 32,
|
||||
"right_eef_pos_z": 32,
|
||||
# [33, 39): right end effector 6D pose
|
||||
"eef_angle_0": 33,
|
||||
"right_eef_angle_0": 33,
|
||||
"eef_angle_1": 34,
|
||||
"right_eef_angle_1": 34,
|
||||
"eef_angle_2": 35,
|
||||
"right_eef_angle_2": 35,
|
||||
"eef_angle_3": 36,
|
||||
"right_eef_angle_3": 36,
|
||||
"eef_angle_4": 37,
|
||||
"right_eef_angle_4": 37,
|
||||
"eef_angle_5": 38,
|
||||
"right_eef_angle_5": 38,
|
||||
# [39, 42): right end effector velocities
|
||||
"eef_vel_x": 39,
|
||||
"right_eef_vel_x": 39,
|
||||
"eef_vel_y": 40,
|
||||
"right_eef_vel_y": 40,
|
||||
"eef_vel_z": 41,
|
||||
"right_eef_vel_z": 41,
|
||||
# [42, 45): right end effector angular velocities
|
||||
"eef_angular_vel_roll": 42,
|
||||
"right_eef_angular_vel_roll": 42,
|
||||
"eef_angular_vel_pitch": 43,
|
||||
"right_eef_angular_vel_pitch": 43,
|
||||
"eef_angular_vel_yaw": 44,
|
||||
"right_eef_angular_vel_yaw": 44,
|
||||
# [45, 50): reserved
|
||||
# [50, 60): left arm joint positions
|
||||
**{
|
||||
"left_arm_joint_{}_pos".format(i): i + 50
|
||||
for i in range(10)
|
||||
},
|
||||
# [60, 65): left gripper joint positions
|
||||
**{
|
||||
"left_gripper_joint_{}_pos".format(i): i + 60
|
||||
for i in range(5)
|
||||
},
|
||||
"left_gripper_open": 60, # alias of left_gripper_joint_0_pos
|
||||
# [65, 75): left arm joint velocities
|
||||
**{
|
||||
"left_arm_joint_{}_vel".format(i): i + 65
|
||||
for i in range(10)
|
||||
},
|
||||
# [75, 80): left gripper joint velocities
|
||||
**{
|
||||
"left_gripper_joint_{}_vel".format(i): i + 75
|
||||
for i in range(5)
|
||||
},
|
||||
"left_gripper_open_vel": 75, # alias of left_gripper_joint_0_vel
|
||||
# [80, 83): left end effector positions
|
||||
"left_eef_pos_x": 80,
|
||||
"left_eef_pos_y": 81,
|
||||
"left_eef_pos_z": 82,
|
||||
# [83, 89): left end effector 6D pose
|
||||
"left_eef_angle_0": 83,
|
||||
"left_eef_angle_1": 84,
|
||||
"left_eef_angle_2": 85,
|
||||
"left_eef_angle_3": 86,
|
||||
"left_eef_angle_4": 87,
|
||||
"left_eef_angle_5": 88,
|
||||
# [89, 92): left end effector velocities
|
||||
"left_eef_vel_x": 89,
|
||||
"left_eef_vel_y": 90,
|
||||
"left_eef_vel_z": 91,
|
||||
# [92, 95): left end effector angular velocities
|
||||
"left_eef_angular_vel_roll": 92,
|
||||
"left_eef_angular_vel_pitch": 93,
|
||||
"left_eef_angular_vel_yaw": 94,
|
||||
# [95, 100): reserved
|
||||
# [100, 102): base linear velocities
|
||||
"base_vel_x": 100,
|
||||
"base_vel_y": 101,
|
||||
# [102, 103): base angular velocities
|
||||
"base_angular_vel": 102,
|
||||
# [103, 128): reserved
|
||||
}
|
||||
STATE_VEC_LEN = 128
|
||||
14
RDT/rdt170m-run/configs/zero2.json
Normal file
14
RDT/rdt170m-run/configs/zero2.json
Normal file
@ -0,0 +1,14 @@
|
||||
{
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 1e9
|
||||
}
|
||||
}
|
||||
2
RDT/rdt170m-run/data/.gitignore
vendored
Normal file
2
RDT/rdt170m-run/data/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
# Ignore data files
|
||||
datasets
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user