first update

This commit is contained in:
skyxz 2025-11-12 00:59:35 +08:00
commit c88bfcf840
264 changed files with 43806 additions and 0 deletions

174
.gitignore vendored Normal file
View File

@ -0,0 +1,174 @@
input/
output/
Temp/
weights/
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

View File

@ -0,0 +1,2 @@
input/*
output/*

40
ACT/act_export/Dockerfile Normal file
View File

@ -0,0 +1,40 @@
# Build image for exporting ACT policies to BPU-ready ONNX (ACT/act_export).
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
# Alternative mirror of the same base image:
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai
# Switch apt to the Tsinghua mirrors for faster builds on CN networks
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
    sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
# Python 3.10 from the deadsnakes PPA, plus shared libraries needed by
# OpenCV (libgl1, libglib, libsm6, libxext6) and video decoding (ffmpeg)
RUN apt-get update --allow-unauthenticated && apt-get install -y \
    software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y \
    python3.10 \
    python3.10-dev \
    python3-pip \
    python3.10-distutils \
    libgl1-mesa-glx \
    libglib2.0-0 \
    wget \
    ffmpeg \
    libsm6 \
    libxext6 \
    && rm -rf /var/lib/apt/lists/*
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
COPY . /app/
# Cache torch hub / torchvision weights inside the image
ENV TORCH_HOME=/app/weights/
RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install --ignore-installed -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
ENTRYPOINT ["python3", "export.py"]

465
ACT/act_export/export.py Normal file
View File

@ -0,0 +1,465 @@
import logging
import os
import sys
import shutil
import cv2
import numpy as np
import torch
import torch.nn as nn
import argparse
import onnx
import json
import yaml
from copy import deepcopy
from termcolor import colored
from onnxsim import simplify
from pprint import pformat
import time
from lerobot.policies.act.modeling_act import ACTPolicy
from lerobot.datasets.factory import make_dataset
from lerobot.utils.utils import get_safe_torch_device, init_logging
from lerobot.configs import parser
from lerobot.configs.train import TrainPipelineConfig
_global_config = None
BPU_VisionEncoder = "BPU_ACTPolicy_VisionEncoder"
BPU_TransformerLayers = "BPU_ACTPolicy_TransformerLayers"
def onnx_sim(onnx_path, onnx_sim):
    """Optionally simplify the ONNX model at ``onnx_path`` in place.

    The second argument is a flag (kept with its historical name): when it is
    falsy the model file is left untouched.
    """
    if not onnx_sim:
        return
    model = onnx.load(onnx_path)
    onnx.checker.check_model(model)
    simplified, ok = simplify(
        model,
        dynamic_input_shape=False,
        input_shapes=None)
    assert ok, 'assert check failed'
    onnx.save(simplified, onnx_path)
def load_config(config_path):
    """Load the task's JSON config and rewrite ``sys.argv`` for LeRobot's parser.

    Translates the ``export`` section of config.json into the ``--dataset.*``
    / ``--policy.*`` CLI flags that the ``@parser.wrap()``-decorated ``main``
    expects, then replaces ``sys.argv`` in place.

    Returns the parsed config dict, or None when no ``export`` section exists
    (in which case ``sys.argv`` is left unchanged).
    """
    # The config is plain JSON regardless of extension
    with open(config_path, 'r', encoding='utf-8') as f:
        config_dict = json.load(f)
    args = []
    if 'export' in config_dict:
        export_cfg = config_dict['export']
        if 'repo_id' in export_cfg:
            args.extend(['--dataset.repo_id', str(export_cfg['repo_id'])])
        if 'dataset_path' in export_cfg:
            args.extend(['--dataset.root', str(export_cfg['dataset_path'])])
        args.extend(['--policy.type', 'act'])
        # Any 'gpu_id' key at the top level selects CUDA; otherwise CPU
        args.extend(['--policy.device', 'cpu' if 'gpu_id' not in config_dict else f"cuda"])
        args.extend(['--policy.repo_id', str(export_cfg['repo_id'])])
        # Use pyav as the video backend to avoid torchcodec's FFmpeg
        # requirement (the original comment said "opencv" but the value set
        # here is 'pyav')
        args.extend(['--dataset.video_backend', 'pyav'])
        args.extend(['--wandb.enable', 'false'])
        # Keep the original script name as sys.argv[0], then append the flags
        sys.argv = [sys.argv[0]] + args
        logging.info(f"Loaded config from {config_path}")
        logging.info(f"Config: {sys.argv}")
        return config_dict
    return None
class BPU_ACTPolicy_VisionEncoder(nn.Module):
    """Standalone vision encoder extracted from an ACT policy for BPU export.

    Wraps deep copies of the policy's image backbone and the feature
    projection layer so the encoder can be traced/exported to ONNX
    independently of the full policy.
    """

    def __init__(self, act_policy):
        super().__init__()
        # deepcopy so ONNX export/tracing cannot mutate the original policy
        self.backbone = deepcopy(act_policy.model.backbone)
        self.encoder_img_feat_input_proj = deepcopy(act_policy.model.encoder_img_feat_input_proj)

    def forward(self, images):
        """Return the projected camera feature map for a batch of images."""
        cam_features = self.backbone(images)["feature_map"]
        # Fix: removed the redundant no-op self-assignment the original had
        # between projection and return.
        return self.encoder_img_feat_input_proj(cam_features)
class BPU_ACTPolicy_TransformerLayers(nn.Module):
    """Transformer encoder/decoder portion of an ACT policy, made ONNX-exportable.

    Consumes the robot state plus pre-computed per-camera vision features and
    returns the predicted action chunk.  The VAE latent is fixed to zeros,
    matching ACT's inference path where no latent is sampled.
    """

    def __init__(self, act_policy, camera_names):
        super().__init__()
        # deepcopy so export-time tracing cannot mutate the original policy
        self.model = deepcopy(act_policy.model)
        self.camera_names = camera_names

    def forward(self, states, *vision_features):
        """Return actions for one state vector and one feature map per camera."""
        # Zero latent: inference behaviour of ACT when the VAE latent is unused
        latent_sample = torch.zeros([1, self.model.config.latent_dim], dtype=torch.float32)
        encoder_in_tokens = [self.model.encoder_latent_input_proj(latent_sample)]
        encoder_in_pos_embed = self.model.encoder_1d_feature_pos_embed.weight.unsqueeze(1).unbind(dim=0)
        encoder_in_tokens.append(self.model.encoder_robot_state_input_proj(states))
        all_cam_features = []
        all_cam_pos_embeds = []
        # Handle the vision features of every camera dynamically
        for vision_feature in vision_features:
            cam_pos_embed = self.model.encoder_cam_feat_pos_embed(vision_feature)
            all_cam_features.append(vision_feature)
            all_cam_pos_embeds.append(cam_pos_embed)
        # Reshape the scalar tokens (latent, state) to (1, 1, dim_model) so
        # they concatenate with the flattened image tokens below
        tokens = []
        for token in encoder_in_tokens:
            tokens.append(token.view(1,1,self.model.config.dim_model))
        # Flatten the concatenated camera maps into a (seq, 1, dim_model) run
        all_cam_features = torch.cat(all_cam_features, axis=-1).permute(2, 3, 0, 1).view(-1,1,self.model.config.dim_model)
        tokens.append(all_cam_features)
        encoder_in_tokens = torch.cat(tokens, axis=0)
        # Mirror the same layout for the positional embeddings
        pos_embeds = []
        for pos_embed in encoder_in_pos_embed:
            pos_embeds.append(pos_embed.view(1,1,self.model.config.dim_model))
        all_cam_pos_embeds = torch.cat(all_cam_pos_embeds, axis=-1).permute(2, 3, 0, 1).view(-1,1,self.model.config.dim_model)
        pos_embeds.append(all_cam_pos_embeds)
        encoder_in_pos_embed = torch.cat(pos_embeds, axis=0)
        encoder_out = self.model.encoder(encoder_in_tokens, pos_embed=encoder_in_pos_embed)
        # Decoder queries start as zeros; positions come from learned embeddings
        decoder_in = torch.zeros(
            (self.model.config.chunk_size, 1, self.model.config.dim_model),
            dtype=encoder_in_pos_embed.dtype,
            device=encoder_in_pos_embed.device,
        )
        decoder_out = self.model.decoder(
            decoder_in,
            encoder_out,
            encoder_pos_embed=encoder_in_pos_embed,
            decoder_pos_embed=self.model.decoder_pos_embed.weight.unsqueeze(1),
        )
        # (seq, batch, dim) -> (batch, seq, dim) before the action head
        decoder_out = decoder_out.transpose(0, 1)
        actions = self.model.action_head(decoder_out)
        return actions
def lerobotTensor2cvmat(tensor):
    """Convert a LeRobot image tensor to a uint8 BGR OpenCV image.

    Expects a float tensor laid out (batch, C, H, W) and uses the first batch
    element.  Values are scaled by 255 -- presumably the input is normalized
    to [0, 1]; confirm against the dataset loader.
    """
    frames = (tensor * 255).permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8)
    rgb_frame = frames[0, :, :, :]
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
def onnx_sim(onnx_path, onnx_sim):
    # NOTE(review): this is a byte-identical duplicate of the onnx_sim defined
    # earlier in this file; this later definition shadows the first one.
    # Consider deleting one of the two.
    """Optionally simplify the ONNX model at ``onnx_path`` in place."""
    if onnx_sim:
        model_onnx = onnx.load(onnx_path)  # load onnx model
        onnx.checker.check_model(model_onnx)  # check onnx model
        model_onnx, check = simplify(
            model_onnx,
            dynamic_input_shape=False,
            input_shapes=None)
        assert check, 'assert check failed'
        onnx.save(model_onnx, onnx_path)
@parser.wrap()
def main(cfg: TrainPipelineConfig):
    """Export an ACT policy to two BPU-ready ONNX models plus calibration data.

    The policy is split into a vision encoder and the transformer layers, each
    exported to its own ONNX file.  Calibration samples are then dumped from
    the dataset in a march-specific format: ``.npy`` files for "nash" targets,
    raw binary ``.nchw`` dumps for "bayes" targets.  Requires the module-level
    ``_global_config`` to be populated by ``load_config`` before being called.
    """
    # cfg.validate() is intentionally skipped: this config drives an export
    # run, not a training run.
    # cfg.validate()
    logging.info(pformat(cfg.to_dict()))
    # BPU export parameters - read from the global config or the command line
    global _global_config
    class BPUOptions:
        # NOTE(review): these class attributes dereference _global_config at
        # class-creation time, so main() crashes right here if the config is
        # None -- the `if _global_config:` guard below can never be False.
        act_path = _global_config['export']['model_path']
        export_path = _global_config['export']['output_path'] + "/" + _global_config['task_id']
        cal_num = _global_config['export']['calibration_num']
        onnx_sim = True
        combine_jobs = 6
    opt = BPUOptions()
    if _global_config:
        opt.act_path = _global_config['export']['model_path']
        opt.export_path = _global_config['export']['output_path'] + "/" + _global_config['task_id']
        opt.cal_num = _global_config['export']['calibration_num']
        opt.onnx_sim = True
        # march string (contains "nash" or "bayes") selects the ONNX opset
        # and the calibration-dump format below
        opt.march = _global_config['export']['march']
        opt.combine_jobs = 6
        logging.info("BPU parameters loaded from config file")
    logging.info("="*80)
    logging.info(colored("BPU Export Configuration:", 'light_cyan'))
    logging.info(f" ACT Model Path: {opt.act_path}")
    logging.info(f" Export Path: {opt.export_path}")
    logging.info(f" Calibration Samples: {opt.cal_num}")
    logging.info(f" ONNX Simplify: {opt.onnx_sim}")
    logging.info(f" March: {opt.march}")
    logging.info(f" Compiler Jobs: {opt.combine_jobs}")
    logging.info(f" Dataset Root: {cfg.dataset.root}")
    logging.info("="*80)
    if not os.path.exists(opt.export_path):
        os.makedirs(opt.export_path)
    # Per-model workspaces and ONNX output paths
    visionEncoder_ws = os.path.join(opt.export_path, BPU_VisionEncoder)
    transformersLayers_ws = os.path.join(opt.export_path, BPU_TransformerLayers)
    onnx_name_BPU_ACTPolicy_VisionEncoder = BPU_VisionEncoder + ".onnx"
    onnx_path_BPU_ACTPolicy_VisionEncoder = os.path.join(visionEncoder_ws, onnx_name_BPU_ACTPolicy_VisionEncoder)
    onnx_name_BPU_ACTPolicy_TransformerLayers = BPU_TransformerLayers + ".onnx"
    onnx_path_BPU_ACTPolicy_TransformerLayers = os.path.join(transformersLayers_ws, onnx_name_BPU_ACTPolicy_TransformerLayers)
    ## Calibration data paths for the export
    calbrate_data_name_BPU_ACTPolicy_VisionEncoder = "calibration_data_" + BPU_VisionEncoder
    calbrate_data_path_BPU_ACTPolicy_VisionEncoder = os.path.join(visionEncoder_ws, calbrate_data_name_BPU_ACTPolicy_VisionEncoder)
    calbrate_data_name_BPU_ACTPolicy_TransformerLayers = "calibration_data_" + BPU_TransformerLayers
    calbrate_data_path_BPU_ACTPolicy_TransformerLayers = os.path.join(transformersLayers_ws, calbrate_data_name_BPU_ACTPolicy_TransformerLayers)
    state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, "state")
    ## Script paths inside the published output folder
    bpu_output_name = "bpu_output"
    bpu_output_path = os.path.join(opt.export_path, bpu_output_name)
    bash_build_all_path = os.path.join(opt.export_path, "build_all.sh")
    ## Pre/post-processing parameter file paths
    action_std_path = os.path.join(bpu_output_path, "action_std.npy")
    action_mean_path = os.path.join(bpu_output_path, "action_mean.npy")
    action_std_unnormalize_path = os.path.join(bpu_output_path, "action_std_unnormalize.npy")
    action_mean_unnormalize_path = os.path.join(bpu_output_path, "action_mean_unnormalize.npy")
    ## Create the working directories
    os.makedirs(visionEncoder_ws, exist_ok=True)
    logging.info(colored(f"mkdir: {visionEncoder_ws} Success.", 'green'))
    os.makedirs(transformersLayers_ws, exist_ok=True)
    logging.info(colored(f"mkdir: {transformersLayers_ws} Success.", 'green'))
    os.makedirs(calbrate_data_path_BPU_ACTPolicy_VisionEncoder, exist_ok=True)
    logging.info(colored(f"mkdir: {calbrate_data_path_BPU_ACTPolicy_VisionEncoder} Success.", 'green'))
    os.makedirs(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, exist_ok=True)
    logging.info(colored(f"mkdir: {calbrate_data_path_BPU_ACTPolicy_TransformerLayers} Success.", 'green'))
    os.makedirs(state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers, exist_ok=True)
    logging.info(colored(f"mkdir: {state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers} Success.", 'green'))
    os.makedirs(bpu_output_path, exist_ok=True)
    logging.info(colored(f"mkdir: {bpu_output_path} Success.", 'green'))
    # Load the policy on CPU in eval mode for deterministic export
    policy = ACTPolicy.from_pretrained(opt.act_path).cpu().eval()
    logging.info(colored(f"Load ACT Policy Model: {opt.act_path} Success.", 'light_red'))
    device = get_safe_torch_device(cfg.policy.device, log=True)
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    # Load the dataset used for calibration samples
    dataset = make_dataset(cfg)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        num_workers=0,
        batch_size=1,
        shuffle=True,
        sampler=None,
        pin_memory=device.type != "cpu",
        drop_last=False,
    )
    logging.info(colored(f"Load ACT Policy Dataset: \n{dataset} Success.", 'light_red'))
    batch = next(iter(dataloader))
    # Discover camera names from the observation keys of the first batch
    image_keys = [key for key in batch.keys() if key.startswith('observation.images.')]
    camera_names = [key.split('.')[-1] for key in image_keys]
    logging.info(colored(f"Camera Names: {camera_names} Success.", 'light_red'))
    logging.info(colored(f"Image Keys: {image_keys} Success.", 'light_red'))
    logging.info(colored(f"Batch: {batch} Success.", 'light_red'))
    # Sanity-check a forward pass through the original policy
    outputs = policy.select_action(deepcopy(batch))
    ## Dynamically extract pre/post-processing parameters
    # Save the normalization parameters of every camera
    for camera_name in camera_names:
        buffer_name = f"buffer_observation_images_{camera_name}"
        if hasattr(policy.normalize_inputs, buffer_name):
            buffer = getattr(policy.normalize_inputs, buffer_name)
            camera_std = buffer.std.data.detach().cpu().numpy()
            camera_mean = buffer.mean.data.detach().cpu().numpy()
            camera_std_path = os.path.join(bpu_output_path, f"{camera_name}_std.npy")
            camera_mean_path = os.path.join(bpu_output_path, f"{camera_name}_mean.npy")
            np.save(camera_std_path, camera_std)
            np.save(camera_mean_path, camera_mean)
            logging.info(f"Saved {camera_name} normalization parameters")
    # Save the state/action normalization parameters
    action_std = policy.normalize_inputs.buffer_observation_state.std.data.detach().cpu().numpy()
    action_mean = policy.normalize_inputs.buffer_observation_state.mean.data.detach().cpu().numpy()
    action_std_unnormalize = policy.unnormalize_outputs.buffer_action.std.data.detach().cpu().numpy()
    action_mean_unnormalize = policy.unnormalize_outputs.buffer_action.mean.data.detach().cpu().numpy()
    np.save(action_std_path, action_std)
    np.save(action_mean_path, action_mean)
    np.save(action_std_unnormalize_path, action_std_unnormalize)
    np.save(action_mean_unnormalize_path, action_mean_unnormalize)
    ## Vision Encoder
    batch = policy.normalize_inputs(batch)
    m_VisionEncoder = BPU_ACTPolicy_VisionEncoder(policy)
    m_VisionEncoder.eval()
    # Run the encoder once per camera to obtain example feature tensors
    vision_features = []
    for camera_name in camera_names:
        input_tensor = batch[f'observation.images.{camera_name}']
        vision_feature = m_VisionEncoder(input_tensor)
        vision_features.append(vision_feature)
        logging.info(f"Generated vision features for {camera_name}: {vision_feature.shape}")
    # Pick the ONNX opset per target march
    opset_version = 11 if "bayes" in opt.march else 19
    logging.info(f"Using ONNX opset version: {opset_version} for type: {opt.march}")
    onnx_path = onnx_path_BPU_ACTPolicy_VisionEncoder
    # NOTE(review): input_tensor is the LAST camera's tensor from the loop
    # above; all cameras are assumed to share one image shape -- TODO confirm.
    torch.onnx.export(
        m_VisionEncoder,  # model to convert
        input_tensor,  # example model input
        onnx_path,  # output file name
        export_params=True,  # store the trained parameters
        opset_version=opset_version,  # march-dependent ONNX opset
        do_constant_folding=True,  # run constant-folding optimization
        input_names=['images'],  # input node name
        output_names=['Vision_Features'],  # output node name
        dynamic_axes=None
    )
    onnx_sim(onnx_path, opt.onnx_sim)
    logging.info(colored(f"Export {onnx_path} Success.", 'green'))
    m_TransformerLayers = BPU_ACTPolicy_TransformerLayers(policy, camera_names)
    m_TransformerLayers.eval()
    state = batch["observation.state"]
    actions = m_TransformerLayers(state, *vision_features)
    # np.save(f"new_actions.npy", actions.detach().cpu().numpy())
    input_names = ['states'] + [f'{camera_name}_features' for camera_name in camera_names]
    logging.info(f"Transformer input names: {input_names}")
    onnx_path = onnx_path_BPU_ACTPolicy_TransformerLayers
    torch.onnx.export(
        m_TransformerLayers,  # model to convert
        (state, *vision_features),  # example model inputs
        onnx_path,  # output file name
        export_params=True,  # store the trained parameters
        opset_version=opset_version,  # march-dependent ONNX opset
        do_constant_folding=True,  # run constant-folding optimization
        input_names=input_names,  # dynamic input node names
        output_names=['Actions'],  # output node name
        dynamic_axes=None
    )
    onnx_sim(onnx_path, opt.onnx_sim)
    logging.info(colored(f"Export {onnx_path} Success.", 'green'))
    if "nash" in opt.march:
        ## Calibration data: create one directory per transformer input
        input_names_TransformerLayers = camera_names + ["state"]
        input_cal_path = []
        for input_name in input_names_TransformerLayers:
            p = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, input_name)
            input_cal_path.append(p)
            os.makedirs(p, exist_ok=True)
            logging.info(colored(f"mkdir: {p} Success.", 'green'))
        for i, batch in enumerate(dataloader):
            name = "%.10d.npy"%i
            batch = policy.normalize_inputs(batch)
            # Collect every camera input of this sample
            camera_inputs = {}
            for camera_name in camera_names:
                camera_inputs[camera_name] = batch[f'observation.images.{camera_name}']
            state_input = batch["observation.state"]
            ## VisionEncoder: save every 4th sample's raw camera tensors
            if i%4 == 0:
                for camera_name in camera_names:
                    p = os.path.join(calbrate_data_path_BPU_ACTPolicy_VisionEncoder, f"{camera_name}_" + name)
                    np.save(p, camera_inputs[camera_name].detach().cpu().numpy())
                    logging.info(colored(f"save to: {p}", 'light_blue'))
            ## TransformerLayers: save encoder features per camera plus the state
            for camera_name in camera_names:
                vision_feature = m_VisionEncoder(camera_inputs[camera_name])
                camera_cal_path = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, camera_name)
                p = os.path.join(camera_cal_path, name)
                np.save(p, vision_feature.detach().cpu().numpy())
                logging.info(colored(f"save to: {p}", 'light_magenta'))
            p = os.path.join(state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers, name)
            np.save(p, state_input.detach().cpu().numpy())
            logging.info(colored(f"save to: {p}", 'light_magenta'))
            if i >= opt.cal_num:
                break
    if "bayes" in opt.march:
        ## Calibration data: create one directory per transformer input
        input_names_TransformerLayers = camera_names + ["state"]
        input_cal_path = []
        for input_name in input_names_TransformerLayers:
            p = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, input_name)
            input_cal_path.append(p)
            os.makedirs(p, exist_ok=True)
            logging.info(colored(f"mkdir: {p} Success.", 'green'))
        for i, batch in enumerate(dataloader):
            # The bayes toolchain consumes raw binary dumps (.nchw), not .npy
            name = "%.10d.nchw"%i
            batch = policy.normalize_inputs(batch)
            # Collect every camera input of this sample
            camera_inputs = {}
            for camera_name in camera_names:
                camera_inputs[camera_name] = batch[f'observation.images.{camera_name}']
            state_input = batch["observation.state"]
            ## VisionEncoder: save every 4th sample's raw camera tensors (Bayes format)
            if i%4 == 0:
                for camera_name in camera_names:
                    p = os.path.join(calbrate_data_path_BPU_ACTPolicy_VisionEncoder, f"{camera_name}_" + name)
                    camera_inputs[camera_name].detach().cpu().numpy().tofile(p)
                    logging.info(colored(f"save to: {p}", 'light_blue'))
            ## TransformerLayers: save encoder features per camera plus the state (Bayes format)
            for camera_name in camera_names:
                vision_feature = m_VisionEncoder(camera_inputs[camera_name])
                camera_cal_path = os.path.join(calbrate_data_path_BPU_ACTPolicy_TransformerLayers, camera_name)
                p = os.path.join(camera_cal_path, name)
                vision_feature.detach().cpu().numpy().tofile(p)
                logging.info(colored(f"save to: {p}", 'light_magenta'))
            p = os.path.join(state_calbrate_data_path_BPU_ACTPolicy_TransformerLayers, name)
            state_input.detach().cpu().numpy().tofile(p)
            logging.info(colored(f"save to: {p}", 'light_magenta'))
            if i >= opt.cal_num:
                break
def generate_output_config(time_cost):
    """Write ``output.json`` into the export folder describing the artifacts.

    Records the task name, target march, wall-clock export duration, and the
    paths of the two exported ONNX models and their calibration-data folders.
    Requires the module-level ``_global_config`` to be populated.
    """
    global _global_config
    export_path = _global_config['export']['output_path'] + "/" + _global_config['task_id']
    # Per-model workspace directories mirror the layout created by main()
    transformer_dir = export_path + "/" + BPU_TransformerLayers
    vision_dir = export_path + "/" + BPU_VisionEncoder
    output_config = {
        "task_name": _global_config['task_id'],
        "march": _global_config['export']['march'],
        "time_cost": time_cost,
        "export_path": export_path,
        "TransformerLayers": transformer_dir + "/" + BPU_TransformerLayers + ".onnx",
        "TransformerLayers_calibration_data": transformer_dir + "/" + "calibration_data_" + BPU_TransformerLayers,
        "VisionEncoder": vision_dir + "/" + BPU_VisionEncoder + ".onnx",
        "VisionEncoder_calibration_data": vision_dir + "/" + "calibration_data_" + BPU_VisionEncoder,
    }
    with open(os.path.join(export_path, "output.json"), "w") as fh:
        json.dump(output_config, fh)
if __name__ == "__main__":
    init_logging()
    # Config is always read from the mounted input/ folder (see Dockerfile)
    config_path = "input/config.json"
    _global_config = load_config(config_path)
    # Time the whole export so it can be recorded in output.json
    time_start = time.time()
    main()
    time_end = time.time()
    time_cost = time_end - time_start
    logging.info(colored(f"Time Cost: {time_cost} seconds", 'light_red'))
    generate_output_config(time_cost)

View File

@ -0,0 +1,5 @@
lerobot==0.3.3
onnx
onnxsim
onnxruntime
av

View File

17
ACT/act_quant/Dockerfile Normal file
View File

@ -0,0 +1,17 @@
# Quantization image for ACT (ACT/act_quant): wraps the vendor AI toolchain.
# ARG BASE_IMAGE=ccr-29eug8s3-pub.cnc.bj.baidubce.com/deliver/ai_toolchain_ubuntu_20_x5_cpu:v1.2.8
ARG BASE_IMAGE=ccr-29eug8s3-pub.cnc.bj.baidubce.com/aitools/ai_toolchain_ubuntu_22_j6_gpu:v3.3.0
# The base image can be replaced via --build-arg BASE_IMAGE=...
FROM ${BASE_IMAGE}
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai
# Switch apt to the Tsinghua mirrors for faster builds on CN networks
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
    sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
COPY . /app/
ENTRYPOINT ["bash", "convert.sh"]

24
ACT/act_quant/convert.sh Normal file
View File

@ -0,0 +1,24 @@
# Compile the exported ONNX models into BPU binaries using the march-specific
# toolchain (hb_compile for nash targets, hb_mapper for bayes targets).
CONFIG=input/config.json
TASKID=$(python3 read_json.py $CONFIG task_id)
MARCH=$(python3 read_json.py $CONFIG quant.march)
OUTPUT=output/$TASKID
# Generate the resolved PTQ YAML files under $OUTPUT/ptq_yaml
python3 load_config.py $CONFIG
echo "Convert PTQ YAML Has Been Prepared"
VISIONENCODER_YAML=$OUTPUT/ptq_yaml/VisionEncoder.yaml
TRANSFORMERLAYERS_YAML=$OUTPUT/ptq_yaml/TransformerLayers.yaml
if [[ "$MARCH" == *"nash"* ]]; then
    echo -e "\033[44;37m===== Start Compiling TRANSFORMERLAYERS =====\033[0m"
    hb_compile --config $TRANSFORMERLAYERS_YAML
    # Fix: this banner previously said TRANSFORMERLAYERS again even though the
    # step below compiles the VisionEncoder.
    echo -e "\033[44;37m===== Start Compiling VISIONENCODER =====\033[0m"
    hb_compile --config $VISIONENCODER_YAML
    echo -e "\033[44;37m===== End Compiling Nash Model =====\033[0m"
else
    echo -e "\033[44;37m===== Start Compiling TRANSFORMERLAYERS =====\033[0m"
    hb_mapper makertbin --model-type onnx --config $TRANSFORMERLAYERS_YAML
    echo -e "\033[44;37m===== Start Compiling VISIONENCODER =====\033[0m"
    hb_mapper makertbin --model-type onnx --config $VISIONENCODER_YAML
    echo -e "\033[44;37m===== End Compiling Bayes Model =====\033[0m"
fi

View File

@ -0,0 +1,76 @@
import json
import yaml
import sys
import os
def load_config(config_path):
    """Prepare the PTQ YAML files for the BPU compiler from the task config.

    Reads the ``quant`` section of the config, fills the march-specific YAML
    templates for the VisionEncoder and TransformerLayers models, and writes
    the resolved YAMLs to ``<output_path>/<task_id>/ptq_yaml/``.

    NOTE(review): returns None implicitly; the ``__main__`` caller ignores the
    result.
    """
    with open(config_path, "r") as file:
        # JSON is a subset of YAML, so safe_load also parses config.json
        config = yaml.safe_load(file)
    if "quant" in config:
        quant_info = config["quant"]
        if "output_path" in quant_info:
            output_path = config["task_id"] and os.path.join(quant_info["output_path"], config["task_id"])
        if "output_path" in quant_info:
            output_path = os.path.join(quant_info["output_path"], config["task_id"])
        if "march" in quant_info:
            # Normalize the march string to the template folder name
            march = "nash" if "nash" in quant_info["march"] else "bayes"
        # NOTE(review): the template dir is spelled "pyq_yaml" while the output
        # dir below is "ptq_yaml" -- confirm the source directory name is
        # intended and not a typo.
        convert_yaml_path = f"pyq_yaml/{march}/"
        # prepare the nash and bayes bpu
        ## first prepare the VisionEncoder yaml
        VisionEncoder_yaml_path = os.path.join(convert_yaml_path, "VisionEncoder.yaml")
        with open(VisionEncoder_yaml_path, "r") as file:
            VisionEncoder_yaml = yaml.safe_load(file)
        VisionEncoder_yaml["model_parameters"]["onnx_model"] = quant_info["VisionEncoder"]["onnx_model"]
        VisionEncoder_yaml["calibration_parameters"]["cal_data_dir"] = quant_info["VisionEncoder"]["calibration_data"]
        VisionEncoder_yaml["model_parameters"]["march"] = quant_info["march"]
        # Make sure output ptq_yaml directory exists
        output_ptq_yaml_dir = os.path.join(output_path, "ptq_yaml")
        os.makedirs(output_ptq_yaml_dir, exist_ok=True)
        # Save VisionEncoder yaml to output/ptq_yaml
        VisionEncoder_yaml_save_path = os.path.join(output_ptq_yaml_dir, "VisionEncoder.yaml")
        with open(VisionEncoder_yaml_save_path, "w") as file:
            yaml.safe_dump(VisionEncoder_yaml, file)
        ## second prepare the TransformerLayers yaml
        TransformerLayers_yaml_path = os.path.join(convert_yaml_path, "TransformerLayers.yaml")
        with open(TransformerLayers_yaml_path, "r") as file:
            TransformerLayers_yaml = yaml.safe_load(file)
        TransformerLayers_yaml["model_parameters"]["onnx_model"] = quant_info["TransformerLayers"]["onnx_model"]
        TransformerLayers_yaml["model_parameters"]["march"] = quant_info["march"]
        TransformerLayers_Cal_dir = quant_info["TransformerLayers"]["calibration_data"]
        # (Fix cal_data_dir variable)
        cal_data_dir = TransformerLayers_Cal_dir
        # One calibration sub-directory exists per transformer input
        sub_dirs = [d for d in os.listdir(cal_data_dir) if os.path.isdir(os.path.join(cal_data_dir, d))]
        input_names = []
        for name in sub_dirs:
            if name == "state":
                input_names.append("states")
            else:
                input_names.append(f"{name}_features")
        # The toolchain expects ';'-separated lists with a trailing separator
        input_name_str = ";".join(input_names) + ";"
        TransformerLayers_yaml["input_parameters"]["input_name"] = input_name_str
        TransformerLayers_yaml["input_parameters"]["input_type_rt"] = "featuremap;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["input_layout_rt"] = "NCHW;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["input_type_train"] = "featuremap;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["input_layout_train"] = "NCHW;" * len(input_names)
        TransformerLayers_yaml["input_parameters"]["norm_type"] = "no_preprocess;" * len(input_names)
        TransformerLayers_yaml["calibration_parameters"]["cal_data_dir"] = ";".join([os.path.join(TransformerLayers_Cal_dir, name) for name in sub_dirs]) + ";"
        TransformerLayers_yaml["calibration_parameters"]["cal_data_type"] = "float32;" * len(input_names)
        # Save TransformerLayers yaml to output/ptq_yaml
        TransformerLayers_yaml_save_path = os.path.join(output_ptq_yaml_dir, "TransformerLayers.yaml")
        with open(TransformerLayers_yaml_save_path, "w") as file:
            yaml.safe_dump(TransformerLayers_yaml, file)
if __name__ == "__main__":
    # Usage: python load_config.py <config.json>
    config_path = sys.argv[1]
    config = load_config(config_path)  # NOTE(review): load_config returns None; result unused

View File

@ -0,0 +1,23 @@
# PTQ config template for the BPU_TransformerLayers model.
# The '{...}' values are placeholders: load_config.py loads this template and
# overwrites onnx_model, march and all input/calibration fields before the
# toolchain compiles the model.
model_parameters:
  onnx_model: '{onnx_name_BPU_ACTPolicy_TransformerLayers}'
  march: "{opt.type}"
  layer_out_dump: False
  working_dir: 'bpu_model_output'
  output_model_file_prefix: 'BPU_TransformerLayers'
input_parameters:
  input_name: "{input_name_str}"
  input_type_rt: '{input_type_str}'
  input_layout_rt: '{nchw_str}'
  input_type_train: '{input_type_str}'
  input_layout_train: '{nchw_str}'
  norm_type: '{norm_type_str}'
calibration_parameters:
  cal_data_dir: '{cal_data_dir_str}'
  cal_data_type: '{cal_data_type_str}'
  calibration_type: 'default'
  # Keep all nodes (incl. Softmax in/out) quantized at int16 for accuracy
  optimization: set_all_nodes_int16;set_Softmax_input_int16;set_Softmax_output_int16;
compiler_parameters:
  jobs: 6
  compile_mode: 'latency'
  debug: False
  optimize_level: 'O3'

View File

@ -0,0 +1,23 @@
# PTQ config template for the BPU_VisionEncoder model.
# NOTE(review): unlike the sibling templates, onnx_model and cal_data_dir here
# are plain strings without '{...}' braces; load_config.py overwrites both
# fields anyway, so this only matters if the template is ever used directly --
# confirm the inconsistency is intentional.
model_parameters:
  onnx_model: 'onnx_name_BPU_ACTPolicy_VisionEncoder'
  march: "opt.type"
  layer_out_dump: False
  working_dir: 'bpu_model_output'
  output_model_file_prefix: 'BPU_VisionEncoder'
input_parameters:
  # Empty input_name keeps the ONNX model's own input name
  input_name: ""
  input_type_rt: 'featuremap'
  input_layout_rt: 'NCHW'
  input_type_train: 'featuremap'
  input_layout_train: 'NCHW'
  norm_type: 'no_preprocess'
calibration_parameters:
  cal_data_dir: 'calbrate_data_name_BPU_ACTPolicy_VisionEncoder'
  cal_data_type: 'float32'
  calibration_type: 'default'
  # Keep all nodes (incl. Softmax in/out) quantized at int16 for accuracy
  optimization: set_all_nodes_int16;set_Softmax_input_int16;set_Softmax_output_int16;
compiler_parameters:
  jobs: 6
  compile_mode: 'latency'
  # NOTE(review): debug is true here but False in the TransformerLayers
  # template -- confirm this asymmetry is intended.
  debug: true
  optimize_level: 'O3'

View File

@ -0,0 +1,24 @@
# PTQ config template for the BPU_TransformerLayers model (O2 variant).
# The '{...}' values are placeholders: load_config.py loads this template and
# overwrites onnx_model, march and all input/calibration fields before the
# toolchain compiles the model.
model_parameters:
  onnx_model: '{onnx_name_BPU_ACTPolicy_TransformerLayers}'
  march: "{opt.type}"
  layer_out_dump: False
  working_dir: 'bpu_model_output'
  output_model_file_prefix: 'BPU_TransformerLayers'
input_parameters:
  input_name: "{input_name_str}"
  input_type_rt: '{input_type_str}'
  input_layout_rt: '{nchw_str}'
  input_type_train: '{input_type_str}'
  input_layout_train: '{nchw_str}'
  norm_type: '{norm_type_str}'
calibration_parameters:
  cal_data_dir: '{cal_data_dir_str}'
  cal_data_type: '{cal_data_type_str}'
  calibration_type: 'default'
  optimization: set_all_nodes_int16
compiler_parameters:
  # Skip input/output padding so buffers map 1:1 to the ONNX tensor layout
  extra_params: {'input_no_padding': True, 'output_no_padding': True}
  jobs: 6
  compile_mode: 'latency'
  debug: False
  optimize_level: 'O2'

View File

@ -0,0 +1,24 @@
# PTQ config template for the BPU_VisionEncoder model (O2 variant).
# The '{...}' values are placeholders: load_config.py loads this template and
# overwrites onnx_model, march and cal_data_dir before compilation.
model_parameters:
  onnx_model: '{onnx_name_BPU_ACTPolicy_VisionEncoder}'
  march: "{opt.type}"
  layer_out_dump: False
  working_dir: 'bpu_model_output'
  output_model_file_prefix: 'BPU_VisionEncoder'
input_parameters:
  # Empty input_name keeps the ONNX model's own input name
  input_name: ""
  input_type_rt: 'featuremap'
  input_layout_rt: 'NCHW'
  input_type_train: 'featuremap'
  input_layout_train: 'NCHW'
  norm_type: 'no_preprocess'
calibration_parameters:
  cal_data_dir: '{calbrate_data_name_BPU_ACTPolicy_VisionEncoder}'
  cal_data_type: 'float32'
  calibration_type: 'default'
  optimization: set_all_nodes_int16
compiler_parameters:
  # Skip input/output padding so buffers map 1:1 to the ONNX tensor layout
  extra_params: {'input_no_padding': True, 'output_no_padding': True}
  jobs: 6
  compile_mode: 'latency'
  # NOTE(review): debug is true here but False in the TransformerLayers
  # template -- confirm this asymmetry is intended.
  debug: true
  optimize_level: 'O2'

View File

@ -0,0 +1,42 @@
import json
import sys
def read_config(config_file, key_path):
    """Look up a (possibly nested) value in a JSON config file.

    Args:
        config_file: Path to the JSON config file.
        key_path: Dot-separated path to the key, e.g.
            "evaluation.checkpoint_path".

    Returns:
        The value found at the key path, or None when any intermediate node
        is missing or is not an object.
    """
    with open(config_file, 'r') as fh:
        node = json.load(fh)
    # Walk the dotted path one segment at a time
    for part in key_path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(part)
    return node
if __name__ == "__main__":
    # CLI: read_json.py <config_file> <key_path>
    # Prints the value on stdout, or exits 1 when the key is missing so shell
    # callers can detect the failure via the exit status.
    if len(sys.argv) < 3:
        print("Usage: python read_config.py <config_file> <key_path>", file=sys.stderr)
        sys.exit(1)
    config_file = sys.argv[1]
    key_path = sys.argv[2]
    value = read_config(config_file, key_path)
    if value is not None:
        print(value)
    else:
        # Empty stderr line plus nonzero exit signals "key not found"
        print("", file=sys.stderr)
        sys.exit(1)

2
RDT/README.md Normal file
View File

@ -0,0 +1,2 @@
# d-robotics-rdt

View File

@ -0,0 +1,2 @@
input/*
output/*

View File

@ -0,0 +1,43 @@
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
RUN apt-get update --allow-unauthenticated && apt-get install -y \
software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update \
&& apt-get install -y \
python3.10 \
python3.10-dev \
python3-pip \
python3.10-distutils \
libgl1-mesa-glx \
libglib2.0-0 \
wget \
libsm6 \
libxext6 \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
COPY . /app/
RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
# RUN pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
RUN pip install torch==2.1.0 torchvision==0.16.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install packaging==24.0
# RUN mkdir -p /app/dataset/input /app/dataset/output
ENTRYPOINT ["bash", "convert.sh"]

View File

@ -0,0 +1,60 @@
# convert.sh — read settings from input/config.json, run the LeRobot->RDT
# conversion, and write a result manifest on success.
BEGIN_TIME=$(date +%s)

CONFIG_FILE="input/config.json"
echo "CONFIG_FILE_PATH: $CONFIG_FILE"

# Read values directly from the config.json using python - no more nested key error by using a helper script
TASK_ID=$(python3 read_json.py "$CONFIG_FILE" "task_id")
DATA_DIR=$(python3 read_json.py "$CONFIG_FILE" "data_dir")
OUTPUT_DIR=$(python3 read_json.py "$CONFIG_FILE" "output_dir")
EPISODE_NUM=$(python3 read_json.py "$CONFIG_FILE" "episode_num")
GPU=$(python3 read_json.py "$CONFIG_FILE" "gpu")
T5_PATH="/weights/t5-v1_1-xxl"
NO_LANGUAGE=$(python3 read_json.py "$CONFIG_FILE" "no_language")

# For the camera keys, extract them in a way that avoids the error about 'images_info.key.*' not found
CAM_HIGH_KEY=$(python3 -c "import json; print(json.load(open('$CONFIG_FILE'))['images_info']['key'].get('cam_high', ''))")
CAM_RIGHT_WRIST_KEY=$(python3 -c "import json; print(json.load(open('$CONFIG_FILE'))['images_info']['key'].get('cam_right_wrist', ''))")

# create output path
if [ ! -d "$OUTPUT_DIR/$TASK_ID" ]; then
    mkdir -p "$OUTPUT_DIR/$TASK_ID"
    echo "Created output directory: $OUTPUT_DIR/$TASK_ID"
else
    echo "Output directory already exists: $OUTPUT_DIR/$TASK_ID"
fi

# Fix: quote every expansion below. Unquoted values were word-split, so a
# path containing spaces broke the command line, and an EMPTY camera key
# vanished entirely — making argparse consume the NEXT flag as its value.
if [ "$NO_LANGUAGE" = "true" ]; then
    python3 lerobot2rdt.py \
        --data_dir "$DATA_DIR" \
        --output_dir "$OUTPUT_DIR/$TASK_ID" \
        --episode_num "$EPISODE_NUM" \
        --gpu "$GPU" \
        --t5_path "$T5_PATH" \
        --cam_high_key "$CAM_HIGH_KEY" \
        --cam_right_wrist_key "$CAM_RIGHT_WRIST_KEY" \
        --no_language
    status=$?
else
    python3 lerobot2rdt.py \
        --data_dir "$DATA_DIR" \
        --output_dir "$OUTPUT_DIR/$TASK_ID" \
        --episode_num "$EPISODE_NUM" \
        --gpu "$GPU" \
        --t5_path "$T5_PATH" \
        --cam_high_key "$CAM_HIGH_KEY" \
        --cam_right_wrist_key "$CAM_RIGHT_WRIST_KEY"
    status=$?
fi

END_TIME=$(date +%s)
echo "END_TIME: $END_TIME"
echo "TOTAL_TIME: $((END_TIME - BEGIN_TIME))"

# Only generate the output manifest when the conversion succeeded.
if [ $status -eq 0 ]; then
    python3 generate_output.py "$CONFIG_FILE" $((END_TIME - BEGIN_TIME))
else
    echo "lerobot2rdt.py exited with status $status, skipping generate_output.py"
fi

View File

@ -0,0 +1,26 @@
import json
import os
import sys
def generate_output(input_config, time):
    """Write an output.json manifest summarizing a finished conversion run.

    Reads the run's input config, builds <output_dir>/<task_id>/ (created if
    missing), and writes output.json there echoing the run parameters plus
    the elapsed conversion time.

    Args:
        input_config: Path to the input config.json for the run.
        time: Total conversion time in seconds.
    """
    with open(input_config, "r") as f:
        data = json.load(f)
    output_dir_with_taskid = os.path.join(data["output_dir"], str(data["task_id"]))
    # Ensure the output directory exists before writing the output file
    os.makedirs(output_dir_with_taskid, exist_ok=True)
    output_data = {
        "task_id": data["task_id"],
        "convert_time": time,
        "data_dir": data["data_dir"],
        "output_dir": output_dir_with_taskid,
        "episode_num": data["episode_num"],
        # Robustness: "no_language" is optional in the config (convert.sh
        # treats anything but "true" as false) — don't KeyError when absent.
        "no_language": data.get("no_language", False),
    }
    output_json_path = os.path.join(output_dir_with_taskid, "output.json")
    with open(output_json_path, "w") as f:
        json.dump(output_data, f)
if __name__ == "__main__":
    # CLI: generate_output.py <config.json> <elapsed-seconds>
    config_path = sys.argv[1]
    elapsed_seconds = int(sys.argv[2])
    generate_output(config_path, elapsed_seconds)

View File

@ -0,0 +1,368 @@
#!/usr/bin/env python3
"""
LeRobot到RDT数据转换脚本
LeRobot机器人结构
- 5个关节 (shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll)
- 1个夹爪 (gripper)
- 总计6个自由度 (6DOF)
维度映射匹配RDT训练代码
- left_arm_dim = 0 (单臂机器人左臂不存在)
- right_arm_dim = 6 (5关节 + 1夹爪映射到RDT的right_arm部分)
- 状态向量6 [joint1, joint2, joint3, joint4, joint5, gripper]
- RDT索引映射right_arm_joint_0_pos到right_arm_joint_5_pos (索引0-5)
"""
import sys
import os
import h5py
import numpy as np
import cv2
import argparse
import yaml
import json
import subprocess
from pathlib import Path
import pandas as pd
import torch
current_dir = os.path.dirname(__file__)
sys.path.append(os.path.join(current_dir, ".."))
from models.multimodal_encoder.t5_encoder import T5Embedder
def extract_frames_from_video(video_path, output_dir, episode_idx):
    """Decode an mp4 into a list of 640x480 BGR frames via ffmpeg.

    Frames are dumped at 30 fps into a temporary directory under output_dir,
    read back with OpenCV, resized, and returned; the temporary files are
    always removed.

    Args:
        video_path: Path to the source video file.
        output_dir: Directory under which the temp frame folder is created.
        episode_idx: Episode index used to name the temp folder.

    Returns:
        List of numpy arrays (480x640x3); empty list on any failure.
    """
    if not os.path.exists(video_path):
        print(f" No video file: {video_path}")
        return []
    temp_dir = os.path.join(output_dir, f"temp_frames_{episode_idx}")
    os.makedirs(temp_dir, exist_ok=True)
    output_pattern = os.path.join(temp_dir, "frame_%04d.jpg")
    try:
        cmd = [
            'ffmpeg', '-i', video_path,
            '-vf', 'fps=30',
            '-q:v', '2',
            output_pattern,
            '-y'
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print(f" Failed to extract frames with ffmpeg: {result.stderr}")
            return []
        frames = []
        frame_files = sorted(f for f in os.listdir(temp_dir) if f.endswith('.jpg'))
        for frame_file in frame_files:
            frame = cv2.imread(os.path.join(temp_dir, frame_file))
            if frame is not None:
                frames.append(cv2.resize(frame, (640, 480)))
        print(f" Successfully extracted {len(frames)} frames")
        return frames
    except Exception as e:
        print(f" Error extracting frames: {e}")
        return []
    finally:
        # Fix: always remove the temp frame directory. The original cleaned
        # up only on the success path, leaking frames when ffmpeg failed or
        # an exception was raised mid-loop.
        if os.path.isdir(temp_dir):
            for leftover in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, leftover))
            os.rmdir(temp_dir)
def load_lerobot_episode(data_dir, episode_idx, output_dir, cam_high_key="high", cam_right_wrist_key="arm"):
    """Load one LeRobot episode: actions, joint states, and camera frames.

    LeRobot data layout:
    - action: 6-dim [shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll, gripper]
    - observation.state: 6-dim, same ordering
    - images: overhead ("high") camera + wrist ("arm") camera videos

    Args:
        data_dir: LeRobot dataset root (contains data/, videos/, meta/).
        episode_idx: Zero-based episode index.
        output_dir: Scratch location passed to extract_frames_from_video.
        cam_high_key / cam_right_wrist_key: Suffixes of the
            observation.images.* video folders to read from.

    Returns:
        Dict with 'actions' (N,6), 'qpos' (N,6), 'high_images', 'arm_images'
        and 'episode_length', or None when the episode parquet is missing.
    """
    parquet_path = os.path.join(data_dir, "data/chunk-000", f"episode_{episode_idx:06d}.parquet")
    if not os.path.exists(parquet_path):
        print(f"Episode {episode_idx} parquet file does not exist: {parquet_path}")
        return None
    df = pd.read_parquet(parquet_path)
    actions = []
    qpos = []
    # Normalize each per-step action/state to a float32 numpy vector
    # (parquet cells may come back as ndarrays or plain lists).
    for i in range(len(df)):
        action = df['action'].iloc[i]
        state = df['observation.state'].iloc[i]
        if isinstance(action, np.ndarray):
            actions.append(action.astype(np.float32))
        else:
            actions.append(np.array(action, dtype=np.float32))
        if isinstance(state, np.ndarray):
            qpos.append(state.astype(np.float32))
        else:
            qpos.append(np.array(state, dtype=np.float32))
    high_cam_path = os.path.join(data_dir, f"videos/chunk-000/observation.images.{cam_high_key}", f"episode_{episode_idx:06d}.mp4")
    arm_cam_path = os.path.join(data_dir, f"videos/chunk-000/observation.images.{cam_right_wrist_key}", f"episode_{episode_idx:06d}.mp4")
    print(f" Extracting high camera frames...")
    high_images = extract_frames_from_video(high_cam_path, output_dir, episode_idx)
    print(f" Extracting arm camera frames...")
    arm_images = extract_frames_from_video(arm_cam_path, output_dir, episode_idx)
    # Align frame counts with the parquet row count: truncate extras, and pad
    # short streams by repeating the last frame (only when any frame exists).
    target_frames = len(df)
    if len(high_images) > target_frames:
        high_images = high_images[:target_frames]
    if len(arm_images) > target_frames:
        arm_images = arm_images[:target_frames]
    while len(high_images) < target_frames and high_images:
        high_images.append(high_images[-1])
    while len(arm_images) < target_frames and arm_images:
        arm_images.append(arm_images[-1])
    return {
        'actions': np.array(actions),
        'qpos': np.array(qpos),
        'high_images': high_images,
        'arm_images': arm_images,
        'episode_length': len(df)
    }
def images_encoding(imgs):
    """JPEG-encode images and pad every buffer to a common length.

    Each image is encoded with OpenCV; buffers are right-padded with NUL
    bytes up to the longest encoding so they fit a fixed-width HDF5 string
    dataset (dtype S{max_len}).

    Args:
        imgs: List of BGR image arrays (may be empty).

    Returns:
        Tuple (padded_jpeg_buffers, max_len); ([], 0) for empty input.
    """
    if not imgs:
        return [], 0
    encode_data = []
    max_len = 0
    for i in range(len(imgs)):
        success, encoded_image = cv2.imencode(".jpg", imgs[i])
        if success:
            jpeg_data = encoded_image.tobytes()
            encode_data.append(jpeg_data)
            max_len = max(max_len, len(jpeg_data))
        else:
            print(f" Image encoding failed: {i}")
            encode_data.append(b"")
    # Bug fix: the padded buffers were computed but the UNPADDED list was
    # returned, so the padding work was dead code and the returned entries
    # did not honor the max_len contract. Return the padded buffers.
    padded_data = [data.ljust(max_len, b"\0") for data in encode_data]
    return padded_data, max_len
def load_task_instructions(data_dir):
    """Read task instruction strings from meta/tasks.jsonl.

    Returns the list of "task" fields in file order, or None when the
    tasks file does not exist.
    """
    tasks_file = os.path.join(data_dir, "meta/tasks.jsonl")
    if not os.path.exists(tasks_file):
        print(f"Warning: tasks file not found: {tasks_file}")
        return None
    instructions = []
    with open(tasks_file, 'r') as f:
        for raw_line in f:
            record = raw_line.strip()
            if not record:
                continue  # ignore blank lines
            instructions.append(json.loads(record)["task"])
    print(f" 加载了 {len(instructions)} 个任务指令")
    return instructions
def encode_language_instruction(instruction_text, t5_embedder, device):
    """Encode a single instruction with T5; fall back to zeros on failure.

    Returns a (num_valid_tokens, 4096) float numpy array, or a (1, 4096)
    zero array if the embedder raises.
    """
    try:
        embeds, mask = t5_embedder.get_text_embeddings([instruction_text])
        # keep only the non-padded token embeddings of the first sequence
        valid = embeds[0][mask[0]].float()
        return valid.cpu().numpy()
    except Exception as e:
        print(f" Language encoding failed: {e}")
        return np.zeros((1, 4096))
def convert_lerobot_to_rdt(data_dir, output_dir, episode_num, gpu=0, no_language=False, t5_path=None, cam_high_key="high", cam_right_wrist_key="arm"):
    """Convert LeRobot episodes 0..episode_num-1 into RDT-format HDF5 files.

    For each episode, writes output_dir/episode_i/episode_i.hdf5 holding the
    action sequence, qpos, JPEG-encoded camera frames, and per-step arm
    dimension metadata; optionally T5-encodes the first task instruction and
    saves it as instructions/lang_embed_0.pt beside the HDF5.

    Args:
        data_dir: LeRobot dataset root.
        output_dir: Destination directory for converted episodes.
        episode_num: Number of episodes to process, starting at index 0.
        gpu: CUDA device index for the T5 encoder (CPU if CUDA unavailable).
        no_language: Skip instruction encoding entirely.
        t5_path: Local path of pretrained T5 weights.
        cam_high_key / cam_right_wrist_key: Video folder name suffixes.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print(f"Start converting LeRobot data to RDT format...")
    print(f"Data source: {data_dir}")
    print(f"Output directory: {output_dir}")
    print(f"Processing episode number: {episode_num}")
    print(f"GPU device: {gpu}")
    scene_name = os.path.basename(data_dir)  # NOTE(review): never used below — TODO confirm it can be dropped
    instructions = None
    if not no_language:
        instructions = load_task_instructions(data_dir)
    t5_embedder = None
    if not no_language and instructions:
        try:
            print(f" Initializing T5 encoder...")
            t5_embedder = T5Embedder(
                from_pretrained=t5_path,
                device=f"cuda:{gpu}" if torch.cuda.is_available() else "cpu",
                model_max_length=1024,
                use_offload_folder=None,
            )
            print(f" T5 encoder initialized successfully")
        except Exception as e:
            # fall back to a language-free conversion instead of aborting
            print(f" T5 encoder initialization failed: {e}")
            print(f" Will skip language processing")
            no_language = True
    for i in range(episode_num):
        print(f"Processing episode {i}...")
        episode_data = load_lerobot_episode(data_dir, i, output_dir, cam_high_key=cam_high_key, cam_right_wrist_key=cam_right_wrist_key)
        if episode_data is None:
            print(f"Skipping episode {i}")
            continue
        episode_output_dir = os.path.join(output_dir, f"episode_{i}")
        if not os.path.exists(episode_output_dir):
            os.makedirs(episode_output_dir)
        hdf5_path = os.path.join(episode_output_dir, f"episode_{i}.hdf5")
        with h5py.File(hdf5_path, "w") as f:
            f.create_dataset("action", data=episode_data['actions'])
            obs = f.create_group("observations")
            obs.create_dataset("qpos", data=episode_data['qpos'])
            image = obs.create_group("images")
            if episode_data['high_images']:
                print(f" Encoding high camera images...")
                high_enc, len_high = images_encoding(episode_data['high_images'])
                if high_enc and len_high > 0:
                    # fixed-width byte strings; h5py NUL-pads to S{len_high}
                    image.create_dataset("cam_high", data=high_enc, dtype=f"S{len_high}")
                    print(f" Saved high camera images: {len(episode_data['high_images'])} frames")
                else:
                    print(f" Warning: High camera images encoding failed")
            if episode_data['arm_images']:
                print(f" Encoding arm camera images...")
                arm_enc, len_arm = images_encoding(episode_data['arm_images'])
                if arm_enc and len_arm > 0:
                    image.create_dataset("cam_right_wrist", data=arm_enc, dtype=f"S{len_arm}")
                    print(f" Saved arm camera images: {len(episode_data['arm_images'])} frames")
                else:
                    print(f" Warning: Arm camera images encoding failed")
            # Robot dimension info (LeRobot: 5 joints + 1 gripper).
            # Following process_data.py, dimensions are recorded per timestep.
            # LeRobot is a single-arm robot: right arm = 5 joints + 1 gripper
            # = 6 dims; left arm = 0 dims (absent on a single-arm robot).
            left_arm_dim = [0] * len(episode_data['actions'])   # left arm absent
            right_arm_dim = [6] * len(episode_data['actions'])  # 5 joints + gripper
            obs.create_dataset("left_arm_dim", data=np.array(left_arm_dim))
            obs.create_dataset("right_arm_dim", data=np.array(right_arm_dim))
        print(f" Episode {i} converted successfully: {hdf5_path}")
        print(f" Data length: {episode_data['episode_length']}")
        print(f" Action shape: {episode_data['actions'].shape}")
        print(f" Qpos shape: {episode_data['qpos'].shape}")
        print(f" High camera frames: {len(episode_data['high_images'])}")
        print(f" Arm camera frames: {len(episode_data['arm_images'])}")
        if not no_language and t5_embedder and instructions:
            print(f" Processing language instructions...")
            try:
                # every episode uses the first task's instruction
                instruction = instructions[0]
                language_features = encode_language_instruction(instruction, t5_embedder, f"cuda:{gpu}")
                instructions_dir = os.path.join(episode_output_dir, "instructions")
                if not os.path.exists(instructions_dir):
                    os.makedirs(instructions_dir)
                lang_embed_path = os.path.join(instructions_dir, "lang_embed_0.pt")
                torch.save(torch.from_numpy(language_features), lang_embed_path)
                print(f" Language instruction encoded successfully: {instruction}")
                print(f" Language features saved to: {lang_embed_path}")
                print(f" Language features shape: {language_features.shape}, data type: {language_features.dtype}")
            except Exception as e:
                print(f" Language instruction processing failed: {e}")
    print(f"\nConversion completed! Processed {episode_num} episodes")
    print(f"Output directory: {output_dir}")
def main():
    """CLI entry: validate inputs, check ffmpeg, and run the conversion.

    Exits early (returning None) when the data directory, meta file, or
    ffmpeg binary is missing; clamps --episode_num to the dataset size.
    """
    parser = argparse.ArgumentParser(description="Convert LeRobot data to RDT format")
    parser.add_argument("--data_dir", type=str, required=True,
                        help="LeRobot data directory path")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="Output directory path")
    parser.add_argument("--episode_num", type=int, default=10,
                        help="Number of episodes to process")
    parser.add_argument("--gpu", type=int, default=0,
                        help="GPU device ID")
    parser.add_argument("--no_language", action="store_true",
                        help="Skip language processing")
    parser.add_argument("--cam_high_key", type=str, default="cam_high",
                        help="High camera key")
    parser.add_argument("--cam_right_wrist_key", type=str, default="cam_right_wrist",
                        help="Right wrist camera key")
    # NOTE(review): accepted for CLI compatibility but currently unused —
    # convert_lerobot_to_rdt has no left-wrist camera parameter.
    parser.add_argument("--cam_left_wrist_key", type=str, default="cam_left_wrist",
                        help="Left wrist camera key")
    parser.add_argument("--t5_path", type=str, required=True,
                        help="T5 model path")
    args = parser.parse_args()
    if not os.path.exists(args.data_dir):
        print(f"Error: Data directory does not exist: {args.data_dir}")
        return
    meta_file = os.path.join(args.data_dir, "meta/info.json")
    if not os.path.exists(meta_file):
        print(f"Error: Meta information file not found: {meta_file}")
        return
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        print("ffmpeg is available, will use ffmpeg to extract video frames")
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Warning: ffmpeg is not available, image data may not be extracted correctly")
        print("Please install ffmpeg: conda install -c conda-forge ffmpeg=6.1")
        return
    # Fix: info.json is a JSON file — parse it with the json module instead
    # of yaml.safe_load (which only happened to work because YAML is a
    # superset of JSON).
    with open(meta_file, 'r') as f:
        meta_info = json.load(f)
    total_episodes = meta_info.get('total_episodes', 10)
    if args.episode_num > total_episodes:
        print(f"Warning: Requested episode number ({args.episode_num}) exceeds available number ({total_episodes})")
        args.episode_num = total_episodes
    # keyword arguments make the gpu/no_language/t5_path positions explicit
    convert_lerobot_to_rdt(
        args.data_dir,
        args.output_dir,
        args.episode_num,
        gpu=args.gpu,
        no_language=args.no_language,
        t5_path=args.t5_path,
        cam_high_key=args.cam_high_key,
        cam_right_wrist_key=args.cam_right_wrist_key,
    )


if __name__ == "__main__":
    main()

View File

View File

@ -0,0 +1,82 @@
# Reference: DiffusionPolicy [https://github.com/real-stanford/diffusion_policy]
import torch
from torch.nn.modules.batchnorm import _BatchNorm
class EMAModel:
    """
    Exponential Moving Average of models weights
    """

    def __init__(self, model, update_after_step=0, inv_gamma=1.0, power=2 / 3, min_value=0.0, max_value=0.9999):
        """
        @crowsonkb's notes on EMA Warmup:
        If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
        to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
        gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
        at 215.4k steps).

        Args:
            model: module whose weights are maintained as the EMA copy; it is
                put in eval mode, frozen, and mutated in place by step().
            update_after_step (int): steps to wait before EMA updates begin.
            inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
            power (float): Exponential factor of EMA warmup. Default: 2/3.
            min_value (float): The minimum EMA decay rate. Default: 0.
            max_value (float): The maximum EMA decay rate. Default: 0.9999.
        """
        self.averaged_model = model
        self.averaged_model.eval()
        self.averaged_model.requires_grad_(False)
        self.update_after_step = update_after_step
        self.inv_gamma = inv_gamma
        self.power = power
        self.min_value = min_value
        self.max_value = max_value
        self.decay = 0.0
        self.optimization_step = 0

    def get_decay(self, optimization_step):
        """
        Compute the decay factor for the exponential moving average.
        """
        step = max(0, optimization_step - self.update_after_step - 1)
        if step <= 0:
            # still warming up: the EMA copy simply tracks the live model
            return 0.0
        value = 1 - (1 + step / self.inv_gamma)**-self.power
        return max(self.min_value, min(value, self.max_value))

    @torch.no_grad()
    def step(self, new_model):
        """Blend new_model's parameters into the averaged model in place.

        BatchNorm modules and non-trainable parameters are copied verbatim;
        trainable parameters are updated as ema = decay*ema + (1-decay)*new.
        (Dead debug code tracking parameter data pointers was removed.)
        """
        self.decay = self.get_decay(self.optimization_step)
        for module, ema_module in zip(new_model.modules(), self.averaged_model.modules()):
            for param, ema_param in zip(module.parameters(recurse=False), ema_module.parameters(recurse=False)):
                # iterate over immediate parameters only.
                if isinstance(param, dict):
                    raise RuntimeError('Dict parameter not supported')
                if isinstance(module, _BatchNorm):
                    # skip batchnorms: copy their params instead of averaging
                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
                elif not param.requires_grad:
                    ema_param.copy_(param.to(dtype=ema_param.dtype).data)
                else:
                    ema_param.mul_(self.decay)
                    ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=1 - self.decay)
        self.optimization_step += 1

View File

@ -0,0 +1,75 @@
import os
from pathlib import Path
from typing import Dict, Optional, Union
from huggingface_hub import PyTorchModelHubMixin
from huggingface_hub.constants import (PYTORCH_WEIGHTS_NAME, SAFETENSORS_SINGLE_FILE)
from huggingface_hub.file_download import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError, is_torch_available
if is_torch_available():
import torch # type: ignore
class CompatiblePyTorchModelHubMixin(PyTorchModelHubMixin):
    """Mixin class to load Pytorch models from the Hub.

    Differs from the stock mixin in that saving always writes a pickle
    checkpoint (pytorch_model.bin) rather than a safetensors file, while
    loading prefers safetensors and falls back to the pickle file.
    """

    def _save_pretrained(self, save_directory: Path) -> None:
        """Save weights from a Pytorch model to a local directory."""
        # To bypass saving into safetensor by default
        model_to_save = self.module if hasattr(self, "module") else self  # type: ignore
        torch.save(model_to_save.state_dict(), save_directory / PYTORCH_WEIGHTS_NAME)

    @classmethod
    def _from_pretrained(
        cls,
        *,
        model_id: str,
        revision: Optional[str],
        cache_dir: Optional[Union[str, Path]],
        force_download: bool,
        proxies: Optional[Dict],
        resume_download: Optional[bool],
        local_files_only: bool,
        token: Union[str, bool, None],
        map_location: str = "cpu",
        strict: bool = False,
        **model_kwargs,
    ):
        """Load Pytorch pretrained weights and return the loaded model.

        `model_id` may be a local directory or a Hub repo id; in both cases
        a safetensors checkpoint is tried first, then the pickle weights.
        """
        # model_kwargs must fully specify the architecture constructor.
        model = cls(**model_kwargs)
        if os.path.isdir(model_id):
            print("Loading weights from local directory")
            try:
                model_file = os.path.join(model_id, SAFETENSORS_SINGLE_FILE)
                return cls._load_as_safetensor(model, model_file, map_location, strict)
            except FileNotFoundError:
                # no model.safetensors present -> fall back to pytorch_model.bin
                model_file = os.path.join(model_id, PYTORCH_WEIGHTS_NAME)
                return cls._load_as_pickle(model, model_file, map_location, strict)
        else:
            try:
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=SAFETENSORS_SINGLE_FILE,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_safetensor(model, model_file, map_location, strict)
            except EntryNotFoundError:
                # repo has no safetensors file -> download the pickle weights
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=PYTORCH_WEIGHTS_NAME,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_pickle(model, model_file, map_location, strict)

View File

@ -0,0 +1,159 @@
import torch
import torch.nn as nn
from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
class CLIPVisionTower(nn.Module):
    """Frozen CLIP ViT image encoder exposing intermediate-layer features.

    Features are taken from the hidden state at ``args.mm_vision_select_layer``
    and either stripped of the CLS token ('patch') or kept whole ('cls_patch').
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()
        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            # tower will be trained, so its weights must be materialized now
            self.load_model()
        else:
            # defer loading; keep only the config so shape properties work
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the image processor and the frozen CLIP vision model."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Select the configured hidden layer and token subset."""
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            # drop the CLS token, keep patch tokens only
            image_features = image_features[:, 1:]
        elif self.select_feature == 'cls_patch':
            image_features = image_features
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor — or a list of single images — to features."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
                                                      output_hidden_states=True)
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                                   output_hidden_states=True)
            image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @property
    def dummy_feature(self):
        # zero feature with the right width/device/dtype (placeholder input)
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # before load_model(), fall back to the standalone config
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2
class CLIPVisionTowerS2(CLIPVisionTower):
    """CLIP tower with S2 multi-scale forward (scaling_on_scales wrapper).

    Images are encoded at each resolution in ``args.s2_scales`` (split into
    tiles of the smallest scale) and the per-scale features are combined by
    the s2wrapper, so hidden_size grows by len(s2_scales).
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__(vision_tower, args, delay_load)
        # e.g. '336,672,1008' -> [336, 672, 1008] (sorted ascending)
        self.s2_scales = getattr(args, 's2_scales', '336,672,1008')
        self.s2_scales = list(map(int, self.s2_scales.split(',')))
        self.s2_scales.sort()
        self.s2_split_size = self.s2_scales[0]   # tile size = smallest scale
        self.s2_image_size = self.s2_scales[-1]  # input size = largest scale
        try:
            from s2wrapper import forward as multiscale_forward
        except ImportError:
            raise ImportError(
                'Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git'
            )
        self.multiscale_forward = multiscale_forward
        # change resize/crop size in preprocessing to the largest image size in s2_scale
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.image_processor.size['shortest_edge'] = self.s2_image_size
            self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

    def load_model(self, device_map=None):
        """Load processor and frozen model, resized for the largest S2 scale."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)
        self.image_processor.size['shortest_edge'] = self.s2_image_size
        self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size
        self.is_loaded = True

    @torch.no_grad()
    def forward_feature(self, images):
        """Single-scale encode used as the callback for multiscale_forward."""
        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                               output_hidden_states=True)
        image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Multi-scale encode of a batched tensor or a list of single images."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_feature = self.multiscale_forward(self.forward_feature,
                                                        image.unsqueeze(0),
                                                        img_sizes=self.s2_scales,
                                                        max_split_size=self.s2_split_size)
                image_features.append(image_feature)
        else:
            image_features = self.multiscale_forward(self.forward_feature,
                                                    images,
                                                    img_sizes=self.s2_scales,
                                                    max_split_size=self.s2_split_size)
        return image_features

    @property
    def hidden_size(self):
        # per-scale features are concatenated along the channel dimension
        return self.config.hidden_size * len(self.s2_scales)

View File

@ -0,0 +1,87 @@
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoImageProcessor, AutoModel, Dinov2Model
class DinoV2VisionTower(nn.Module):
    """Frozen DINOv2 image encoder returning last-hidden-state features.

    'patch' strips the first token from last_hidden_state; 'cls_patch'
    keeps all tokens.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()
        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            # tower will be trained, so it must be materialized now
            self.load_model()
        else:
            # defer loading; keep only the config so shape properties work
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the image processor and the frozen DINOv2 model."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return
        self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)  # FIXME:
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Select token subset from the model's last hidden state."""
        image_features = image_forward_outs.last_hidden_state
        if self.select_feature == 'patch':
            # drop the leading (CLS) token — presumably; verify for this checkpoint
            image_features = image_features[:, 1:]  # (B, 1369, 1536)
        elif self.select_feature == 'cls_patch':
            image_features = image_features  # (B, 1, 1536)
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor — or a list of single images — to features."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
            image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @property
    def dummy_feature(self):
        # zero feature with the right width/device/dtype (placeholder input)
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # before load_model(), fall back to the standalone config
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2

View File

@ -0,0 +1,86 @@
import torch
import torch.nn as nn
from transformers import AutoConfig, SiglipImageProcessor, SiglipVisionModel
class SiglipVisionTower(nn.Module):
    """Frozen SigLIP image encoder.

    'patch' returns the full last_hidden_state; 'cls_patch' returns the
    pooled output instead.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()
        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            # tower will be trained, so it must be materialized now
            self.load_model()
        else:
            # defer loading; keep only the config so shape properties work
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the image processor and the SigLIP vision model.

        NOTE(review): unlike the CLIP/DINOv2 towers, this one only calls
        eval() and does not requires_grad_(False) — confirm intentional.
        """
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return
        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.eval()
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Select either token features or the pooled embedding."""
        if self.select_feature == 'patch':
            image_features = image_forward_outs.last_hidden_state  # (B, 729, 1536)
        elif self.select_feature == 'cls_patch':
            image_features = image_forward_outs.pooler_output  # (B, 1, 1536)
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor — or a list of single images — to features."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
            image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @property
    def dummy_feature(self):
        # zero feature with the right width/device/dtype (placeholder input)
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # before load_model(), fall back to the standalone config
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size)**2

View File

@ -0,0 +1,111 @@
import torch
from transformers import AutoTokenizer, T5EncoderModel
class T5Embedder:
    """HuggingFace T5 encoder + tokenizer wrapper for text embeddings.

    When ``use_offload_folder`` is given, the upper encoder blocks are
    offloaded to disk via the device_map so the model fits on small GPUs.
    """
    # available_models = ["google/t5-v1_1-xxl"]

    def __init__(
        self,
        device,
        from_pretrained=None,
        *,
        cache_dir=None,
        hf_token=None,
        use_text_preprocessing=True,
        t5_model_kwargs=None,
        torch_dtype=None,
        use_offload_folder=None,
        model_max_length=120,
        local_files_only=False,
    ):
        """
        Args:
            device: torch device spec for the encoder (e.g. 'cuda:0').
            from_pretrained: model path or hub id of the T5 checkpoint.
            cache_dir: HF cache directory.
            hf_token: HF auth token (stored; not forwarded here).
            use_text_preprocessing: stored flag; not used in this class.
            t5_model_kwargs: overrides for T5EncoderModel.from_pretrained;
                when None a default (low_cpu_mem_usage + device_map) is built.
            torch_dtype: weight dtype; defaults to bfloat16 to halve memory.
            use_offload_folder: directory for disk offload of upper blocks.
            model_max_length: tokenizer truncation length.
            local_files_only: forbid network downloads when True.
        """
        # from_pretrained="google/t5-v1_1-xxl" # zijian
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        self.cache_dir = cache_dir
        if t5_model_kwargs is None:
            t5_model_kwargs = {
                "low_cpu_mem_usage": True,
                "torch_dtype": self.torch_dtype,
            }
            if use_offload_folder is not None:
                # Keep embeddings and the first 12 encoder blocks on the
                # target device; page the remaining blocks out to disk.
                t5_model_kwargs["offload_folder"] = use_offload_folder
                t5_model_kwargs["device_map"] = {
                    "shared": self.device,
                    "encoder.embed_tokens": self.device,
                    "encoder.block.0": self.device,
                    "encoder.block.1": self.device,
                    "encoder.block.2": self.device,
                    "encoder.block.3": self.device,
                    "encoder.block.4": self.device,
                    "encoder.block.5": self.device,
                    "encoder.block.6": self.device,
                    "encoder.block.7": self.device,
                    "encoder.block.8": self.device,
                    "encoder.block.9": self.device,
                    "encoder.block.10": self.device,
                    "encoder.block.11": self.device,
                    "encoder.block.12": "disk",
                    "encoder.block.13": "disk",
                    "encoder.block.14": "disk",
                    "encoder.block.15": "disk",
                    "encoder.block.16": "disk",
                    "encoder.block.17": "disk",
                    "encoder.block.18": "disk",
                    "encoder.block.19": "disk",
                    "encoder.block.20": "disk",
                    "encoder.block.21": "disk",
                    "encoder.block.22": "disk",
                    "encoder.block.23": "disk",
                    "encoder.final_layer_norm": "disk",
                    "encoder.dropout": "disk",
                }
            else:
                # whole encoder on one device
                t5_model_kwargs["device_map"] = {
                    "shared": self.device,
                    "encoder": self.device,
                }
        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token
        # assert from_pretrained in self.available_models
        self.tokenizer = AutoTokenizer.from_pretrained(
            from_pretrained,
            model_max_length=model_max_length,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.model = T5EncoderModel.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            **t5_model_kwargs,
        ).eval()
        self.model_max_length = model_max_length

    def get_text_embeddings(self, texts):
        """Tokenize `texts` and return (last_hidden_state, attention_mask).

        Sequences are padded to the longest in the batch and truncated to
        model_max_length; embeddings are computed under no_grad.
        """
        text_tokens_and_mask = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding="longest",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        input_ids = text_tokens_and_mask["input_ids"].to(self.device)
        attention_mask = text_tokens_and_mask["attention_mask"].to(self.device)
        with torch.no_grad():
            text_encoder_embs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )["last_hidden_state"].detach()
        return text_encoder_embs, attention_mask
if __name__ == "__main__":
    # Smoke test: instantiate the embedder from the public checkpoint on GPU 7.
    T5Embedder(from_pretrained="google/t5-v1_1-xxl", device='cuda:7')

View File

@ -0,0 +1,304 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT: https://github.com/facebookresearch/DiT
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import math
from collections import OrderedDict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.jit import Final
from timm.models.vision_transformer import Attention, Mlp, RmsNorm, use_fused_attn
#################################################################################
# Embedding Layers for Timesteps and Condition Inputs #
#################################################################################
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.

    A fixed sinusoidal encoding of the timestep is passed through a small
    two-layer SiLU MLP to produce a learnable embedding of ``hidden_size``.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=torch.bfloat16):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.dtype = dtype

    def timestep_embedding(self, t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.

        :param t: a 1-D Tensor of N indices, one per batch element.
            These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings, cast to ``self.dtype``.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half_dim = dim // 2
        exponents = torch.arange(start=0, end=half_dim, dtype=torch.float32, device=t.device) / half_dim
        frequencies = torch.exp(-math.log(max_period) * exponents)
        phase = t[:, None].float() * frequencies[None]
        embedding = torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
        if dim % 2 == 1:
            # Odd target dimension: pad with a single zero column.
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding.to(self.dtype)

    def forward(self, t):
        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
#################################################################################
# Cross Attention Layers #
#################################################################################
class CrossAttention(nn.Module):
    """
    A cross-attention layer with flash attention.

    Queries come from the input sequence ``x``; keys and values come from the
    condition sequence ``c``.  An optional boolean ``mask`` (True = valid
    condition token) restricts which condition tokens may be attended to.
    """
    fused_attn: Final[bool]

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0,
        proj_drop: float = 0,
        norm_layer: nn.Module = nn.LayerNorm,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        # Use PyTorch's fused scaled_dot_product_attention when timm says it is available.
        self.fused_attn = use_fused_attn()
        # Separate projections: queries from x, packed keys/values from c.
        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor, c: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
        # x: (B, N, C) query tokens; c: (B, L, C) condition tokens.
        B, N, C = x.shape
        _, L, _ = c.shape
        # (B, num_heads, N, head_dim)
        q = self.q(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        # (2, B, num_heads, L, head_dim), then unbind into k and v.
        kv = self.kv(c).reshape(B, L, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        k, v = kv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)
        # Prepare attn mask (B, L) to mask the condition:
        # broadcast to (B, 1, N, L) so every head and query row shares it.
        if mask is not None:
            mask = mask.reshape(B, 1, 1, L)
            mask = mask.expand(-1, -1, N, -1)
        if self.fused_attn:
            x = F.scaled_dot_product_attention(query=q,
                                               key=k,
                                               value=v,
                                               dropout_p=self.attn_drop.p if self.training else 0.,
                                               attn_mask=mask)
        else:
            # Manual fallback: scale, score, mask, softmax, dropout, weighted sum.
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            if mask is not None:
                # Invalid (False) condition positions get -inf before softmax.
                attn = attn.masked_fill_(mask.logical_not(), float('-inf'))
            attn = attn.softmax(dim=-1)
            if self.attn_drop.p > 0:
                attn = self.attn_drop(attn)
            x = attn @ v
        # Merge heads back to (B, N, C) and apply the output projection.
        x = x.permute(0, 2, 1, 3).reshape(B, N, C)
        x = self.proj(x)
        if self.proj_drop.p > 0:
            x = self.proj_drop(x)
        return x
#################################################################################
# RDT Block #
#################################################################################
class RDTBlock(nn.Module):
    """
    A RDT block with cross-attention conditioning.

    Pre-norm transformer block: self-attention, then cross-attention over the
    condition tokens, then a feed-forward network — each sub-layer applied to
    the normalized input and added back to the residual stream.
    """

    def __init__(self, hidden_size, num_heads, **block_kwargs):
        super().__init__()
        self.norm1 = RmsNorm(hidden_size, eps=1e-6)
        self.attn = Attention(dim=hidden_size,
                              num_heads=num_heads,
                              qkv_bias=True,
                              qk_norm=True,
                              norm_layer=RmsNorm,
                              **block_kwargs)
        self.cross_attn = CrossAttention(hidden_size,
                                         num_heads=num_heads,
                                         qkv_bias=True,
                                         qk_norm=True,
                                         norm_layer=RmsNorm,
                                         **block_kwargs)
        self.norm2 = RmsNorm(hidden_size, eps=1e-6)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.ffn = Mlp(in_features=hidden_size, hidden_features=hidden_size, act_layer=approx_gelu, drop=0)
        self.norm3 = RmsNorm(hidden_size, eps=1e-6)

    def forward(self, x, c, mask=None):
        # Self-attention, cross-attention, FFN — each with a residual add.
        x = x + self.attn(self.norm1(x))
        x = x + self.cross_attn(self.norm2(x), c, mask)
        x = x + self.ffn(self.norm3(x))
        return x
class FinalLayer(nn.Module):
    """
    The final layer of RDT.

    Normalizes the hidden states and projects them to ``out_channels``
    through a small tanh-GELU MLP.
    """

    def __init__(self, hidden_size, out_channels):
        super().__init__()
        self.norm_final = RmsNorm(hidden_size, eps=1e-6)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.ffn_final = Mlp(in_features=hidden_size,
                             hidden_features=hidden_size,
                             out_features=out_channels,
                             act_layer=approx_gelu,
                             drop=0)

    def forward(self, x):
        return self.ffn_final(self.norm_final(x))
#################################################################################
# Sine/Cosine Positional Embedding Functions #
#################################################################################
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Compute 1-D sine/cosine positional embeddings.

    embed_dim: output dimension for each position (must be even)
    pos: a list/array of positions to be encoded: size (M,)
    out: (M, D) array — first half sine, second half cosine
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000**omega  # (D/2,)
    if not isinstance(pos, np.ndarray):
        pos = np.array(pos, dtype=np.float64)
    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)
    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


def get_nd_sincos_pos_embed_from_grid(embed_dim, grid_sizes):
    """
    Compute N-D sine/cosine positional embeddings.

    embed_dim: output dimension for each position
    grid_sizes: the grid sizes in each dimension (K,); any sequence
    out: (grid_sizes[0], ..., grid_sizes[K-1], D)

    The embedding dimension is split evenly across the dimensions whose grid
    size is larger than 1; size-1 dimensions receive no positional signal.
    """
    # Accept lists as well as tuples (the original `grid_sizes + (embed_dim,)`
    # raised TypeError for a list argument).
    grid_sizes = tuple(grid_sizes)
    num_sizes = len(grid_sizes)
    # For grid size of 1, we do not need to add any positional embedding
    num_valid_sizes = len([x for x in grid_sizes if x > 1])
    emb = np.zeros(grid_sizes + (embed_dim, ))
    # All dimensions trivial: nothing to embed (the original divided by zero).
    if num_valid_sizes == 0:
        return emb
    # Uniformly divide the embedding dimension for each grid size
    dim_for_each_grid = embed_dim // num_valid_sizes
    # To make it even
    if dim_for_each_grid % 2 != 0:
        dim_for_each_grid -= 1
    valid_size_idx = 0
    for size_idx in range(num_sizes):
        grid_size = grid_sizes[size_idx]
        if grid_size <= 1:
            continue
        pos = np.arange(grid_size)
        # Shape the 1-D embedding so it broadcasts along this axis only.
        posemb_shape = [1] * len(grid_sizes) + [dim_for_each_grid]
        posemb_shape[size_idx] = -1
        emb[..., valid_size_idx * dim_for_each_grid:(valid_size_idx + 1) * dim_for_each_grid] += \
            get_1d_sincos_pos_embed_from_grid(dim_for_each_grid, pos).reshape(posemb_shape)
        valid_size_idx += 1
    return emb
def get_multimodal_cond_pos_embed(embed_dim, mm_cond_lens: OrderedDict, embed_modality=True):
    """
    Generate position embeddings for multimodal conditions.
    mm_cond_lens: an OrderedDict containing
        (modality name, modality token length) pairs.
        For `"image"` modality, the value can be a multi-dimensional tuple.
        If the length < 0, it means there is no position embedding for the modality or grid.
    embed_modality: whether to embed the modality information. Default is True.
    return: (total_token_count, embed_dim) numpy array, with all modalities
        concatenated in the OrderedDict's order.
    """
    num_modalities = len(mm_cond_lens)
    modality_pos_embed = np.zeros((num_modalities, embed_dim))
    if embed_modality:
        # Get embeddings for various modalities
        # We put it in the first half
        modality_sincos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, torch.arange(num_modalities))
        modality_pos_embed[:, :embed_dim // 2] = modality_sincos_embed
        # The second half is for position embeddings
        pos_embed_dim = embed_dim // 2
    else:
        # The whole embedding is for position embeddings
        pos_embed_dim = embed_dim
    # Get embeddings for positions inside each modality
    c_pos_emb = np.zeros((0, embed_dim))
    for idx, (modality, cond_len) in enumerate(mm_cond_lens.items()):
        if modality == "image" and \
        (isinstance(cond_len, tuple) or isinstance(cond_len, list)):
            # Multi-dimensional image grid: negative axis sizes keep the token
            # count (abs value) but embed that axis as size 1, i.e. no
            # positional signal along it.
            all_grid_sizes = tuple([abs(x) for x in cond_len])
            embed_grid_sizes = tuple([x if x > 0 else 1 for x in cond_len])
            cond_sincos_embed = get_nd_sincos_pos_embed_from_grid(pos_embed_dim, embed_grid_sizes)
            cond_pos_embed = np.zeros(all_grid_sizes + (embed_dim, ))
            cond_pos_embed[..., -pos_embed_dim:] += cond_sincos_embed
            cond_pos_embed = cond_pos_embed.reshape((-1, embed_dim))
        else:
            # 1-D modality: a negative length produces abs(cond_len) tokens
            # that all share the position-0 embedding (broadcast below).
            cond_sincos_embed = get_1d_sincos_pos_embed_from_grid(pos_embed_dim,
                                                                  torch.arange(cond_len if cond_len > 0 else 1))
            cond_pos_embed = np.zeros((abs(cond_len), embed_dim))
            cond_pos_embed[:, -pos_embed_dim:] += cond_sincos_embed
        # Add the per-modality embedding (zeros when embed_modality=False).
        cond_pos_embed += modality_pos_embed[idx]
        c_pos_emb = np.concatenate([c_pos_emb, cond_pos_embed], axis=0)
    return c_pos_emb

View File

@ -0,0 +1,156 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT: https://github.com/facebookresearch/DiT
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
from collections import OrderedDict
import torch
import torch.nn as nn
from pathlib import Path
import sys, os
# get current workspace
current_file = Path(__file__)
sys.path.append(str(current_file.parent.parent))
from rdt.blocks import (FinalLayer, RDTBlock, TimestepEmbedder, get_1d_sincos_pos_embed_from_grid,
get_multimodal_cond_pos_embed)
class RDT(nn.Module):
    """
    Class for Robotics Diffusion Transformers.

    A transformer that denoises an action token sequence conditioned on
    language tokens, image tokens, the diffusion timestep, and the control
    frequency.  Blocks alternate between language and image cross-attention.
    """

    def __init__(self,
                 output_dim=128,
                 horizon=32,
                 hidden_size=1152,
                 depth=28,
                 num_heads=16,
                 max_lang_cond_len=1024,
                 img_cond_len=4096,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        # output_dim: dimension of each predicted action vector.
        # horizon: number of action tokens to predict.
        # max_lang_cond_len / img_cond_len: sizes of the language and image
        #   positional-embedding tables.
        # lang_pos_embed_config / img_pos_embed_config: optional multimodal
        #   position-embedding layouts ((name, length) pairs); when None a
        #   plain 1-D sin-cos table is used.
        super().__init__()
        self.horizon = horizon
        self.hidden_size = hidden_size
        self.max_lang_cond_len = max_lang_cond_len
        self.img_cond_len = img_cond_len
        self.dtype = dtype
        self.lang_pos_embed_config = lang_pos_embed_config
        self.img_pos_embed_config = img_pos_embed_config

        self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
        self.freq_embedder = TimestepEmbedder(hidden_size, dtype=dtype)

        # We will use trainable sin-cos embeddings
        # [timestep; state; action]
        self.x_pos_embed = nn.Parameter(torch.zeros(1, horizon + 3, hidden_size))
        # Language conditions
        self.lang_cond_pos_embed = nn.Parameter(torch.zeros(1, max_lang_cond_len, hidden_size))
        # Image conditions
        self.img_cond_pos_embed = nn.Parameter(torch.zeros(1, img_cond_len, hidden_size))

        self.blocks = nn.ModuleList([RDTBlock(hidden_size, num_heads) for _ in range(depth)])
        self.final_layer = FinalLayer(hidden_size, output_dim)
        self.initialize_weights()

    def initialize_weights(self):
        """Xavier-init linears, fill positional tables with sin-cos values,
        re-init the timestep/freq MLPs, and zero the final projection."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize pos_embed by sin-cos embedding
        x_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                    mm_cond_lens=OrderedDict([
                                                        ('timestep', 1),
                                                        ('ctrl_freq', 1),
                                                        ('state', 1),
                                                        ('action', self.horizon),
                                                    ]))
        self.x_pos_embed.data.copy_(torch.from_numpy(x_pos_embed).float().unsqueeze(0))

        if self.lang_pos_embed_config is None:
            lang_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size,
                                                                    torch.arange(self.max_lang_cond_len))
        else:
            lang_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                                mm_cond_lens=OrderedDict(self.lang_pos_embed_config),
                                                                embed_modality=False)
        self.lang_cond_pos_embed.data.copy_(torch.from_numpy(lang_cond_pos_embed).float().unsqueeze(0))

        if self.img_pos_embed_config is None:
            img_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.img_cond_len))
        else:
            img_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                               mm_cond_lens=OrderedDict(self.img_pos_embed_config),
                                                               embed_modality=False)
        self.img_cond_pos_embed.data.copy_(torch.from_numpy(img_cond_pos_embed).float().unsqueeze(0))

        # Initialize timestep and control freq embedding MLP
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[2].weight, std=0.02)

        # Initialize the final layer: zero-out the final linear layer
        nn.init.constant_(self.final_layer.ffn_final.fc2.weight, 0)
        nn.init.constant_(self.final_layer.ffn_final.fc2.bias, 0)

        # Move all the params to given data type:
        self.to(self.dtype)

    def forward(self, x, freq, t, lang_c, img_c, lang_mask=None, img_mask=None):
        """
        Forward pass of RDT.

        x: (B, T, D), state + action token sequence, T = horizon + 1,
            dimension D is assumed to be the same as the hidden size.
        freq: (B,), a scalar indicating control frequency.
        t: (B,) or (1,), diffusion timesteps.
        lang_c: (B, L_lang, D) or None, language condition tokens (variable length),
            dimension D is assumed to be the same as the hidden size.
        img_c: (B, L_img, D) or None, image condition tokens (fixed length),
            dimension D is assumed to be the same as the hidden size.
        lang_mask: (B, L_lang) or None, language condition mask (True for valid).
        img_mask: (B, L_img) or None, image condition mask (True for valid).
        return: (B, horizon, output_dim), the denoised action tokens.
        """
        t = self.t_embedder(t).unsqueeze(1)  # (B, 1, D) or (1, 1, D)
        freq = self.freq_embedder(freq).unsqueeze(1)  # (B, 1, D)
        # Append timestep to the input tokens
        if t.shape[0] == 1:
            # Broadcast a shared timestep across the batch.
            t = t.expand(x.shape[0], -1, -1)
        x = torch.cat([t, freq, x], dim=1)  # (B, T+1, D)

        # Add multimodal position embeddings
        x = x + self.x_pos_embed
        # Note the lang is of variable length
        lang_c = lang_c + self.lang_cond_pos_embed[:, :lang_c.shape[1]]
        img_c = img_c + self.img_cond_pos_embed

        # Forward pass: even-indexed blocks cross-attend to language,
        # odd-indexed blocks to images.
        conds = [lang_c, img_c]
        masks = [lang_mask, img_mask]
        for i, block in enumerate(self.blocks):
            c, mask = conds[i % 2], masks[i % 2]
            x = block(x, c, mask)  # (B, T+1, D)
        # Project to the output dimension.
        x = self.final_layer(x)  # (B, T+1, out_channels)

        # Only preserve the action tokens
        x = x[:, -self.horizon:]
        return x

View File

@ -0,0 +1,246 @@
import re, sys, os
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_dpmsolver_multistep import \
DPMSolverMultistepScheduler
from pathlib import Path
# get current workspace
current_file = Path(__file__)
sys.path.append(os.path.join(current_file.parent))
from hub_mixin import CompatiblePyTorchModelHubMixin
from rdt.model import RDT
class RDTRunner(nn.Module,
                CompatiblePyTorchModelHubMixin,
                repo_url="https://huggingface.co/robotics-diffusion-transformer/rdt-1b"):
    """
    Wrapper around the RDT diffusion transformer.

    Holds the condition adaptors (language / image / state), a DDPM scheduler
    used to add noise during training, and a DPM-Solver multistep scheduler
    used for sampling at inference time.
    """

    def __init__(self,
                 *,
                 action_dim,
                 pred_horizon,
                 config,
                 lang_token_dim,
                 img_token_dim,
                 state_token_dim,
                 max_lang_cond_len,
                 img_cond_len,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        # config: dict with 'rdt' (structure), the three adaptor type strings,
        # and a 'noise_scheduler' section (see the YAML configs).
        super(RDTRunner, self).__init__()

        # Create diffusion model
        hidden_size = config['rdt']['hidden_size']
        self.model = RDT(
            output_dim=action_dim,
            horizon=pred_horizon,
            hidden_size=hidden_size,
            depth=config['rdt']['depth'],
            num_heads=config['rdt']['num_heads'],
            max_lang_cond_len=max_lang_cond_len,
            img_cond_len=img_cond_len,
            lang_pos_embed_config=lang_pos_embed_config,
            img_pos_embed_config=img_pos_embed_config,
            dtype=dtype,
        )

        # Create adaptors for various conditional inputs
        self.lang_adaptor = self.build_condition_adapter(config['lang_adaptor'],
                                                         in_features=lang_token_dim,
                                                         out_features=hidden_size)
        self.img_adaptor = self.build_condition_adapter(config['img_adaptor'],
                                                        in_features=img_token_dim,
                                                        out_features=hidden_size)
        # A `state` refers to an action or a proprioception vector
        self.state_adaptor = self.build_condition_adapter(
            config['state_adaptor'],
            in_features=state_token_dim * 2,  # state + state mask (indicator)
            out_features=hidden_size)

        # Create the noise scheduler
        noise_scheduler_config = config['noise_scheduler']
        # DDPM for the training-time forward (noising) process.
        self.noise_scheduler = DDPMScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
            clip_sample=noise_scheduler_config['clip_sample'],
        )
        # DPM-Solver multistep for few-step sampling at inference time.
        self.noise_scheduler_sample = DPMSolverMultistepScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
        )

        self.num_train_timesteps = noise_scheduler_config['num_train_timesteps']
        self.num_inference_timesteps = noise_scheduler_config['num_inference_timesteps']
        self.prediction_type = noise_scheduler_config['prediction_type']

        self.pred_horizon = pred_horizon
        self.action_dim = action_dim

        print("Diffusion params: %e" %
              sum([p.numel() for p in self.model.parameters()] + [p.numel() for p in self.lang_adaptor.parameters()] +
                  [p.numel()
                   for p in self.img_adaptor.parameters()] + [p.numel() for p in self.state_adaptor.parameters()]))

    def build_condition_adapter(self, projector_type, in_features, out_features):
        """Build a projector from `projector_type`: 'linear' or 'mlp<N>x_gelu'
        (N linear layers with tanh-GELU activations in between).

        Raises ValueError for any other projector type string.
        """
        projector = None
        if projector_type == 'linear':
            projector = nn.Linear(in_features, out_features)
        else:
            mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
            if mlp_gelu_match:
                mlp_depth = int(mlp_gelu_match.group(1))
                modules = [nn.Linear(in_features, out_features)]
                for _ in range(1, mlp_depth):
                    modules.append(nn.GELU(approximate="tanh"))
                    modules.append(nn.Linear(out_features, out_features))
                projector = nn.Sequential(*modules)

        if projector is None:
            raise ValueError(f'Unknown projector type: {projector_type}')
        return projector

    def adapt_conditions(self, lang_tokens, img_tokens, state_tokens):
        '''
        Project each condition stream to the model's hidden size.

        lang_tokens: (batch_size, lang_len, lang_token_dim)
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, state_len, state_token_dim)

        return: adapted (..., hidden_size) for all input tokens
        '''
        adpated_lang = self.lang_adaptor(lang_tokens)
        adpated_img = self.img_adaptor(img_tokens)
        adpated_state = self.state_adaptor(state_tokens)
        return adpated_lang, adpated_img, adpated_state

    def conditional_sample(self, lang_cond, lang_attn_mask, img_cond, state_traj, action_mask, ctrl_freqs):
        '''
        Run the reverse diffusion process to sample an action trajectory.

        lang_cond: language conditional data, (batch_size, lang_len, hidden_size).
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_cond: image conditional data, (batch_size, img_len, hidden_size).
        state_traj: (batch_size, 1, hidden_size), state trajectory.
        action_mask: (batch_size, 1, action_dim), a 0-1 **float** tensor
            indicating the valid action dimensions.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: (batch_size, horizon, action_dim)
        '''
        device = state_traj.device
        dtype = state_traj.dtype
        # Start from pure Gaussian noise and iteratively denoise.
        noisy_action = torch.randn(size=(state_traj.shape[0], self.pred_horizon, self.action_dim),
                                   dtype=dtype,
                                   device=device)
        action_mask = action_mask.expand(-1, self.pred_horizon, -1)

        # Set step values
        self.noise_scheduler_sample.set_timesteps(self.num_inference_timesteps)

        for t in self.noise_scheduler_sample.timesteps:
            # Prepare state-action trajectory
            action_traj = torch.cat([noisy_action, action_mask], dim=2)
            action_traj = self.state_adaptor(action_traj)
            state_action_traj = torch.cat([state_traj, action_traj], dim=1)

            # Predict the model output
            # NOTE(review): img_mask is not passed here — all image tokens are
            # treated as valid; confirm this is intended.
            model_output = self.model(state_action_traj,
                                      ctrl_freqs,
                                      t.unsqueeze(-1).to(device),
                                      lang_cond,
                                      img_cond,
                                      lang_mask=lang_attn_mask)

            # Compute previous actions: x_t -> x_t-1
            noisy_action = self.noise_scheduler_sample.step(model_output, t, noisy_action).prev_sample
            noisy_action = noisy_action.to(state_traj.dtype)

        # Finally apply the action mask to mask invalid action dimensions
        noisy_action = noisy_action * action_mask
        return noisy_action

    # ========= Train ============
    def compute_loss(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_gt, action_mask,
                     ctrl_freqs) -> torch.Tensor:
        '''
        Diffusion training objective (MSE against noise or clean actions).

        lang_tokens: (batch_size, lang_len, lang_token_dim)
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, 1, state_token_dim)
        action_gt: (batch_size, horizon, state_token_dim), ground-truth actions for supervision
        action_mask: (batch_size, 1, state_token_dim), a 0-1 **float** tensor.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: loss_value, a scalar tensor
        '''
        batch_size = lang_tokens.shape[0]
        device = lang_tokens.device

        # Sample noise that we'll add to the actions
        noise = torch.randn(action_gt.shape, dtype=action_gt.dtype, device=device)
        # Sample random diffusion timesteps
        timesteps = torch.randint(0, self.num_train_timesteps, (batch_size, ), device=device).long()
        # Add noise to the clean actions according to the noise magnitude at each timestep
        # (this is the forward diffusion process)
        noisy_action = self.noise_scheduler.add_noise(action_gt, noise, timesteps)

        # Concatenate the state and action tokens to form the input sequence
        state_action_traj = torch.cat([state_tokens, noisy_action], dim=1)
        # Append the action mask to the input sequence
        action_mask = action_mask.expand(-1, state_action_traj.shape[1], -1)
        state_action_traj = torch.cat([state_action_traj, action_mask], dim=2)
        # Align the dimension with the hidden size
        lang_cond, img_cond, state_action_traj = self.adapt_conditions(lang_tokens, img_tokens, state_action_traj)

        # Predict the denoised result
        pred = self.model(state_action_traj, ctrl_freqs, timesteps, lang_cond, img_cond, lang_mask=lang_attn_mask)

        # Regression target depends on the scheduler's prediction type.
        pred_type = self.prediction_type
        if pred_type == 'epsilon':
            target = noise
        elif pred_type == 'sample':
            target = action_gt
        else:
            raise ValueError(f"Unsupported prediction type {pred_type}")

        loss = F.mse_loss(pred, target)
        return loss

    # ========= Inference ============
    def predict_action(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_mask, ctrl_freqs):
        '''
        lang_tokens: (batch_size, lang_len, lang_token_dim)
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, 1, state_token_dim)
        action_mask: (batch_size, 1, action_dim),
            which should be a 0-1 **float** tensor.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: (batch_size, horizon, action_dim), predicted action sequence
        '''
        # Prepare the state and conditions
        state_tokens = torch.cat([state_tokens, action_mask], dim=2)
        lang_cond, img_cond, state_traj = self.adapt_conditions(lang_tokens, img_tokens, state_tokens)

        # Run sampling
        action_pred = self.conditional_sample(
            lang_cond,
            lang_attn_mask,
            img_cond,
            state_traj,
            action_mask,
            ctrl_freqs,
        )
        return action_pred

    def forward(self, *args, **kwargs) -> torch.Tensor:
        """Alias for `compute_loss`, so the module can be trained directly."""
        return self.compute_loss(*args, **kwargs)

View File

@ -0,0 +1,20 @@
import sys
import json
def read_json_value(file_path, key):
    """Print the value stored under *key* in the JSON file at *file_path*.

    Prints the value when the key exists, otherwise prints a diagnostic
    message.  Also returns the value (or None when missing) so the function
    is usable programmatically, not only as a CLI helper.
    """
    # Explicit encoding: JSON is UTF-8 by spec, independent of the locale.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    value = data.get(key)
    if value is not None:
        print(value)
    else:
        print(f"Key '{key}' not found in {file_path}")
    return value
if __name__ == "__main__":
    # CLI entry point: expects a JSON file path and a key to look up.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python read_json.py <file_path> <key>")
        sys.exit(1)
    read_json_value(cli_args[0], cli_args[1])

View File

@ -0,0 +1,24 @@
numpy<2.0
packaging==24.0
deepspeed==0.14.2
accelerate==0.30.1
diffusers==0.27.2
timm==1.0.3
transformers==4.41.0
sentencepiece==0.2.0
h5py==3.11.0
opencv-python==4.9.0.80
imgaug==0.4.0
pytz==2022.1
huggingface_hub==0.23.0
pandas==2.3.3
# requirements_data.txt
# tfds-nightly==4.9.4.dev202402070044
gsutil==5.27
tensorflow==2.15.0.post1
pillow==10.2.0
pyyaml==6.0.1
tensorflow-graphics==2021.12.3
imageio==2.34.0
imageio-ffmpeg==0.4.9

View File

@ -0,0 +1,2 @@
input/*
output/*

48
RDT/rdt-export/Dockerfile Normal file
View File

@ -0,0 +1,48 @@
# CUDA 11.8 + cuDNN 8 development image (Ubuntu 22.04 base).
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04

WORKDIR /app

# Non-interactive apt, unbuffered Python output, Shanghai timezone.
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai

# Switch apt to the Tsinghua mirror (faster downloads inside CN networks).
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
    sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list

# Python 3.10 from the deadsnakes PPA plus shared libs used by OpenCV/ffmpeg.
RUN apt-get update --allow-unauthenticated && apt-get install -y \
    software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y \
    python3.10 \
    python3.10-dev \
    python3-pip \
    python3.10-distutils \
    libgl1-mesa-glx \
    libglib2.0-0 \
    wget \
    ffmpeg \
    libsm6 \
    libxext6 \
    && rm -rf /var/lib/apt/lists/*

# Make `python3` resolve to the freshly-installed 3.10.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

COPY . /app/

RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
# RUN pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
# RUN pip install torch==2.1.0 torchvision==0.16.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install packaging==24.0
RUN pip install tfds-nightly==4.9.4.dev202402070044
# Pre-built flash-attention wheel shipped with the build context.
RUN pip install flash_attn-2.7.2.post1+cu12torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# RUN mkdir -p /app/dataset/input /app/dataset/output

ENTRYPOINT ["bash", "deploy.sh"]

View File

@ -0,0 +1,71 @@
common:
# The number of historical images
img_history_size: 2
# The number of future actions to predict
action_chunk_size: 64
# The number of cameras to be used in the model
num_cameras: 3
# Dimension for state/action, we use the same space for both state and action
# This MUST be equal to configs/state_vec.py
state_dim: 128
dataset:
# We will extract the data from raw dataset
# and store them in the disk buffer by producer
# When training, we will read the data
# randomly from the buffer by consumer
# The producer will replace the data which has been
# read by the consumer with new data
# The path to the buffer (at least 400GB)
buf_path: /path/to/buffer
# The number of chunks in the buffer
buf_num_chunks: 512
# The number of samples (step rather than episode) in each chunk
buf_chunk_size: 512
# We will filter the episodes with length less than `epsd_len_thresh_low`
epsd_len_thresh_low: 32
# For those more than `epsd_len_thresh_high`,
# we will randomly sample `epsd_len_thresh_high` steps each time we load the episode
# to better balance the training datasets
epsd_len_thresh_high: 2048
# How to fit the image size
image_aspect_ratio: pad
# Maximum number of language tokens
tokenizer_max_length: 1024
model:
# Config for condition adaptors
lang_adaptor: mlp2x_gelu
img_adaptor: mlp2x_gelu
state_adaptor: mlp3x_gelu
lang_token_dim: 4096
img_token_dim: 1152
# Dim of action or proprioception vector
# A `state` refers to an action or a proprioception vector
state_token_dim: 128
# Config for RDT structure
rdt:
# 1B: num_head 32 hidden_size 2048
hidden_size: 1024
depth: 14
num_heads: 32
cond_pos_embed_type: multimodal
# For noise scheduler
noise_scheduler:
type: ddpm
num_train_timesteps: 1000
num_inference_timesteps: 5
beta_schedule: squaredcos_cap_v2 # Critical choice
prediction_type: sample
clip_sample: False
# For EMA (params averaging)
# We do not use EMA currently
ema:
update_after_step: 0
inv_gamma: 1.0
power: 0.75
min_value: 0.0
max_value: 0.9999

View File

@ -0,0 +1,71 @@
common:
# The number of historical images
img_history_size: 2
# The number of future actions to predict
action_chunk_size: 64
# The number of cameras to be used in the model
num_cameras: 3
# Dimension for state/action, we use the same space for both state and action
# This MUST be equal to configs/state_vec.py
state_dim: 128
dataset:
# We will extract the data from raw dataset
# and store them in the disk buffer by producer
# When training, we will read the data
# randomly from the buffer by consumer
# The producer will replace the data which has been
# read by the consumer with new data
# The path to the buffer (at least 400GB)
buf_path: /path/to/buffer
# The number of chunks in the buffer
buf_num_chunks: 512
# The number of samples (step rather than episode) in each chunk
buf_chunk_size: 512
# We will filter the episodes with length less than `epsd_len_thresh_low`
epsd_len_thresh_low: 32
# For those more than `epsd_len_thresh_high`,
# we will randomly sample `epsd_len_thresh_high` steps each time we load the episode
# to better balance the training datasets
epsd_len_thresh_high: 2048
# How to fit the image size
image_aspect_ratio: pad
# Maximum number of language tokens
tokenizer_max_length: 1024
model:
# Config for condition adaptors
lang_adaptor: mlp2x_gelu
img_adaptor: mlp2x_gelu
state_adaptor: mlp3x_gelu
lang_token_dim: 4096
img_token_dim: 1152
# Dim of action or proprioception vector
# A `state` refers to an action or a proprioception vector
state_token_dim: 128
# Config for RDT structure
rdt:
# 1B: num_head 32 hidden_size 2048
hidden_size: 2048
depth: 28
num_heads: 32
cond_pos_embed_type: multimodal
# For noise scheduler
noise_scheduler:
type: ddpm
num_train_timesteps: 1000
num_inference_timesteps: 5
beta_schedule: squaredcos_cap_v2 # Critical choice
prediction_type: sample
clip_sample: False
# For EMA (params averaging)
# We do not use EMA currently
ema:
update_after_step: 0
inv_gamma: 1.0
power: 0.75
min_value: 0.0
max_value: 0.9999

View File

@ -0,0 +1,50 @@
{
"A": [
[
-0.2691913843154907,
-0.21995729207992554,
-0.182277649641037
],
[
0.35127854347229004,
0.2769763469696045,
0.17159393429756165
]
],
"B": [
[
-0.2576896846294403,
-0.22244493663311005,
-0.20557966828346252
],
[
0.32854634523391724,
0.2922680974006653,
0.17373555898666382
]
],
"C": [
[
-0.29205888509750366,
-0.24688798189163208,
-0.17577645182609558
],
[
0.25053921341896057,
0.3277084231376648,
0.16431939601898193
]
],
"D": [
[
-0.25131964683532715,
-0.15233077108860016,
-0.13294968008995056
],
[
0.19209328293800354,
0.19344553351402283,
0.1370421051979065
]
]
}

View File

@ -0,0 +1,65 @@
{
"fractal20220817_data": 3,
"taco_play": 15,
"jaco_play": 10,
"berkeley_cable_routing": 10,
"nyu_door_opening_surprising_effectiveness": 3,
"viola": 20,
"berkeley_autolab_ur5": 5,
"toto": 30,
"kuka": 10,
"language_table": 10,
"columbia_cairlab_pusht_real": 10,
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 20,
"nyu_rot_dataset_converted_externally_to_rlds":3,
"stanford_hydra_dataset_converted_externally_to_rlds": 10,
"austin_buds_dataset_converted_externally_to_rlds": 20,
"nyu_franka_play_dataset_converted_externally_to_rlds": 3,
"maniskill_dataset_converted_externally_to_rlds": 20,
"furniture_bench_dataset_converted_externally_to_rlds": 10,
"ucsd_kitchen_dataset_converted_externally_to_rlds": 2,
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 3,
"austin_sailor_dataset_converted_externally_to_rlds": 20,
"austin_sirius_dataset_converted_externally_to_rlds": 20,
"bc_z": 10,
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 10,
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 10,
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
"utokyo_xarm_bimanual_converted_externally_to_rlds": 10,
"berkeley_mvp_converted_externally_to_rlds": 5,
"berkeley_rpt_converted_externally_to_rlds": 30,
"kaist_nonprehensile_converted_externally_to_rlds": 10,
"stanford_mask_vit_converted_externally_to_rlds": 0,
"tokyo_u_lsmo_converted_externally_to_rlds": 10,
"dlr_sara_pour_converted_externally_to_rlds": 10,
"dlr_sara_grid_clamp_converted_externally_to_rlds": 10,
"dlr_edan_shared_control_converted_externally_to_rlds": 5,
"asu_table_top_converted_externally_to_rlds": 12.5,
"stanford_robocook_converted_externally_to_rlds": 5,
"eth_agent_affordances": 66.6,
"imperialcollege_sawyer_wrist_cam": 10,
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 20,
"uiuc_d3field": 1,
"utaustin_mutex": 20,
"berkeley_fanuc_manipulation": 10,
"cmu_play_fusion": 5,
"cmu_stretch": 10,
"berkeley_gnm_recon": 3,
"berkeley_gnm_cory_hall": 5,
"berkeley_gnm_sac_son": 10,
"robo_net": 1,
"roboturk_real_towercreation": 10,
"roboturk_real_laundrylayout": 10,
"roboturk_real_objectsearch": 10,
"aloha_mobile": 50,
"aloha_static": 50,
"roboset": 5,
"droid": 15,
"fmb": 10,
"dobbe": 30,
"qut_dexterous_manpulation": 30,
"agilex": 25,
"rh20t": 10,
"calvin": 30,
"bridgev2": 5
}

View File

@ -0,0 +1,575 @@
{
"fractal20220817_data": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[
1,0,0,0
]
},
"taco_play": {
"image_keys": [
"rgb_static",
"rgb_gripper",
"rgb_static",
"rgb_static"
],
"image_mask":[
1,1,0,0
]
},
"jaco_play": {
"image_keys": [
"image",
"image_wrist",
"image_wrist",
"image_wrist"
],
"image_mask":[
1,1,0,0
]
},
"berkeley_cable_routing": {
"image_keys": [
"image",
"wrist45_image",
"wrist225_image",
"top_image"
],
"image_mask":[1,1,0,1]
},
"nyu_door_opening_surprising_effectiveness": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"viola": {
"image_keys": [
"agentview_rgb",
"eye_in_hand_rgb",
"eye_in_hand_rgb",
"eye_in_hand_rgb"
],
"image_mask":[1,1,0,0]
},
"berkeley_autolab_ur5": {
"image_keys": [
"image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[1,1,0,0]
},
"toto": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"kuka": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"language_table": {
"image_keys": [
"rgb",
"rgb",
"rgb",
"rgb"
],
"image_mask":[1,0,0,0]
},
"columbia_cairlab_pusht_real": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"nyu_rot_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_hydra_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"austin_buds_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"nyu_franka_play_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image_additional_view",
"image_additional_view",
"image_additional_view"
],
"image_mask":[1,0,0,1]
},
"maniskill_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"furniture_bench_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"ucsd_kitchen_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"austin_sailor_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"austin_sirius_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"bc_z": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": {
"image_keys": [
"image",
"hand_image",
"hand_image",
"image2"
],
"image_mask":[1,1,0,1]
},
"utokyo_xarm_bimanual_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_mvp_converted_externally_to_rlds": {
"image_keys": [
"hand_image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[0,1,0,0]
},
"berkeley_rpt_converted_externally_to_rlds": {
"image_keys": [
"hand_image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[0,1,0,0]
},
"kaist_nonprehensile_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_mask_vit_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"tokyo_u_lsmo_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_sara_pour_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_sara_grid_clamp_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_edan_shared_control_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"asu_table_top_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_robocook_converted_externally_to_rlds": {
"image_keys": [
"image_2",
"image_1",
"image_3",
"image_4"
],
"image_mask":[1,0,0,1]
},
"eth_agent_affordances": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"imperialcollege_sawyer_wrist_cam": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[0,1,0,0]
},
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"uiuc_d3field": {
"image_keys": [
"image_1",
"image_2",
"image_3",
"image_4"
],
"image_mask":[1,0,0,1]
},
"utaustin_mutex": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"berkeley_fanuc_manipulation": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"cmu_play_fusion": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"cmu_stretch": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_recon": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_cory_hall": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_sac_son": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"robo_net": {
"image_keys": [
"image",
"image1",
"image2",
"image2"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_towercreation": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_laundrylayout": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_objectsearch": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"aloha_mobile": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_right_wrist"
],
"image_mask":[1,1,1,0]
},
"aloha_static": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_low"
],
"image_mask":[1,1,1,1]
},
"roboset": {
"image_keys": [
"rgb_top",
"rgb_right",
"rgb_left",
"rgb_right"
],
"image_mask":[1,1,1,0]
},
"droid": {
"image_keys": [
"exterior_image_1_left",
"wrist_image_left",
"wrist_image_left",
"exterior_image_2_left"
],
"image_mask":[1,1,0,1]
},
"fmb": {
"image_keys": [
"image_side_1",
"image_wrist_1",
"image_wrist_1",
"image_side_2"
],
"image_mask":[1,1,0,1]
},
"dobbe": {
"image_keys": [
"wrist_image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[0,1,0,0]
},
"qut_dexterous_manpulation": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"agilex": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_right_wrist"
],
"image_mask":[1,1,1,0]
},
"rh20t": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"calvin": {
"image_keys": [
"rgb_static",
"rgb_gripper",
"rgb_gripper",
"rgb_gripper"
],
"image_mask":[1,1,0,0]
},
"bridgev2": {
"image_keys": [
"images0",
"images0",
"images0",
"images0"
],
"image_mask":[1,0,0,0]
}
}

View File

@ -0,0 +1,525 @@
{
"agilex": {
"dataset_name": "agilex",
"state_mean": [
-0.0036545392947090432,
-0.2773659935760079,
0.3147616748061523,
0.3813313179910183,
0.04028575944090457,
0.034888520819083294,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_std": [
0.05763674563578847,
0.2580181064167735,
0.19785840483767897,
0.05020347749331385,
0.054529239104671424,
0.05020521339363586,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_min": [
-0.17447535196940103,
-0.5522612677680121,
-0.3340397516886393,
0.21861712137858072,
-0.09725829230414497,
0.003396739231215583,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_max": [
0.21961932712131077,
0.30613206227620443,
0.5444545321994357,
0.4866888682047526,
0.31486290825737845,
0.3355223337809245,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
]
}
}

View File

@ -0,0 +1,3 @@
[
"agilex"
]

View File

@ -0,0 +1,3 @@
{
"agilex": 100
}

View File

@ -0,0 +1,48 @@
[
"fractal20220817_data",
"jaco_play",
"taco_play",
"berkeley_cable_routing",
"viola",
"berkeley_autolab_ur5",
"toto",
"nyu_door_opening_surprising_effectiveness",
"columbia_cairlab_pusht_real",
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds",
"austin_buds_dataset_converted_externally_to_rlds",
"kuka",
"utokyo_xarm_bimanual_converted_externally_to_rlds",
"stanford_hydra_dataset_converted_externally_to_rlds",
"maniskill_dataset_converted_externally_to_rlds",
"ucsd_kitchen_dataset_converted_externally_to_rlds",
"ucsd_pick_and_place_dataset_converted_externally_to_rlds",
"austin_sailor_dataset_converted_externally_to_rlds",
"austin_sirius_dataset_converted_externally_to_rlds",
"bc_z",
"utokyo_pr2_opening_fridge_converted_externally_to_rlds",
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds",
"utokyo_xarm_pick_and_place_converted_externally_to_rlds",
"berkeley_mvp_converted_externally_to_rlds",
"berkeley_rpt_converted_externally_to_rlds",
"kaist_nonprehensile_converted_externally_to_rlds",
"tokyo_u_lsmo_converted_externally_to_rlds",
"dlr_sara_grid_clamp_converted_externally_to_rlds",
"stanford_robocook_converted_externally_to_rlds",
"imperialcollege_sawyer_wrist_cam",
"iamlab_cmu_pickup_insert_converted_externally_to_rlds",
"utaustin_mutex",
"berkeley_fanuc_manipulation",
"cmu_play_fusion",
"language_table",
"furniture_bench_dataset_converted_externally_to_rlds",
"droid",
"fmb",
"dobbe",
"qut_dexterous_manpulation",
"aloha_mobile",
"aloha_static",
"roboset",
"rh20t",
"calvin",
"bridgev2"
]

View File

@ -0,0 +1,48 @@
{
"fractal20220817_data": 271,
"taco_play": 60,
"jaco_play": 33,
"berkeley_cable_routing": 8,
"nyu_door_opening_surprising_effectiveness": 10,
"viola": 12,
"berkeley_autolab_ur5": 32,
"toto": 32,
"kuka": 50,
"language_table": 100,
"columbia_cairlab_pusht_real": 12,
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 55,
"stanford_hydra_dataset_converted_externally_to_rlds": 24,
"austin_buds_dataset_converted_externally_to_rlds": 7,
"maniskill_dataset_converted_externally_to_rlds": 174,
"furniture_bench_dataset_converted_externally_to_rlds": 71,
"ucsd_kitchen_dataset_converted_externally_to_rlds": 12,
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 37,
"austin_sailor_dataset_converted_externally_to_rlds": 15,
"austin_sirius_dataset_converted_externally_to_rlds": 24,
"bc_z": 208,
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 9,
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 15,
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
"utokyo_xarm_bimanual_converted_externally_to_rlds": 1,
"berkeley_mvp_converted_externally_to_rlds": 22,
"berkeley_rpt_converted_externally_to_rlds": 30,
"kaist_nonprehensile_converted_externally_to_rlds": 14,
"tokyo_u_lsmo_converted_externally_to_rlds": 7,
"dlr_sara_grid_clamp_converted_externally_to_rlds": 1,
"stanford_robocook_converted_externally_to_rlds": 50,
"imperialcollege_sawyer_wrist_cam": 13,
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 25,
"utaustin_mutex": 39,
"berkeley_fanuc_manipulation": 20,
"cmu_play_fusion": 24,
"droid": 303,
"fmb": 42,
"dobbe": 36,
"qut_dexterous_manpulation": 14,
"aloha_mobile": 150,
"aloha_static": 150,
"roboset": 135,
"rh20t": 331,
"calvin": 100,
"bridgev2": 224
}

View File

@ -0,0 +1,126 @@
def _build_state_vec_idx_mapping():
    """Build the name -> index mapping for the unified 128-dim state vector.

    Layout (right side occupies [0, 50), left side mirrors it at +50):
      [0, 10)   arm joint positions        [15, 25)  arm joint velocities
      [10, 15)  gripper joint positions    [25, 30)  gripper joint velocities
      [30, 33)  EEF position xyz           [39, 42)  EEF linear velocity xyz
      [33, 39)  EEF 6D rotation            [42, 45)  EEF angular velocity rpy
      [45, 50)  reserved
      [100, 103) base linear (x, y) and angular velocity
      [103, 128) reserved
    Right-side entries are also exposed without the "right_" prefix, and
    "gripper_open"/"gripper_open_vel" alias gripper joint 0.
    """
    mapping = {}
    # Right arm: every key exists both bare and with a "right_" prefix.
    for prefix in ("", "right_"):
        for i in range(10):
            mapping[f"{prefix}arm_joint_{i}_pos"] = i
            mapping[f"{prefix}arm_joint_{i}_vel"] = i + 15
        for i in range(5):
            mapping[f"{prefix}gripper_joint_{i}_pos"] = i + 10
            mapping[f"{prefix}gripper_joint_{i}_vel"] = i + 25
        # Aliases of gripper joint 0 position / velocity.
        mapping[f"{prefix}gripper_open"] = 10
        mapping[f"{prefix}gripper_open_vel"] = 25
        for offset, axis in enumerate(("x", "y", "z")):
            mapping[f"{prefix}eef_pos_{axis}"] = 30 + offset
            mapping[f"{prefix}eef_vel_{axis}"] = 39 + offset
        for i in range(6):
            mapping[f"{prefix}eef_angle_{i}"] = 33 + i
        for offset, axis in enumerate(("roll", "pitch", "yaw")):
            mapping[f"{prefix}eef_angular_vel_{axis}"] = 42 + offset
    # Left arm: same layout shifted by +50, "left_" prefix only.
    for i in range(10):
        mapping[f"left_arm_joint_{i}_pos"] = i + 50
        mapping[f"left_arm_joint_{i}_vel"] = i + 65
    for i in range(5):
        mapping[f"left_gripper_joint_{i}_pos"] = i + 60
        mapping[f"left_gripper_joint_{i}_vel"] = i + 75
    # Aliases of left gripper joint 0 position / velocity.
    mapping["left_gripper_open"] = 60
    mapping["left_gripper_open_vel"] = 75
    for offset, axis in enumerate(("x", "y", "z")):
        mapping[f"left_eef_pos_{axis}"] = 80 + offset
        mapping[f"left_eef_vel_{axis}"] = 89 + offset
    for i in range(6):
        mapping[f"left_eef_angle_{i}"] = 83 + i
    for offset, axis in enumerate(("roll", "pitch", "yaw")):
        mapping[f"left_eef_angular_vel_{axis}"] = 92 + offset
    # Mobile base velocities.
    mapping["base_vel_x"] = 100
    mapping["base_vel_y"] = 101
    mapping["base_angular_vel"] = 102
    return mapping


STATE_VEC_IDX_MAPPING = _build_state_vec_idx_mapping()

# Total length of the unified state vector (indices above 102 are reserved).
STATE_VEC_LEN = 128

View File

@ -0,0 +1,14 @@
{
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 2,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9
}
}

993
RDT/rdt-export/export.py Normal file
View File

@ -0,0 +1,993 @@
import argparse
import json
import logging
import os
import re
from collections import OrderedDict
from dataclasses import dataclass
from time import time
from typing import Optional

import cv2
import h5py
import numpy as np
import onnx
import torch
import torch.nn as nn
import torch.nn.functional as F
import yaml
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler
from PIL import Image as PImage
from torchvision import transforms

from configs.state_vec import STATE_VEC_IDX_MAPPING
from models.hub_mixin import CompatiblePyTorchModelHubMixin
from models.multimodal_encoder.siglip_encoder import SiglipVisionTower
from models.multimodal_encoder.t5_encoder import T5Embedder
from models.rdt.blocks import (FinalLayer, RDTBlock, TimestepEmbedder,
                               get_1d_sincos_pos_embed_from_grid,
                               get_multimodal_cond_pos_embed)
from scripts.agilex_model import create_model
# Verbose (DEBUG) logging with timestamped records for the export pipeline.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S')
# Module-level logger used throughout this script.
logger = logging.getLogger("RDT_EXPORT")
# Disable Weights & Biases reporting during export runs.
os.environ["WANDB_MODE"] = "disabled"
@dataclass
class ExportConfig:
    """Configuration for the RDT export / calibration-dump pipeline.

    Fields typed ``Optional[str]`` default to ``None`` and are expected to
    be filled in by the caller (e.g. from CLI arguments) before use; the
    previous annotations declared them as plain ``str`` while defaulting
    to ``None``, which was incorrect.
    """
    # Identifier of the task whose data is exported.
    task_id: Optional[str] = None
    # Directory where exported artifacts / calibration dumps are written.
    output_path: Optional[str] = None
    # Path to the pretrained RDT checkpoint to export.
    model_path: Optional[str] = None
    # Number of calibration samples to dump.
    calibration_num: int = 100
    # Number of language calibration samples to dump.
    lang_calibration_num: int = 1
    # Path to the dataset used for calibration.
    dataset_path: Optional[str] = None
    # CUDA device index, kept as a string (e.g. for CUDA_VISIBLE_DEVICES).
    gpu_id: str = "0"
    # Target architecture for the exported model — presumably a compiler
    # march flag; TODO confirm against the export toolchain.
    march: Optional[str] = None
    # Model variant identifier passed to model creation.
    model_type: Optional[str] = None
    # HF name or local path of the pretrained vision encoder.
    pretrained_vision_encoder_name_or_path: Optional[str] = None
    # Control frequency (Hz) fed to the model.
    ctrl_freq: int = 25
    # Device on which calibration tensors are produced.
    cal_data_device: str = "cuda"
# Indices into the unified state vector used for this export: the first six
# right-arm joint positions only (no gripper dimension here — presumably the
# deployed AgileX setup is driven as a 6-DoF arm; confirm against the caller).
AGILEX_STATE_INDICES = [
    STATE_VEC_IDX_MAPPING["right_arm_joint_{}_pos".format(joint)]
    for joint in range(6)
]
def dump_img_adaptor(img_tokens):
    """Dump raw (pre-adaptor) image tokens as a .npy calibration sample.

    The file name encodes the current dataset name and sample counter,
    both taken from module-level globals set by the dump driver.
    """
    global img_adaptor_cal_ws
    global dump_cnt, dump_dataset_name
    tokens_np = img_tokens.float().contiguous().cpu().detach().numpy()
    out_file = os.path.join(
        img_adaptor_cal_ws,
        f"img_adaptor_{dump_dataset_name}_{dump_cnt}.npy")
    np.save(out_file, tokens_np)
def dump_dit(state_action_traj, ctrl_freqs, t, lang_cond, img_cond, lang_attn_mask):
    """Dump one denoising step's DiT inputs as .npy calibration files.

    Each input tensor is moved to host memory and saved in its own
    calibration directory; file names encode the diffusion timestep, the
    current dataset name and the sample counter (module-level globals).

    NOTE(review): the language condition is padded to a fixed 64-token
    length and the boolean attention mask is converted to an additive
    float mask (0.0 = attend, -512.0 = masked) — presumably to match the
    deployed model's fixed input shapes; confirm 64 / -512.0 against the
    export target.
    """
    global dit_cal_path_x, dit_cal_path_freq, dit_cal_path_t
    global dit_cal_path_lang_c, dit_cal_path_img_c, dit_cal_path_lang_mask
    global dump_cnt, dump_dataset_name

    def to_host(tensor):
        # Detach from the autograd graph and move to CPU as float32.
        return tensor.float().contiguous().cpu().detach().numpy()

    step_tag = str(t)
    traj_np = to_host(state_action_traj)
    freq_np = to_host(ctrl_freqs).astype(np.int32).copy()
    t_np = np.expand_dims(to_host(t).astype(np.int32), axis=0).copy()
    lang_np = to_host(lang_cond)
    img_np = to_host(img_cond)
    mask_np = to_host(lang_attn_mask)
    # Pad the (variable-length) language dimension up to 64 tokens.
    pad_rows = 64 - mask_np.shape[1]
    padded_mask = np.pad(mask_np, ((0, 0), (0, pad_rows)), mode="constant")
    mask_float = np.where(padded_mask, 0.0, -512.0).astype(np.float32)
    lang_cond_padded = np.pad(
        lang_np, pad_width=((0, 0), (0, pad_rows), (0, 0)),
        mode="constant", constant_values=0)
    suffix = f"{step_tag}_{dump_dataset_name}_{dump_cnt}.npy"
    np.save(os.path.join(dit_cal_path_x, f"x_{suffix}"), traj_np)
    np.save(os.path.join(dit_cal_path_freq, f"freq_{suffix}"), freq_np)
    np.save(os.path.join(dit_cal_path_t, f"t_{suffix}"), t_np)
    np.save(os.path.join(dit_cal_path_lang_c, f"lang_c_{suffix}"), lang_cond_padded)
    np.save(os.path.join(dit_cal_path_img_c, f"img_{suffix}"), img_np)
    np.save(os.path.join(dit_cal_path_lang_mask, f"lang_mask_{suffix}"), mask_float)
def create_dump_model(args, **kwargs):
    """Instantiate the dump-enabled RDT model, optionally loading weights.

    Args:
        args: model configuration passed through to the constructor.
        **kwargs: forwarded to the constructor. If it contains a
            ``pretrained`` path that points at an existing file, the
            checkpoint is loaded into the freshly built model.

    Returns:
        A ``RoboticDiffusionTransformerModel_Dump`` instance.
    """
    model = RoboticDiffusionTransformerModel_Dump(args, **kwargs)
    checkpoint = kwargs.get("pretrained", None)
    if checkpoint is not None and os.path.isfile(checkpoint):
        model.load_pretrained_weights(checkpoint)
    return model
class RDT_Dump(nn.Module):
    """Replica of the RDT diffusion-transformer backbone used for export.

    Denoises a state-action token sequence conditioned on language and image
    tokens; successive blocks alternate which condition they cross-attend to
    (even blocks: language, odd blocks: image).
    """
    def __init__(self,
                 output_dim=128,
                 horizon=32,
                 hidden_size=1152,
                 depth=28,
                 num_heads=16,
                 max_lang_cond_len=1024,
                 img_cond_len=4096,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        """
        Args:
            output_dim: dimension of each predicted action vector.
            horizon: number of action tokens (prediction horizon).
            hidden_size: transformer embedding width.
            depth: number of RDT blocks.
            num_heads: attention heads per block.
            max_lang_cond_len: maximum language-condition sequence length.
            img_cond_len: fixed image-condition sequence length.
            lang_pos_embed_config: optional multimodal position-embedding
                layout for the language condition; a plain 1D sin-cos
                embedding is used when None.
            img_pos_embed_config: same, for the image condition.
            dtype: dtype all parameters are cast to at the end of init.
        """
        super().__init__()
        self.horizon = horizon
        self.hidden_size = hidden_size
        self.max_lang_cond_len = max_lang_cond_len
        self.img_cond_len = img_cond_len
        self.dtype = dtype
        self.lang_pos_embed_config = lang_pos_embed_config
        self.img_pos_embed_config = img_pos_embed_config
        # Embedders for the diffusion timestep and the control frequency.
        self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
        self.freq_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
        # We will use trainable sin-cos embeddings
        # [timestep; ctrl_freq; state; action] -> horizon + 3 tokens total
        self.x_pos_embed = nn.Parameter(torch.zeros(1, horizon + 3, hidden_size))
        # Language conditions
        self.lang_cond_pos_embed = nn.Parameter(torch.zeros(1, max_lang_cond_len, hidden_size))
        # Image conditions
        self.img_cond_pos_embed = nn.Parameter(torch.zeros(1, img_cond_len, hidden_size))
        self.blocks = nn.ModuleList([RDTBlock(hidden_size, num_heads) for _ in range(depth)])
        self.final_layer = FinalLayer(hidden_size, output_dim)
        self.initialize_weights()
    def initialize_weights(self):
        """Initialize linear layers, position embeddings, and the final layer."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)
        # Initialize pos_embed by sin-cos embedding
        x_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                    mm_cond_lens=OrderedDict([
                                                        ('timestep', 1),
                                                        ('ctrl_freq', 1),
                                                        ('state', 1),
                                                        ('action', self.horizon),
                                                    ]))
        self.x_pos_embed.data.copy_(torch.from_numpy(x_pos_embed).float().unsqueeze(0))
        # Language / image condition embeddings: 1D sin-cos by default,
        # multimodal layout when an explicit config is provided.
        if self.lang_pos_embed_config is None:
            lang_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.max_lang_cond_len))
        else:
            lang_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size, mm_cond_lens=OrderedDict(self.lang_pos_embed_config), embed_modality=False)
        self.lang_cond_pos_embed.data.copy_(torch.from_numpy(lang_cond_pos_embed).float().unsqueeze(0))
        if self.img_pos_embed_config is None:
            img_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.img_cond_len))
        else:
            img_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size, mm_cond_lens=OrderedDict(self.img_pos_embed_config), embed_modality=False)
        self.img_cond_pos_embed.data.copy_(torch.from_numpy(img_cond_pos_embed).float().unsqueeze(0))
        # Initialize timestep and control freq embedding MLP
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[2].weight, std=0.02)
        # Initialize the final layer: zero-out the final linear layer
        nn.init.constant_(self.final_layer.ffn_final.fc2.weight, 0)
        nn.init.constant_(self.final_layer.ffn_final.fc2.bias, 0)
        # Move all the params to given data type:
        self.to(self.dtype)
    def forward(self, x, freq, t, lang_c, img_c, lang_mask=None, img_mask=None):
        """Denoise one step.

        Args:
            x: state + action tokens, (B, horizon + 1, D).
            freq: control frequency per sample, (B,).
            t: diffusion timestep, (B,) or (1,) broadcast over the batch.
            lang_c: language condition tokens, variable length up to
                max_lang_cond_len.
            img_c: image condition tokens, length img_cond_len.
            lang_mask, img_mask: optional attention masks for the conditions.

        Returns:
            Predicted action tokens, (B, horizon, output_dim).
        """
        t = self.t_embedder(t).unsqueeze(1)  # (B, 1, D) or (1, 1, D)
        freq = self.freq_embedder(freq).unsqueeze(1)  # (B, 1, D)
        # Append timestep to the input tokens
        if t.shape[0] == 1:
            t = t.expand(x.shape[0], -1, -1)
        x = torch.cat([t, freq, x], dim=1)  # (B, horizon + 3, D)
        # Add multimodal position embeddings
        x = x + self.x_pos_embed
        # Note the lang is of variable length
        lang_c = lang_c + self.lang_cond_pos_embed[:, :lang_c.shape[1]]
        img_c = img_c + self.img_cond_pos_embed
        # Forward pass: alternate language / image cross-conditioning
        conds = [lang_c, img_c]
        masks = [lang_mask, img_mask]
        for i, block in enumerate(self.blocks):
            c, mask = conds[i % 2], masks[i % 2]
            x = block(x, c, mask)  # (B, horizon + 3, D)
        # Inject the language condition at the final layer
        x = self.final_layer(x)  # (B, horizon + 3, out_channels)
        # Only preserve the action tokens
        x = x[:, -self.horizon:]
        return x
class RDTRunner_Dump(nn.Module,
CompatiblePyTorchModelHubMixin,
repo_url="https://huggingface.co/robotics-diffusion-transformer/rdt-1b"):
    def __init__(self,
                 *,
                 action_dim,
                 pred_horizon,
                 config,
                 lang_token_dim,
                 img_token_dim,
                 state_token_dim,
                 max_lang_cond_len,
                 img_cond_len,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        """Build the diffusion model, condition adaptors, and noise schedulers.

        Args:
            action_dim: dimension of each action vector.
            pred_horizon: number of predicted action steps.
            config: dict with 'rdt', adaptor, and 'noise_scheduler' sections
                (see the model YAML config).
            lang_token_dim / img_token_dim / state_token_dim: raw token
                dims of the language, image, and state inputs.
            max_lang_cond_len: maximum language-condition sequence length.
            img_cond_len: fixed image-condition sequence length.
            lang_pos_embed_config / img_pos_embed_config: optional position
                embedding layouts forwarded to RDT_Dump.
            dtype: parameter dtype for the diffusion model.
        """
        super(RDTRunner_Dump, self).__init__()
        # Create diffusion model
        hidden_size = config['rdt']['hidden_size']
        self.model = RDT_Dump(
            output_dim=action_dim,
            horizon=pred_horizon,
            hidden_size=hidden_size,
            depth=config['rdt']['depth'],
            num_heads=config['rdt']['num_heads'],
            max_lang_cond_len=max_lang_cond_len,
            img_cond_len=img_cond_len,
            lang_pos_embed_config=lang_pos_embed_config,
            img_pos_embed_config=img_pos_embed_config,
            dtype=dtype,
        )
        # Create adpators for various conditional inputs
        self.lang_adaptor = self.build_condition_adapter(config['lang_adaptor'], in_features=lang_token_dim, out_features=hidden_size)
        self.img_adaptor = self.build_condition_adapter(config['img_adaptor'], in_features=img_token_dim, out_features=hidden_size)
        # A `state` refers to an action or a proprioception vector
        self.state_adaptor = self.build_condition_adapter(
            config['state_adaptor'],
            in_features=state_token_dim * 2,  # state + state mask (indicator)
            out_features=hidden_size)
        # Create the noise scheduler
        # DDPM is the training scheduler; DPM-Solver++ is used for sampling.
        noise_scheduler_config = config['noise_scheduler']
        self.noise_scheduler = DDPMScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
            clip_sample=noise_scheduler_config['clip_sample'],
        )
        self.noise_scheduler_sample = DPMSolverMultistepScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
        )
        self.num_train_timesteps = noise_scheduler_config['num_train_timesteps']
        self.num_inference_timesteps = noise_scheduler_config['num_inference_timesteps']
        self.prediction_type = noise_scheduler_config['prediction_type']
        self.pred_horizon = pred_horizon
        self.action_dim = action_dim
        # Report the total trainable parameter count of model + adaptors.
        print("Diffusion params: %e" %
              sum([p.numel() for p in self.model.parameters()] + [p.numel() for p in self.lang_adaptor.parameters()] +
                  [p.numel()
                   for p in self.img_adaptor.parameters()] + [p.numel() for p in self.state_adaptor.parameters()]))
def build_condition_adapter(self, projector_type, in_features, out_features):
projector = None
if projector_type == 'linear':
projector = nn.Linear(in_features, out_features)
else:
mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
if mlp_gelu_match:
mlp_depth = int(mlp_gelu_match.group(1))
modules = [nn.Linear(in_features, out_features)]
for _ in range(1, mlp_depth):
modules.append(nn.GELU(approximate="tanh"))
modules.append(nn.Linear(out_features, out_features))
projector = nn.Sequential(*modules)
if projector is None:
raise ValueError(f'Unknown projector type: {projector_type}')
return projector
def adapt_conditions(self, lang_tokens, img_tokens, state_tokens):
adpated_lang = self.lang_adaptor(lang_tokens)
dump_img_adaptor(img_tokens)
adpated_img = self.img_adaptor(img_tokens)
adpated_state = self.state_adaptor(state_tokens)
return adpated_lang, adpated_img, adpated_state
def conditional_sample(self, lang_cond, lang_attn_mask, img_cond, state_traj, action_mask, ctrl_freqs):
    """Sample an action trajectory by reverse diffusion with the DPM-Solver
    multistep scheduler, conditioned on language, image, and state features.

    Args:
        lang_cond: adapted language condition tokens.
        lang_attn_mask: boolean mask over valid language tokens.
        img_cond: adapted image condition tokens.
        state_traj: adapted state token trajectory.
        action_mask: 0-1 float availability mask, broadcast over the horizon.
        ctrl_freqs: per-sample control frequency conditioning.

    Returns:
        (batch, pred_horizon, action_dim) tensor of sampled actions with
        invalid action dimensions zeroed by `action_mask`.
    """
    device = state_traj.device
    dtype = state_traj.dtype
    # Start the reverse process from pure Gaussian noise.
    noisy_action = torch.randn(size=(state_traj.shape[0], self.pred_horizon, self.action_dim), dtype=dtype, device=device)
    action_mask = action_mask.expand(-1, self.pred_horizon, -1)
    # Set step values
    self.noise_scheduler_sample.set_timesteps(self.num_inference_timesteps)
    for t in self.noise_scheduler_sample.timesteps:
        # Prepare state-action trajectory: append the mask channel-wise,
        # project through the state adaptor, and prefix the state tokens.
        action_traj = torch.cat([noisy_action, action_mask], dim=2)
        action_traj = self.state_adaptor(action_traj)
        state_action_traj = torch.cat([state_traj, action_traj], dim=1)
        # dump: export hook recording the DiT inputs for quantization calibration
        dump_dit(state_action_traj, ctrl_freqs, t, lang_cond, img_cond, lang_attn_mask)
        # Predict the model output
        model_output = self.model(state_action_traj,
                                  ctrl_freqs,
                                  t.unsqueeze(-1).to(device),
                                  lang_cond,
                                  img_cond,
                                  lang_mask=lang_attn_mask)
        # Compute previous actions: x_t -> x_t-1
        noisy_action = self.noise_scheduler_sample.step(model_output, t, noisy_action).prev_sample
        # Keep the running sample in the conditioning dtype (scheduler may upcast).
        noisy_action = noisy_action.to(state_traj.dtype)
    # Finally apply the action mask to mask invalid action dimensions
    noisy_action = noisy_action * action_mask
    return noisy_action
def compute_loss(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_gt, action_mask,
                 ctrl_freqs) -> torch.Tensor:
    """Diffusion training loss (MSE against the noise or the clean actions).

    lang_tokens: (batch_size, lang_len, lang_token_dim)
    lang_attn_mask: (batch_size, lang_len), True-False bool mask of valid
        language tokens.
    img_tokens: (batch_size, img_len, img_token_dim)
    state_tokens: (batch_size, 1, state_token_dim)
    action_gt: (batch_size, horizon, state_token_dim), ground-truth actions
    action_mask: (batch_size, 1, state_token_dim), a 0-1 **float** tensor.
    ctrl_freqs: (batch_size,), control frequency for each sample.
    return: loss_value, a scalar tensor
    """
    batch_size = lang_tokens.shape[0]
    device = lang_tokens.device
    # Forward diffusion: corrupt the clean actions with Gaussian noise at a
    # random timestep per sample.
    eps = torch.randn(action_gt.shape, dtype=action_gt.dtype, device=device)
    timesteps = torch.randint(0, self.num_train_timesteps, (batch_size, ), device=device).long()
    noised_actions = self.noise_scheduler.add_noise(action_gt, eps, timesteps)
    # Build the [state; noisy actions] sequence, then append the
    # availability mask along the channel dimension.
    traj = torch.cat([state_tokens, noised_actions], dim=1)
    expanded_mask = action_mask.expand(-1, traj.shape[1], -1)
    traj = torch.cat([traj, expanded_mask], dim=2)
    # Project every condition into the model's hidden size.
    lang_cond, img_cond, traj = self.adapt_conditions(lang_tokens, img_tokens, traj)
    # Predict the denoised result.
    pred = self.model(traj, ctrl_freqs, timesteps, lang_cond, img_cond, lang_mask=lang_attn_mask)
    pred_type = self.prediction_type
    if pred_type == 'epsilon':
        target = eps
    elif pred_type == 'sample':
        target = action_gt
    else:
        raise ValueError(f"Unsupported prediction type {pred_type}")
    return F.mse_loss(pred, target)
def predict_action(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_mask, ctrl_freqs):
    """Sample an action chunk conditioned on language, images, and state.

    lang_tokens: (batch_size, lang_len, lang_token_dim)
    lang_attn_mask: (batch_size, lang_len), True-False bool mask of valid
        language tokens.
    img_tokens: (batch_size, img_len, img_token_dim)
    state_tokens: (batch_size, 1, state_token_dim)
    action_mask: (batch_size, 1, action_dim), a 0-1 **float** tensor.
    ctrl_freqs: (batch_size,), control frequency for each sample.
    return: (batch_size, horizon, action_dim), predicted action sequence
    """
    # Append the availability mask to the state token channel-wise, then
    # project every condition into the hidden size.
    masked_state = torch.cat([state_tokens, action_mask], dim=2)
    lang_cond, img_cond, state_traj = self.adapt_conditions(lang_tokens, img_tokens, masked_state)
    # Reverse-diffusion sampling.
    return self.conditional_sample(lang_cond, lang_attn_mask, img_cond, state_traj, action_mask, ctrl_freqs)
def forward(self, *args, **kwargs) -> torch.Tensor:
    """Training entry point: alias for `compute_loss` (see its docstring for arguments)."""
    return self.compute_loss(*args, **kwargs)
class RoboticDiffusionTransformerModel_Dump(object):
    """RDT policy wrapper used during BPU export to dump calibration data.

    Encodes camera images with a frozen SigLIP vision tower, packs the robot
    proprioception into the unified state vector, and runs the diffusion
    policy (`RDTRunner_Dump`) to predict an action chunk. The text encoder is
    deliberately not instantiated (see `__init__`); language conditioning is
    supplied as precomputed embeddings.
    """

    def __init__(
        self,
        args,
        device="cuda",
        dtype=torch.bfloat16,
        image_size=None,
        control_frequency=25,
        pretrained=None,
        pretrained_vision_encoder_name_or_path=None,
    ):
        """
        Args:
            args: parsed config dict (contents of base.yaml).
            device: device the policy and vision tower run on.
            dtype: weight dtype used for inference.
            image_size: optional resize target applied to each input image.
            control_frequency: control frequency fed to the policy.
            pretrained: checkpoint file or pretrained directory for the policy.
            pretrained_vision_encoder_name_or_path: SigLIP weights location.
        """
        self.args = args
        self.dtype = dtype
        self.image_size = image_size
        self.device = device
        self.control_frequency = control_frequency
        # We do not use the text encoder due to limited GPU memory
        # self.text_tokenizer, self.text_model = self.get_text_encoder(pretrained_text_encoder_name_or_path)
        self.image_processor, self.vision_model = self.get_vision_encoder(pretrained_vision_encoder_name_or_path)
        self.policy = self.get_policy(pretrained)
        self.reset()

    def get_policy(self, pretrained):
        """Build the RDT policy: fresh init when `pretrained` is None or a single
        checkpoint file, otherwise load from a pretrained directory."""
        # Initialize model with arguments
        if pretrained is None or os.path.isfile(pretrained):
            img_cond_len = (self.args["common"]["img_history_size"] * self.args["common"]["num_cameras"] *
                            self.vision_model.num_patches)
            _model = RDTRunner_Dump(
                action_dim=self.args["common"]["state_dim"],
                pred_horizon=self.args["common"]["action_chunk_size"],
                config=self.args["model"],
                lang_token_dim=self.args["model"]["lang_token_dim"],
                img_token_dim=self.args["model"]["img_token_dim"],
                state_token_dim=self.args["model"]["state_token_dim"],
                max_lang_cond_len=self.args["dataset"]["tokenizer_max_length"],
                img_cond_len=img_cond_len,
                img_pos_embed_config=[
                    # No initial pos embed in the last grid size
                    # since we've already done in ViT
                    (
                        "image",
                        (
                            self.args["common"]["img_history_size"],
                            self.args["common"]["num_cameras"],
                            # negative size marks "no initial pos embed" for this axis
                            -self.vision_model.num_patches,
                        ),
                    ),
                ],
                lang_pos_embed_config=[
                    # Similarly, no initial pos embed for language
                    ("lang", -self.args["dataset"]["tokenizer_max_length"]),
                ],
                dtype=self.dtype,
            )
        else:
            _model = RDTRunner_Dump.from_pretrained(pretrained)
        return _model

    def get_text_encoder(self, pretrained_text_encoder_name_or_path):
        """Build the T5 tokenizer + encoder (unused by default, see `__init__`)."""
        text_embedder = T5Embedder(
            from_pretrained=pretrained_text_encoder_name_or_path,
            model_max_length=self.args["dataset"]["tokenizer_max_length"],
            device=self.device,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
        return tokenizer, text_encoder

    def get_vision_encoder(self, pretrained_vision_encoder_name_or_path):
        """Build the SigLIP vision tower and its image processor."""
        vision_encoder = SiglipVisionTower(vision_tower=pretrained_vision_encoder_name_or_path, args=None)
        image_processor = vision_encoder.image_processor
        return image_processor, vision_encoder

    def reset(self):
        """Put all sub-models in eval mode and move them to the target device/dtype."""
        device = self.device
        weight_dtype = self.dtype
        self.policy.eval()
        # self.text_model.eval()
        self.vision_model.eval()
        self.policy = self.policy.to(device, dtype=weight_dtype)
        # self.text_model = self.text_model.to(device, dtype=weight_dtype)
        self.vision_model = self.vision_model.to(device, dtype=weight_dtype)

    def load_pretrained_weights(self, pretrained=None):
        """Load policy weights from a .pt (DeepSpeed-style 'module' dict) or
        .safetensors checkpoint. No-op when `pretrained` is None."""
        if pretrained is None:
            return
        print(f"Loading weights from {pretrained}")
        filename = os.path.basename(pretrained)
        if filename.endswith(".pt"):
            checkpoint = torch.load(pretrained)
            self.policy.load_state_dict(checkpoint["module"])
        elif filename.endswith(".safetensors"):
            from safetensors.torch import load_model
            load_model(self.policy, pretrained)
        else:
            raise NotImplementedError(f"Unknown checkpoint format: {pretrained}")

    def encode_instruction(self, instruction, device="cuda"):
        """Encode an instruction string into language embeddings.

        NOTE(review): `self.text_tokenizer`/`self.text_model` are never created
        in `__init__` (the text encoder is commented out), so this method only
        works if they are attached externally — confirm before calling.
        """
        tokens = self.text_tokenizer(instruction, return_tensors="pt", padding="longest",
                                     truncation=True)["input_ids"].to(device)
        tokens = tokens.view(1, -1)
        with torch.no_grad():
            pred = self.text_model(tokens).last_hidden_state.detach()
        return pred

    def _format_joint_to_state(self, joints):
        """Pack raw joint values into the unified state vector and build the
        per-dimension availability mask.

        Returns (state, state_elem_mask) of shapes
        (B, N, state_token_dim) and (B, state_token_dim).
        """
        # Rescale the gripper to the range of [0, 1]
        # NOTE(review): all 6 channels are divided by 180 here — confirm this
        # matches the robot's joint/gripper ranges.
        joints = joints / torch.tensor(
            [[[180, 180, 180, 180, 180, 180]]],
            device=joints.device,
            dtype=joints.dtype,
        )
        B, N, _ = joints.shape
        state = torch.zeros(
            (B, N, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        # Fill into the unified state vector
        state[:, :, AGILEX_STATE_INDICES] = joints
        # Assemble the mask indicating each dimension's availability
        state_elem_mask = torch.zeros(
            (B, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        state_elem_mask[:, AGILEX_STATE_INDICES] = 1
        return state, state_elem_mask

    def _unformat_action_to_joint(self, action):
        """Extract the robot's joint channels from the unified action vector and
        undo the [0, 1] rescaling applied in `_format_joint_to_state`."""
        action_indices = AGILEX_STATE_INDICES
        joints = action[:, :, action_indices]
        # Rescale the gripper back to the action range
        # Note that the action range and proprioception range are different
        # for Mobile ALOHA robot
        joints = joints * torch.tensor(
            [[[180, 180, 180, 180, 180, 180]]],
            device=joints.device,
            dtype=joints.dtype,
        )
        return joints

    @torch.no_grad()
    def step(self, proprio, images, text_embeds):
        """Predict one action chunk.

        Args:
            proprio: proprioception tensor for the current step (batched to
                (1, 1, J) internally).
            images: list of camera frames ordered (3 cameras x 2 history
                steps); entries may be None (replaced by a background image),
                numpy arrays, or PIL images.
            text_embeds: precomputed language embeddings.

        Returns:
            float32 joint-space action chunk, (1, horizon, J).
        """
        device = self.device
        dtype = self.dtype
        # The background image used for padding
        background_color = np.array([int(x * 255) for x in self.image_processor.image_mean], dtype=np.uint8).reshape(1, 1, 3)
        background_image = (np.ones(
            (
                self.image_processor.size["height"],
                self.image_processor.size["width"],
                3,
            ),
            dtype=np.uint8,
        ) * background_color)
        # Preprocess the images by order and encode them
        image_tensor_list = []
        for image in images:
            if image is None:
                # Replace it with the background image
                image = PImage.fromarray(background_image)
            else:
                # Convert numpy array to PIL Image if needed
                if isinstance(image, np.ndarray):
                    image = PImage.fromarray(image)
            if self.image_size is not None:
                # FIX: was `self.data_args.image_size`, but this class has no
                # `data_args` attribute (only `self.image_size` from __init__),
                # so any non-None image_size raised AttributeError.
                image = transforms.Resize(self.image_size)(image)
            if self.args["dataset"].get("auto_adjust_image_brightness", False):
                # Brighten very dark frames (mean brightness <= 0.15).
                pixel_values = list(image.getdata())
                average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
                if average_brightness <= 0.15:
                    image = transforms.ColorJitter(brightness=(1.75, 1.75))(image)
            if self.args["dataset"].get("image_aspect_ratio", "pad") == "pad":

                def expand2square(pil_img, background_color):
                    # Pad the shorter side with the processor's mean color so the
                    # image becomes square without distortion.
                    width, height = pil_img.size
                    if width == height:
                        return pil_img
                    elif width > height:
                        result = PImage.new(pil_img.mode, (width, width), background_color)
                        result.paste(pil_img, (0, (width - height) // 2))
                        return result
                    else:
                        result = PImage.new(pil_img.mode, (height, height), background_color)
                        result.paste(pil_img, ((height - width) // 2, 0))
                        return result

                image = expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean))
            image = self.image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
            image_tensor_list.append(image)
        image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype)
        image_embeds = self.vision_model(image_tensor).detach()
        # Flatten all cameras/history into one token sequence of width hidden_size.
        image_embeds = image_embeds.reshape(-1, self.vision_model.hidden_size).unsqueeze(0)
        # Prepare the proprioception states and the control frequency
        joints = proprio.to(device).unsqueeze(0)  # (1, 1, 14)
        states, state_elem_mask = self._format_joint_to_state(joints)  # (1, 1, 128), (1, 128)
        states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to(device, dtype=dtype)
        states = states[:, -1:, :]  # (1, 1, 128)
        ctrl_freqs = torch.tensor([self.control_frequency]).to(device)
        text_embeds = text_embeds.to(device, dtype=dtype)
        # Predict the next action chunk given the inputs
        trajectory = self.policy.predict_action(
            lang_tokens=text_embeds,
            lang_attn_mask=torch.ones(text_embeds.shape[:2], dtype=torch.bool, device=text_embeds.device),
            img_tokens=image_embeds,
            state_tokens=states,
            action_mask=state_elem_mask.unsqueeze(1),
            ctrl_freqs=ctrl_freqs,
        )
        trajectory = self._unformat_action_to_joint(trajectory).to(torch.float32)
        return trajectory
def get_training_samples(data_dirs, num_samples=5, instructions_per_episode=1):
    """Randomly draw calibration samples from one or more HDF5 episode datasets.

    Args:
        data_dirs: a single directory path (str) or a list of directory paths;
            each is walked recursively for ``*.hdf5`` episode files.
        num_samples: total number of samples to generate across all directories.
        instructions_per_episode: number of instruction variants per episode;
            one index in ``[0, instructions_per_episode)`` is drawn per sample.

    Returns:
        List of dicts with keys ``multi_cam_images`` (two history frames per
        camera), ``joints`` (action at the sampled step), ``lang_embed``,
        ``lang_str``, ``source`` and ``step``. May be shorter than
        ``num_samples`` if the datasets yield too few usable episodes.
    """
    training_samples = []
    # Handle both single directory and list of directories
    if isinstance(data_dirs, str):
        data_dirs = [data_dirs]
    logger.info(f"Get Training Data From: {len(data_dirs)} dataset(s).")
    # First, collect all available episode files from all directories
    episode_files = []
    for data_dir in data_dirs:
        if not os.path.isdir(data_dir):
            logger.warning(f"Directory not found: {data_dir}, skipping")
            continue
        for root, _dirs, files in os.walk(data_dir):
            for file in files:
                if file.endswith('.hdf5'):
                    episode_files.append(os.path.join(root, file))
    if not episode_files:
        logger.warning("No episode files found in the provided directories")
        return training_samples
    logger.info(f"Found {len(episode_files)} episode files across all datasets.")
    # FIX: bound the sampling loop. The original `while` retried forever when
    # every episode was unreadable or had fewer than 2 steps.
    max_attempts = max(num_samples, 1) * 20
    attempts = 0
    while len(training_samples) < num_samples and attempts < max_attempts:
        attempts += 1
        # Randomly select an episode file
        file_path = np.random.choice(episode_files)
        try:
            with h5py.File(file_path, 'r') as f:
                observations = f['observations']
                actions = f['action'][:]
                images = observations['images']
                qpos = observations['qpos'][:]
                num_steps = len(qpos)
                if num_steps <= 1:
                    # Need at least 2 steps to assemble the 2-frame image history.
                    continue
                # Language embedding / instruction text (both optional)
                lang_step_idx = int(np.random.randint(0, max(instructions_per_episode, 1)))
                instructions_dir = os.path.join(os.path.dirname(file_path), "instructions")
                lang_embed, lang_str = None, None
                lang_embed_path = os.path.join(instructions_dir, f"lang_embed_{lang_step_idx}.pt")
                if os.path.exists(lang_embed_path):
                    try:
                        lang_embed = torch.load(lang_embed_path, map_location="cpu")
                    except Exception as e:
                        logger.error(f"Error reading {lang_embed_path}: {e}")
                lang_str_path = os.path.join(instructions_dir, f"txt_lang_embed_{lang_step_idx}.txt")
                if os.path.exists(lang_str_path):
                    try:
                        with open(lang_str_path, "r", encoding="utf-8") as tf:
                            lang_str = tf.read().strip()
                    except Exception as e:
                        logger.error(f"Error reading {lang_str_path}: {e}")
                lang_str = lang_str or ""
                # Draw a random timestep and gather a 2-frame history per camera
                step_idx = np.random.randint(0, num_steps)
                multi_cam_images = {}
                # Use the first cam_high frame to determine the image resolution
                ref_img = cv2.imdecode(np.frombuffer(images['cam_high'][0], np.uint8), cv2.IMREAD_COLOR)
                IMG_HEIGHT, IMG_WIDTH = ref_img.shape[:2]
                ground_image = np.zeros((IMG_HEIGHT, IMG_WIDTH, 3), dtype=np.uint8)
                for cam_name in ['cam_high', 'cam_left_wrist', 'cam_right_wrist']:
                    if cam_name in images:
                        cam_images = []
                        # Decode the previous frame (if any) and the current frame
                        for i in range(max(step_idx - 1, 0), step_idx + 1):
                            img = cv2.imdecode(np.frombuffer(images[cam_name][i], np.uint8), cv2.IMREAD_COLOR)
                            cam_images.append(img)
                        if len(cam_images) < 2:
                            # step_idx == 0: duplicate the single available frame
                            cam_images = [cam_images[0]] * 2
                        multi_cam_images[cam_name] = cam_images
                    else:
                        # Missing camera: substitute an all-black frame pair
                        multi_cam_images[cam_name] = [ground_image] * 2
                training_samples.append({
                    'multi_cam_images': multi_cam_images,
                    'joints': actions[step_idx],
                    'lang_embed': lang_embed,
                    'lang_str': lang_str,
                    'source': file_path,
                    'step': step_idx
                })
                logger.debug(f"TimeStep: {step_idx}, Sample: {file_path}")
        except Exception as e:
            logger.error(f"Failed: {file_path} : {e}")
            continue
    if len(training_samples) < num_samples:
        logger.warning(f"Only collected {len(training_samples)}/{num_samples} samples after {attempts} attempts.")
    logger.info(f"Total Num: {len(training_samples)}.")
    return training_samples
def main(config_path):
    """Drive the full RDT export pipeline from a JSON config file.

    Steps: load config -> lay out output workspaces -> build a dump model and
    generate quantization calibration data from the datasets -> rebuild a CPU
    model and export the image/state/language adaptors and the DiT core to ONNX.
    """
    with open(config_path, "r") as f:
        cfg = json.load(f)
    export_info = cfg.get("export", {})
    opt = ExportConfig(
        task_id=cfg.get("task_id"),
        output_path=os.path.join(export_info.get("output_path", "."), cfg.get("task_id", "")),
        model_path=export_info.get("model_path"),
        calibration_num=export_info.get("calibration_num", 100),
        dataset_path=export_info.get("dataset_path"),
        gpu_id=cfg.get("gpu_id", "0"),
        march=export_info.get("march"),
        model_type=export_info.get("model_type"),
        # NOTE(review): hard-coded, machine-specific weights path — consider
        # moving into the JSON config.
        pretrained_vision_encoder_name_or_path="/home/qi.xiong/DualArm/Work_Docker/RDT/weights/siglip-so400m-patch14-384",
        ctrl_freq=export_info.get("ctrl_freq", 25),
        cal_data_device=cfg.get("cal_data_device", "cuda"),
        lang_calibration_num=export_info.get("lang_calibration_num", 1)
    )
    if opt.model_type not in ["170M", "1B"]:
        raise ValueError(f"RDT ONLY SUPPORT 170M AND 1B, BUT GOT {opt.model_type}")
    logger.info(f"Export config loaded: {opt}")
    os.makedirs(opt.output_path, exist_ok=True)
    # Prepare output workspace
    ## BPU_RDT_Policy: copy the size-matched base config next to the exported models
    bpu_rdt_name = "BPU_RDT_Policy_170M" if opt.model_type == "170M" else "BPU_RDT_Policy_1B"
    bpu_rdt_path = os.path.join(opt.output_path, bpu_rdt_name)
    os.makedirs(bpu_rdt_path, exist_ok=True)
    os.system(f"cp configs/base_{opt.model_type}.yaml {bpu_rdt_path}/base.yaml")
    rdt_config_path = os.path.join(bpu_rdt_path, "base.yaml")
    ## Test_Datas
    test_data_name = "test_data"
    test_data_path = os.path.join(opt.output_path, test_data_name)
    os.makedirs(test_data_path, exist_ok=True)
    ## instruction: one sub-directory per dataset
    instruction_ws_name = "instructions"
    instruction_ws_path = os.path.join(opt.output_path, instruction_ws_name)
    os.makedirs(instruction_ws_path, exist_ok=True)
    for name in os.listdir(opt.dataset_path):
        os.makedirs(os.path.join(instruction_ws_path, name), exist_ok=True)
    ## image adaptor
    # Globals are read by the dump hooks (dump_img_adaptor etc.) during the
    # calibration pass below.
    global img_adaptor_cal_ws
    img_adaptor_ws_name = "img_adaptor_WorkSpace"
    img_adaptor_cal_name = "rdt_image_adaptor_calibration"
    img_adaptor_name = "rdt_image_adaptor.onnx"
    img_adaptor_config_name = "config.yaml"
    img_adaptor_ws = os.path.join(opt.output_path, img_adaptor_ws_name)
    img_adaptor_path = os.path.join(img_adaptor_ws, img_adaptor_name)
    img_adaptor_cal_ws = os.path.join(img_adaptor_ws, img_adaptor_cal_name)
    os.makedirs(img_adaptor_ws, exist_ok=True)
    os.makedirs(img_adaptor_cal_ws, exist_ok=True)
    ## action adaptor: two fixed-shape exports (state token and action trajectory)
    state_adaptor_name1 = "rdt_state_adaptor_1x1x256.onnx"
    state_adaptor_path1 = os.path.join(opt.output_path, bpu_rdt_name, state_adaptor_name1)
    state_adaptor_name2 = "rdt_state_adaptor_1x64x256.onnx"
    state_adaptor_path2 = os.path.join(opt.output_path, bpu_rdt_name, state_adaptor_name2)
    ## lang adaptor
    lang_adaptor_name = "rdt_lang_adaptor.onnx"
    lang_adaptor_path = os.path.join(opt.output_path, bpu_rdt_name, lang_adaptor_name)
    ## DiT Policy
    dit_ws_name = "DiT_WorkSpace"
    dit_cal_name = "rdt_dit_calibration"
    dit_name = "rdt_dit.onnx"
    dit_config_name = "config.yaml"
    dit_json_name = "quant_config.json"
    dit_ws = os.path.join(opt.output_path, dit_ws_name)
    dit_path = os.path.join(dit_ws, dit_name)
    dit_cal_ws = os.path.join(dit_ws, dit_cal_name)
    os.makedirs(dit_ws, exist_ok=True)
    os.makedirs(dit_cal_ws, exist_ok=True)
    # One calibration sub-directory per DiT input tensor (read by dump_dit)
    global dit_cal_path_x, dit_cal_path_freq, dit_cal_path_t, dit_cal_path_lang_c, dit_cal_path_img_c, dit_cal_path_lang_mask
    dit_cal_path_x = os.path.join(dit_cal_ws, "x")
    os.makedirs(dit_cal_path_x, exist_ok=True)
    dit_cal_path_freq = os.path.join(dit_cal_ws, "freq")
    os.makedirs(dit_cal_path_freq, exist_ok=True)
    dit_cal_path_t = os.path.join(dit_cal_ws, "t")
    os.makedirs(dit_cal_path_t, exist_ok=True)
    dit_cal_path_lang_c = os.path.join(dit_cal_ws, "lang_c")
    os.makedirs(dit_cal_path_lang_c, exist_ok=True)
    dit_cal_path_img_c = os.path.join(dit_cal_ws, "img_c")
    os.makedirs(dit_cal_path_img_c, exist_ok=True)
    dit_cal_path_lang_mask = os.path.join(dit_cal_ws, "lang_mask")
    os.makedirs(dit_cal_path_lang_mask, exist_ok=True)
    # Prepare calibration data: build the dump-instrumented model
    with open(rdt_config_path, "r") as f:
        rdt_config = yaml.safe_load(f)
    dump_model = create_dump_model(
        args=rdt_config,
        dtype=torch.float32,
        pretrained=opt.model_path,
        pretrained_vision_encoder_name_or_path=opt.pretrained_vision_encoder_name_or_path,
        control_frequency=opt.ctrl_freq,
        device=opt.cal_data_device
    )
    # Load training data from all datasets
    global dump_cnt, dump_dataset_name
    test_data_cnt = 0
    # Collect all dataset paths
    all_dataset_paths = []
    for dump_dataset_name in os.listdir(opt.dataset_path):
        dump_dataset_path = os.path.join(opt.dataset_path, dump_dataset_name)
        if os.path.isdir(dump_dataset_path):
            all_dataset_paths.append(dump_dataset_path)
    # Get training samples from all datasets together
    training_samples = get_training_samples(all_dataset_paths, num_samples=opt.calibration_num, instructions_per_episode=opt.lang_calibration_num)
    if len(training_samples) == 0:
        logger.warning("No training samples found, skipping calibration data generation")
    else:
        # Only process up to the number of samples we actually have
        num_samples_to_process = min(len(training_samples), opt.calibration_num)
        for dump_cnt in range(num_samples_to_process):
            sample = training_samples[dump_cnt]
            # Extract dataset name from the sample's source path
            sample_source = sample['source']
            dump_dataset_name = os.path.basename(os.path.dirname(os.path.dirname(sample_source)))
            # Persist the instruction embedding + text for later deployment use
            instruction_emb = {
                "lang_cond": sample["lang_embed"].float().cpu(),
                "lang_str": sample["lang_str"]
            }
            ins_str_name = sample["lang_str"].replace(" ", "_") + "__"
            torch.save(instruction_emb, os.path.join(instruction_ws_path, dump_dataset_name, f"{ins_str_name}.pt"))
            # 3 cameras x 2 history frames, in the order the policy expects
            image_arrs = [
                sample['multi_cam_images']['cam_high'][0],
                sample['multi_cam_images']['cam_right_wrist'][0],
                sample['multi_cam_images']['cam_left_wrist'][0],
                sample['multi_cam_images']['cam_high'][1],
                sample['multi_cam_images']['cam_right_wrist'][1],
                sample['multi_cam_images']['cam_left_wrist'][1],
            ]
            test_data_cnt += 1
            # Save raw test inputs so the exported model can be validated on-device
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_high_0.npy"), sample['multi_cam_images']['cam_high'][0])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_right_wrist_0.npy"), sample['multi_cam_images']['cam_right_wrist'][0])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_left_wrist_0.npy"), sample['multi_cam_images']['cam_left_wrist'][0])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_high_1.npy"), sample['multi_cam_images']['cam_high'][1])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_right_wrist_1.npy"), sample['multi_cam_images']['cam_right_wrist'][1])
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_cam_left_wrist_1.npy"), sample['multi_cam_images']['cam_left_wrist'][1])
            images = [PImage.fromarray(arr) if arr is not None else None for arr in image_arrs]
            proprio = torch.from_numpy(sample['joints']).float().unsqueeze(0).to(opt.cal_data_device)
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_joints.npy"), sample['joints'])
            lang_embeddings = sample['lang_embed'].float().unsqueeze(0).to(opt.cal_data_device)
            torch.save(lang_embeddings, os.path.join(test_data_path, f"{test_data_cnt}_lang_embeddings.pt"))
            # Run one full inference; the dump hooks write the calibration tensors
            dump_model.reset()
            begin_time = time()
            actions = dump_model.step(proprio=proprio, images=images, text_embeds=lang_embeddings).squeeze(0).cpu().numpy()
            np.save(os.path.join(test_data_path, f"{test_data_cnt}_actions.npy"), actions)
            logger.debug(f"Dump: Cost {(1000*(time() - begin_time)):.1f} ms, cnt: {dump_cnt}, name: {dump_dataset_name}")
    logger.info("End Generate Calibration Data.")
    del dump_model
    # Load RDT Policy: CPU model for ONNX export
    with open(rdt_config_path, "r") as f:
        rdt_config = yaml.safe_load(f)
    model = create_model(
        args=rdt_config,
        dtype=torch.float32,
        pretrained=opt.model_path,
        pretrained_vision_encoder_name_or_path=opt.pretrained_vision_encoder_name_or_path,
        control_frequency=opt.ctrl_freq,
        device="cpu"
    )
    # image adaptor: ONNX model
    m = model.policy.img_adaptor
    m.eval()
    input_data = torch.randn(1, 4374, rdt_config['model']['img_token_dim'])  # batch size 1
    output = m(input_data)
    torch.onnx.export(
        m,
        input_data,
        img_adaptor_path,
        opset_version=14,
        do_constant_folding=True,
        input_names=["img_tokens"],
        output_names=["adapted_img"],
        dynamic_axes=None,
        verbose=False
    )
    logger.info("Export RDT [img_adaptor] Model Success.")
    # DiT: export with fixed dummy shapes (65 state+action tokens, 64 lang, 4374 img)
    hidden_size = rdt_config['model']["rdt"]['hidden_size']
    m = model.policy.model
    m = m.eval().cpu()
    x = torch.randn(1, 65, hidden_size)
    freq = torch.tensor([1], dtype=torch.int32)
    t = torch.tensor([10], dtype=torch.int32)
    lang_c = torch.randn(1, 64, hidden_size)
    img_c = torch.randn(1, 4374, hidden_size)
    lang_mask = torch.ones(1, 64, dtype=torch.float32)
    dummy_inputs = (x, freq, t, lang_c, img_c, lang_mask)
    # outputs = m(x, freq, t, lang_c, img_c, lang_mask)
    torch.onnx.export(
        m,
        dummy_inputs,
        dit_path,
        # export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=["x", "freq", "t", "lang_c", "img_c", "lang_mask"],
        output_names=["actions"],
        verbose=False
    )
    logger.info("Export RDT [DiT] Model Success.")
    # state adaptor: two fixed-shape exports for the two call sites
    m = model.policy.state_adaptor
    m.eval()
    input_data = torch.randn(1, 1, 256)  # batch size 1
    output = m(input_data)
    torch.onnx.export(
        m,
        input_data,
        state_adaptor_path1,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=["state_tokens"],
        output_names=["state_traj"],
        dynamic_axes=None,
        verbose=False
    )
    # NOTE(review): uses the root `logging` module here but `logger` elsewhere
    logging.info("Export RDT [state 1x1x256] Model Success.")
    input_data = torch.randn(1, 64, 256)
    output = m(input_data)
    torch.onnx.export(
        m,
        input_data,
        state_adaptor_path2,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=['state_tokens'],
        output_names=['state_traj'],
        dynamic_axes=None,
        verbose=False
    )
    logging.info("Export RDT [state 1x64x256] Model Success.")
    # lang adaptor: dynamic along the token axis (variable instruction length)
    m = model.policy.lang_adaptor
    m.eval()
    input_data = torch.randn(1, 14, 4096)
    output = m(input_data)
    torch.onnx.export(
        m,
        input_data,
        lang_adaptor_path,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=["text_embeds"],
        output_names=["lang_cond"],
        dynamic_axes={
            "text_embeds": {1: "N"},
            "lang_cond": {1: "N"}
        },
        verbose=False
    )
    logger.info("Export RDT [lang adaptor] Model Success.")
######## Prepare Calibration Data
if __name__ == "__main__":
    # NOTE(review): config path is hard-coded for a specific machine — consider
    # reading it from sys.argv or an environment variable.
    main("/home/qi.xiong/DualArm/Work_Docker/RDT/rdt-export/input/config.json")
    logger.info("All Models Have Been Exported Success.")

View File

View File

@ -0,0 +1,82 @@
# Reference: DiffusionPolicy [https://github.com/real-stanford/diffusion_policy]
import torch
from torch.nn.modules.batchnorm import _BatchNorm
class EMAModel:
    """
    Exponential Moving Average of models weights
    """

    def __init__(self, model, update_after_step=0, inv_gamma=1.0, power=2 / 3, min_value=0.0, max_value=0.9999):
        """
        @crowsonkb's notes on EMA Warmup:
            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
            to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
            gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
            at 215.4k steps).

        Args:
            model: the module whose parameters hold the running average; it is
                put in eval mode and frozen here.
            update_after_step (int): steps to wait before EMA updates begin.
            inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
            power (float): Exponential factor of EMA warmup. Default: 2/3.
            min_value (float): The minimum EMA decay rate. Default: 0.
            max_value (float): The maximum EMA decay rate. Default: 0.9999.
        """
        self.update_after_step = update_after_step
        self.inv_gamma = inv_gamma
        self.power = power
        self.min_value = min_value
        self.max_value = max_value
        self.decay = 0.0
        self.optimization_step = 0
        self.averaged_model = model
        self.averaged_model.eval()
        self.averaged_model.requires_grad_(False)

    def get_decay(self, optimization_step):
        """
        Compute the decay factor for the exponential moving average.
        """
        step = max(0, optimization_step - self.update_after_step - 1)
        # Warmup curve: ramps from 0 toward 1 as training progresses.
        value = 1 - (1 + step / self.inv_gamma)**-self.power
        if step <= 0:
            return 0.0
        return max(self.min_value, min(value, self.max_value))

    @torch.no_grad()
    def step(self, new_model):
        """Blend `new_model`'s parameters into the running average in place."""
        self.decay = self.get_decay(self.optimization_step)
        # Walk the two module trees in lockstep and handle each module's own
        # (non-recursive) parameters exactly once.
        for src_module, ema_module in zip(new_model.modules(), self.averaged_model.modules()):
            for src_param, ema_param in zip(src_module.parameters(recurse=False),
                                            ema_module.parameters(recurse=False)):
                if isinstance(src_param, dict):
                    raise RuntimeError('Dict parameter not supported')
                if isinstance(src_module, _BatchNorm) or not src_param.requires_grad:
                    # BatchNorm parameters and frozen parameters are copied verbatim.
                    ema_param.copy_(src_param.to(dtype=ema_param.dtype).data)
                else:
                    # ema = decay * ema + (1 - decay) * new
                    ema_param.mul_(self.decay)
                    ema_param.add_(src_param.data.to(dtype=ema_param.dtype), alpha=1 - self.decay)
        self.optimization_step += 1

View File

@ -0,0 +1,75 @@
import os
from pathlib import Path
from typing import Dict, Optional, Union
from huggingface_hub import PyTorchModelHubMixin
from huggingface_hub.constants import (PYTORCH_WEIGHTS_NAME, SAFETENSORS_SINGLE_FILE)
from huggingface_hub.file_download import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError, is_torch_available
if is_torch_available():
import torch # type: ignore
class CompatiblePyTorchModelHubMixin(PyTorchModelHubMixin):
    """Mixin class to load Pytorch models from the Hub.

    Differs from the stock `PyTorchModelHubMixin` in two ways: saving always
    writes a pickle checkpoint (never safetensors), and loading tries the
    safetensors file first but falls back to the pickle checkpoint, both for
    local directories and Hub repos.
    """

    def _save_pretrained(self, save_directory: Path) -> None:
        """Save weights from a Pytorch model to a local directory."""
        # To bypass saving into safetensor by default
        model_to_save = self.module if hasattr(self, "module") else self  # type: ignore
        torch.save(model_to_save.state_dict(), save_directory / PYTORCH_WEIGHTS_NAME)

    @classmethod
    def _from_pretrained(
        cls,
        *,
        model_id: str,
        revision: Optional[str],
        cache_dir: Optional[Union[str, Path]],
        force_download: bool,
        proxies: Optional[Dict],
        resume_download: Optional[bool],
        local_files_only: bool,
        token: Union[str, bool, None],
        map_location: str = "cpu",
        strict: bool = False,
        **model_kwargs,
    ):
        """Load Pytorch pretrained weights and return the loaded model."""
        # Instantiate first; weights are loaded into this instance below.
        model = cls(**model_kwargs)
        if os.path.isdir(model_id):
            print("Loading weights from local directory")
            # Prefer the safetensors file; fall back to the pickle checkpoint.
            try:
                model_file = os.path.join(model_id, SAFETENSORS_SINGLE_FILE)
                return cls._load_as_safetensor(model, model_file, map_location, strict)
            except FileNotFoundError:
                model_file = os.path.join(model_id, PYTORCH_WEIGHTS_NAME)
                return cls._load_as_pickle(model, model_file, map_location, strict)
        else:
            # Hub repo: same preference order, keyed on which file exists remotely.
            try:
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=SAFETENSORS_SINGLE_FILE,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_safetensor(model, model_file, map_location, strict)
            except EntryNotFoundError:
                model_file = hf_hub_download(
                    repo_id=model_id,
                    filename=PYTORCH_WEIGHTS_NAME,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    token=token,
                    local_files_only=local_files_only,
                )
                return cls._load_as_pickle(model, model_file, map_location, strict)

View File

@ -0,0 +1,159 @@
import torch
import torch.nn as nn
from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
class CLIPVisionTower(nn.Module):
    """A frozen CLIP vision backbone exposing features from a chosen hidden layer.

    `select_feature == 'patch'` drops the leading CLS token; `'cls_patch'`
    keeps it. With `delay_load` the weights are not fetched until
    `load_model()` is called; only the config is kept for shape queries.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()
        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
        # Load eagerly unless deferred; a tower that is going to be unfrozen
        # is always loaded right away.
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the image processor and the frozen vision model (idempotent)."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden layer and token subset from a forward pass."""
        feats = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            return feats[:, 1:]  # drop the CLS token
        if self.select_feature == 'cls_patch':
            return feats
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched image tensor, or a list of single images, into features."""
        if type(images) is list:
            image_features = []
            for image in images:
                forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
                                                output_hidden_states=True)
                image_features.append(self.feature_select(forward_out).to(image.dtype))
            return image_features
        forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                         output_hidden_states=True)
        return self.feature_select(forward_outs).to(images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Fall back to the standalone config when the weights are not loaded.
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        side = self.config.image_size // self.config.patch_size
        return side**2
class CLIPVisionTowerS2(CLIPVisionTower):
    """CLIP tower that extracts multi-scale (S2) features via `s2wrapper`."""

    def __init__(self, vision_tower, args, delay_load=False):
        # BUGFIX: parse the multi-scale configuration *before* calling the
        # parent constructor. The parent may call `load_model()` eagerly
        # (the default `delay_load=False` path), and our `load_model`
        # override below reads `self.s2_image_size` — calling super() first
        # raised AttributeError.
        self.s2_scales = getattr(args, 's2_scales', '336,672,1008')
        self.s2_scales = list(map(int, self.s2_scales.split(',')))
        self.s2_scales.sort()
        self.s2_split_size = self.s2_scales[0]   # smallest scale -> tile/split size
        self.s2_image_size = self.s2_scales[-1]  # largest scale -> model input size

        # Fail fast if the optional multi-scale dependency is missing.
        try:
            from s2wrapper import forward as multiscale_forward
        except ImportError:
            raise ImportError(
                'Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git'
            )
        self.multiscale_forward = multiscale_forward

        super().__init__(vision_tower, args, delay_load)

        # change resize/crop size in preprocessing to the largest image size in s2_scale
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.image_processor.size['shortest_edge'] = self.s2_image_size
            self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

    def load_model(self, device_map=None):
        """Load processor/model and retarget preprocessing to the largest scale."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        self.image_processor.size['shortest_edge'] = self.s2_image_size
        self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

        self.is_loaded = True

    @torch.no_grad()
    def forward_feature(self, images):
        """Single-scale encoding used as the callback for `multiscale_forward`."""
        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype),
                                               output_hidden_states=True)
        image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Multi-scale encoding of a batched tensor or a list of single images."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_feature = self.multiscale_forward(self.forward_feature,
                                                        image.unsqueeze(0),
                                                        img_sizes=self.s2_scales,
                                                        max_split_size=self.s2_split_size)
                image_features.append(image_feature)
        else:
            image_features = self.multiscale_forward(self.forward_feature,
                                                    images,
                                                    img_sizes=self.s2_scales,
                                                    max_split_size=self.s2_split_size)
        return image_features

    @property
    def hidden_size(self):
        # Features from all scales are concatenated channel-wise.
        return self.config.hidden_size * len(self.s2_scales)

View File

@ -0,0 +1,87 @@
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoImageProcessor, AutoModel, Dinov2Model
class DinoV2VisionTower(nn.Module):
    """Frozen DINOv2 encoder exposing last-hidden-state features."""

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        # Load eagerly unless deferred; an unfrozen tower always loads.
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the image processor and the (frozen) DINOv2 backbone."""
        if self.is_loaded:
            print(f'{self.vision_tower_name} is already loaded, `load_model` called again, skipping.')
            return

        self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)  # FIXME: kept frozen for now
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Select from the last hidden state; drop the CLS token for 'patch'."""
        hidden = image_forward_outs.last_hidden_state
        if self.select_feature == 'patch':
            return hidden[:, 1:]  # (B, 1369, 1536)
        if self.select_feature == 'cls_patch':
            return hidden
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor, or a list of single images, into features."""
        if type(images) is list:
            image_features = []
            for image in images:
                out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                image_features.append(self.feature_select(out).to(image.dtype))
            return image_features
        outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
        return self.feature_select(outs).to(images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Before the weights are loaded only the standalone config exists.
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return self.num_patches_per_side**2

View File

@ -0,0 +1,86 @@
import torch
import torch.nn as nn
from transformers import AutoConfig, SiglipImageProcessor, SiglipVisionModel
class SiglipVisionTower(nn.Module):
    """SigLIP vision encoder wrapper (kept in eval mode once loaded)."""

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        # Load eagerly unless deferred; an unfrozen tower always loads.
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Instantiate the SigLIP processor and model (switched to eval mode)."""
        if self.is_loaded:
            print(f'{self.vision_tower_name} is already loaded, `load_model` called again, skipping.')
            return

        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.eval()
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Return patch tokens ('patch') or the pooled output ('cls_patch')."""
        if self.select_feature == 'patch':
            return image_forward_outs.last_hidden_state  # (B, 729, 1536)
        if self.select_feature == 'cls_patch':
            return image_forward_outs.pooler_output  # (B, 1, 1536)
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor, or a list of single images, into features."""
        if type(images) is list:
            image_features = []
            for image in images:
                out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                image_features.append(self.feature_select(out).to(image.dtype))
            return image_features
        outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
        return self.feature_select(outs).to(images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Before the weights are loaded only the standalone config exists.
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return self.num_patches_per_side**2

View File

@ -0,0 +1,111 @@
import torch
from transformers import AutoTokenizer, T5EncoderModel
class T5Embedder:
    """Thin wrapper around a T5 encoder that turns text into embeddings.

    Optionally offloads the second half of the encoder blocks to disk via
    the `use_offload_folder` argument (accelerate device map).
    """

    def __init__(
        self,
        device,
        from_pretrained=None,
        *,
        cache_dir=None,
        hf_token=None,
        use_text_preprocessing=True,
        t5_model_kwargs=None,
        torch_dtype=None,
        use_offload_folder=None,
        model_max_length=120,
        local_files_only=False,
    ):
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        self.cache_dir = cache_dir

        if t5_model_kwargs is None:
            t5_model_kwargs = {
                "low_cpu_mem_usage": True,
                "torch_dtype": self.torch_dtype,
            }

            if use_offload_folder is not None:
                t5_model_kwargs["offload_folder"] = use_offload_folder
                # Keep the embeddings and the first 12 encoder blocks on the
                # target device; offload everything after them to disk.
                device_map = {
                    "shared": self.device,
                    "encoder.embed_tokens": self.device,
                }
                for i in range(24):
                    device_map[f"encoder.block.{i}"] = self.device if i < 12 else "disk"
                device_map["encoder.final_layer_norm"] = "disk"
                device_map["encoder.dropout"] = "disk"
                t5_model_kwargs["device_map"] = device_map
            else:
                t5_model_kwargs["device_map"] = {
                    "shared": self.device,
                    "encoder": self.device,
                }

        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token

        self.tokenizer = AutoTokenizer.from_pretrained(
            from_pretrained,
            model_max_length=model_max_length,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.model = T5EncoderModel.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            **t5_model_kwargs,
        ).eval()
        self.model_max_length = model_max_length

    def get_text_embeddings(self, texts):
        """Tokenize `texts` and return (last_hidden_state, attention_mask)."""
        batch = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding="longest",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        input_ids = batch["input_ids"].to(self.device)
        attention_mask = batch["attention_mask"].to(self.device)
        with torch.no_grad():
            text_encoder_embs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )["last_hidden_state"].detach()
        return text_encoder_embs, attention_mask
if __name__ == "__main__":
    # Smoke test: loads the XXL T5 encoder onto CUDA device 7 (downloads
    # weights on first run). Adjust `device` for machines with fewer GPUs.
    T5Embedder(from_pretrained="google/t5-v1_1-xxl", device='cuda:7')

View File

@ -0,0 +1,304 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT: https://github.com/facebookresearch/DiT
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import math
from collections import OrderedDict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.jit import Final
from timm.models.vision_transformer import Attention, Mlp, RmsNorm, use_fused_attn
#################################################################################
# Embedding Layers for Timesteps and Condition Inptus #
#################################################################################
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=torch.bfloat16):
        super().__init__()
        # Two-layer MLP lifting the sinusoidal embedding to `hidden_size`.
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.dtype = dtype

    def timestep_embedding(self, t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.

        :param t: a 1-D Tensor of N indices, one per batch element. These may
            be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings (cast to self.dtype).
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        # Geometric frequency ladder from 1 down to 1/max_period.
        exponent = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
        freqs = torch.exp(exponent)
        args = t[:, None].float() * freqs[None]
        # Note: cosine half first, then sine half.
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            # Odd target dim: pad with one zero column.
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding.to(self.dtype)

    def forward(self, t):
        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
#################################################################################
# Cross Attention Layers #
#################################################################################
class CrossAttention(nn.Module):
    """
    A cross-attention layer with flash attention.

    Queries come from the token stream `x`; keys/values come from the
    condition stream `c`. Uses PyTorch's fused scaled-dot-product attention
    when available (per timm's `use_fused_attn`), otherwise an explicit
    implementation.
    """
    fused_attn: Final[bool]

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0,
        proj_drop: float = 0,
        norm_layer: nn.Module = nn.LayerNorm,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        # Whether to dispatch to F.scaled_dot_product_attention.
        self.fused_attn = use_fused_attn()

        # Separate projections: queries from x; keys+values (2*dim) from c.
        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        # Optional per-head Q/K normalization (identity when qk_norm=False).
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor, c: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
        """Attend from `x` (B, N, C) to condition `c` (B, L, C).

        mask: (B, L), True for valid condition tokens; None disables masking.
        Returns a tensor of shape (B, N, C).
        """
        B, N, C = x.shape
        _, L, _ = c.shape
        # q: (B, heads, N, head_dim); kv: 2 x (B, heads, L, head_dim).
        q = self.q(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        kv = self.kv(c).reshape(B, L, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        k, v = kv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        # Prepare attn mask (B, L) to mask the condition tokens:
        # broadcast to (B, 1, N, L) so all heads/query rows share it.
        if mask is not None:
            mask = mask.reshape(B, 1, 1, L)
            mask = mask.expand(-1, -1, N, -1)

        if self.fused_attn:
            # Fused kernel: masking and dropout handled internally.
            x = F.scaled_dot_product_attention(query=q,
                                               key=k,
                                               value=v,
                                               dropout_p=self.attn_drop.p if self.training else 0.,
                                               attn_mask=mask)
        else:
            # Manual path: scale, mask with -inf before softmax, then dropout.
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            if mask is not None:
                attn = attn.masked_fill_(mask.logical_not(), float('-inf'))
            attn = attn.softmax(dim=-1)
            if self.attn_drop.p > 0:
                attn = self.attn_drop(attn)
            x = attn @ v

        # Merge heads back and apply the output projection.
        x = x.permute(0, 2, 1, 3).reshape(B, N, C)
        x = self.proj(x)
        if self.proj_drop.p > 0:
            x = self.proj_drop(x)
        return x
#################################################################################
# RDT Block #
#################################################################################
class RDTBlock(nn.Module):
    """
    A RDT block with cross-attention conditioning.

    Three pre-norm residual sub-blocks: self-attention, cross-attention on
    the condition tokens, then a feed-forward network.
    """

    def __init__(self, hidden_size, num_heads, **block_kwargs):
        super().__init__()
        self.norm1 = RmsNorm(hidden_size, eps=1e-6)
        self.attn = Attention(dim=hidden_size,
                              num_heads=num_heads,
                              qkv_bias=True,
                              qk_norm=True,
                              norm_layer=RmsNorm,
                              **block_kwargs)
        self.cross_attn = CrossAttention(hidden_size,
                                         num_heads=num_heads,
                                         qkv_bias=True,
                                         qk_norm=True,
                                         norm_layer=RmsNorm,
                                         **block_kwargs)

        self.norm2 = RmsNorm(hidden_size, eps=1e-6)
        self.ffn = Mlp(in_features=hidden_size,
                       hidden_features=hidden_size,
                       act_layer=lambda: nn.GELU(approximate="tanh"),
                       drop=0)
        self.norm3 = RmsNorm(hidden_size, eps=1e-6)

    def forward(self, x, c, mask=None):
        # Self-attention sub-block (pre-norm + residual).
        x = x + self.attn(self.norm1(x))
        # Cross-attention sub-block conditioning on `c`.
        x = x + self.cross_attn(self.norm2(x), c, mask)
        # Feed-forward sub-block.
        x = x + self.ffn(self.norm3(x))
        return x
class FinalLayer(nn.Module):
    """
    The final layer of RDT: RMS-norm followed by an MLP projection to the
    output channel count.
    """

    def __init__(self, hidden_size, out_channels):
        super().__init__()
        self.norm_final = RmsNorm(hidden_size, eps=1e-6)
        self.ffn_final = Mlp(in_features=hidden_size,
                             hidden_features=hidden_size,
                             out_features=out_channels,
                             act_layer=lambda: nn.GELU(approximate="tanh"),
                             drop=0)

    def forward(self, x):
        return self.ffn_final(self.norm_final(x))
#################################################################################
# Sine/Cosine Positional Embedding Functions #
#################################################################################
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position (must be even)
    pos: a list/array/tensor of positions to be encoded: size (M,)
    out: (M, D) with the sine half first, then the cosine half
    """
    assert embed_dim % 2 == 0
    # Standard transformer frequency schedule: 1 / 10000^(2i/D).
    omega = 1. / 10000**(np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2.))  # (D/2,)

    pos = np.asarray(pos, dtype=np.float64).reshape(-1)  # (M,)
    angles = np.outer(pos, omega)  # (M, D/2), outer product

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
def get_nd_sincos_pos_embed_from_grid(embed_dim, grid_sizes):
    """
    Build an N-D sin-cos positional embedding.

    embed_dim: output dimension for each position
    grid_sizes: the grid sizes in each dimension (K,); any sequence accepted.
    out: (grid_sizes[0], ..., grid_sizes[K-1], D)

    The embedding dimension is divided uniformly among the axes whose size
    is > 1; axes of size <= 1 carry no positional information.
    """
    grid_sizes = tuple(grid_sizes)  # accept lists/iterables, not only tuples
    num_sizes = len(grid_sizes)
    # For grid size of 1, we do not need to add any positional embedding
    num_valid_sizes = len([x for x in grid_sizes if x > 1])
    emb = np.zeros(grid_sizes + (embed_dim, ))
    # BUGFIX: if every axis is trivial there is nothing to embed; the
    # original code divided by zero below. Return the all-zero embedding.
    if num_valid_sizes == 0:
        return emb
    # Uniformly divide the embedding dimension for each grid size
    dim_for_each_grid = embed_dim // num_valid_sizes
    # To make it even (sin and cos halves must match)
    if dim_for_each_grid % 2 != 0:
        dim_for_each_grid -= 1
    valid_size_idx = 0
    for size_idx in range(num_sizes):
        grid_size = grid_sizes[size_idx]
        if grid_size <= 1:
            continue
        pos = np.arange(grid_size)
        # Reshape the 1-D embedding so it broadcasts along all other axes.
        posemb_shape = [1] * len(grid_sizes) + [dim_for_each_grid]
        posemb_shape[size_idx] = -1
        emb[..., valid_size_idx * dim_for_each_grid:(valid_size_idx + 1) * dim_for_each_grid] += \
            get_1d_sincos_pos_embed_from_grid(dim_for_each_grid, pos).reshape(posemb_shape)
        valid_size_idx += 1
    return emb
def get_multimodal_cond_pos_embed(embed_dim, mm_cond_lens: OrderedDict, embed_modality=True):
    """
    Generate position embeddings for multimodal conditions.

    mm_cond_lens: an OrderedDict containing (modality name, token length)
        pairs. For the `"image"` modality, the value can be a
        multi-dimensional tuple. If the length < 0, it means there is no
        position embedding for the modality or grid.
    embed_modality: whether to embed the modality information. Default True.
    """
    num_modalities = len(mm_cond_lens)
    modality_pos_embed = np.zeros((num_modalities, embed_dim))
    if embed_modality:
        # First half of the channels identifies the modality...
        modality_pos_embed[:, :embed_dim // 2] = get_1d_sincos_pos_embed_from_grid(
            embed_dim // 2, torch.arange(num_modalities))
        # ...second half carries within-modality positions.
        pos_embed_dim = embed_dim // 2
    else:
        # The whole embedding is used for positions.
        pos_embed_dim = embed_dim

    per_modality_embeds = []
    for idx, (modality, cond_len) in enumerate(mm_cond_lens.items()):
        if modality == "image" and isinstance(cond_len, (tuple, list)):
            # Multi-dimensional image grid: N-D sin-cos embedding, with
            # negative axes collapsed to size 1 (no positional signal).
            all_grid_sizes = tuple(abs(x) for x in cond_len)
            embed_grid_sizes = tuple(x if x > 0 else 1 for x in cond_len)
            sincos = get_nd_sincos_pos_embed_from_grid(pos_embed_dim, embed_grid_sizes)
            cond_pos_embed = np.zeros(all_grid_sizes + (embed_dim, ))
            cond_pos_embed[..., -pos_embed_dim:] += sincos
            cond_pos_embed = cond_pos_embed.reshape((-1, embed_dim))
        else:
            sincos = get_1d_sincos_pos_embed_from_grid(pos_embed_dim, torch.arange(cond_len if cond_len > 0 else 1))
            cond_pos_embed = np.zeros((abs(cond_len), embed_dim))
            cond_pos_embed[:, -pos_embed_dim:] += sincos
        cond_pos_embed += modality_pos_embed[idx]
        per_modality_embeds.append(cond_pos_embed)

    if not per_modality_embeds:
        return np.zeros((0, embed_dim))
    return np.concatenate(per_modality_embeds, axis=0)

View File

@ -0,0 +1,156 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT: https://github.com/facebookresearch/DiT
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
from collections import OrderedDict
import torch
import torch.nn as nn
from pathlib import Path
import sys, os
# Make the package root (two directory levels up) importable so that the
# `rdt.*` modules resolve even when this file is run as a script.
current_file = Path(__file__)
sys.path.append(str(current_file.parent.parent))
from rdt.blocks import (FinalLayer, RDTBlock, TimestepEmbedder, get_1d_sincos_pos_embed_from_grid,
                        get_multimodal_cond_pos_embed)
class RDT(nn.Module):
    """
    Class for Robotics Diffusion Transformers.

    A conditional transformer that denoises an action-token sequence, with
    blocks alternating between language and image cross-attention
    conditioning.
    """

    def __init__(self,
                 output_dim=128,
                 horizon=32,
                 hidden_size=1152,
                 depth=28,
                 num_heads=16,
                 max_lang_cond_len=1024,
                 img_cond_len=4096,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        # output_dim: per-token output size of the final layer.
        # horizon: number of action tokens to predict.
        # lang/img_pos_embed_config: optional multimodal position-embedding
        #   layouts consumed by get_multimodal_cond_pos_embed.
        super().__init__()
        self.horizon = horizon
        self.hidden_size = hidden_size
        self.max_lang_cond_len = max_lang_cond_len
        self.img_cond_len = img_cond_len
        self.dtype = dtype
        self.lang_pos_embed_config = lang_pos_embed_config
        self.img_pos_embed_config = img_pos_embed_config

        self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype)
        self.freq_embedder = TimestepEmbedder(hidden_size, dtype=dtype)

        # We will use trainable sin-cos embeddings.
        # horizon + 3 input tokens: [timestep; ctrl_freq; state; action*horizon]
        # (matches the OrderedDict in initialize_weights).
        self.x_pos_embed = nn.Parameter(torch.zeros(1, horizon + 3, hidden_size))
        # Language conditions
        self.lang_cond_pos_embed = nn.Parameter(torch.zeros(1, max_lang_cond_len, hidden_size))
        # Image conditions
        self.img_cond_pos_embed = nn.Parameter(torch.zeros(1, img_cond_len, hidden_size))

        self.blocks = nn.ModuleList([RDTBlock(hidden_size, num_heads) for _ in range(depth)])
        self.final_layer = FinalLayer(hidden_size, output_dim)
        self.initialize_weights()

    def initialize_weights(self):
        """Xavier-init linear layers, then overwrite positional/timestep/final
        weights with their dedicated initializations."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize (trainable) pos_embed by a fixed sin-cos embedding.
        x_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                    mm_cond_lens=OrderedDict([
                                                        ('timestep', 1),
                                                        ('ctrl_freq', 1),
                                                        ('state', 1),
                                                        ('action', self.horizon),
                                                    ]))
        self.x_pos_embed.data.copy_(torch.from_numpy(x_pos_embed).float().unsqueeze(0))

        # Language positions: plain 1-D sin-cos unless a layout is configured.
        if self.lang_pos_embed_config is None:
            lang_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size,
                                                                    torch.arange(self.max_lang_cond_len))
        else:
            lang_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                                mm_cond_lens=OrderedDict(self.lang_pos_embed_config),
                                                                embed_modality=False)
        self.lang_cond_pos_embed.data.copy_(torch.from_numpy(lang_cond_pos_embed).float().unsqueeze(0))

        # Image positions: same scheme as language.
        if self.img_pos_embed_config is None:
            img_cond_pos_embed = get_1d_sincos_pos_embed_from_grid(self.hidden_size, torch.arange(self.img_cond_len))
        else:
            img_cond_pos_embed = get_multimodal_cond_pos_embed(embed_dim=self.hidden_size,
                                                               mm_cond_lens=OrderedDict(self.img_pos_embed_config),
                                                               embed_modality=False)
        self.img_cond_pos_embed.data.copy_(torch.from_numpy(img_cond_pos_embed).float().unsqueeze(0))

        # Initialize timestep and control freq embedding MLP
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.freq_embedder.mlp[2].weight, std=0.02)

        # Initialize the final layer: zero-out the final linear layer
        # so the model starts by predicting zeros.
        nn.init.constant_(self.final_layer.ffn_final.fc2.weight, 0)
        nn.init.constant_(self.final_layer.ffn_final.fc2.bias, 0)

        # Move all the params to given data type:
        self.to(self.dtype)

    def forward(self, x, freq, t, lang_c, img_c, lang_mask=None, img_mask=None):
        """
        Forward pass of RDT.

        x: (B, T, D), state + action token sequence, T = horizon + 1,
            dimension D is assumed to be the same as the hidden size.
        freq: (B,), a scalar indicating control frequency.
        t: (B,) or (1,), diffusion timesteps.
        lang_c: (B, L_lang, D) or None, language condition tokens (variable length),
            dimension D is assumed to be the same as the hidden size.
        img_c: (B, L_img, D) or None, image condition tokens (fixed length),
            dimension D is assumed to be the same as the hidden size.
        lang_mask: (B, L_lang) or None, language condition mask (True for valid).
        img_mask: (B, L_img) or None, image condition mask (True for valid).
        """
        t = self.t_embedder(t).unsqueeze(1)  # (B, 1, D) or (1, 1, D)
        freq = self.freq_embedder(freq).unsqueeze(1)  # (B, 1, D)
        # Append timestep to the input tokens
        if t.shape[0] == 1:
            # A shared timestep embedding is broadcast across the batch.
            t = t.expand(x.shape[0], -1, -1)
        x = torch.cat([t, freq, x], dim=1)  # (B, T+2, D) = (B, horizon+3, D)

        # Add multimodal position embeddings
        x = x + self.x_pos_embed
        # Note the lang is of variable length
        lang_c = lang_c + self.lang_cond_pos_embed[:, :lang_c.shape[1]]
        img_c = img_c + self.img_cond_pos_embed

        # Forward pass: blocks alternate between the language condition
        # (even indices) and the image condition (odd indices).
        conds = [lang_c, img_c]
        masks = [lang_mask, img_mask]
        for i, block in enumerate(self.blocks):
            c, mask = conds[i % 2], masks[i % 2]
            x = block(x, c, mask)  # (B, T+2, D)
        # Inject the language condition at the final layer
        x = self.final_layer(x)  # (B, T+2, out_channels)

        # Only preserve the action tokens
        x = x[:, -self.horizon:]
        return x

View File

@ -0,0 +1,246 @@
import re, sys, os
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_dpmsolver_multistep import \
DPMSolverMultistepScheduler
from pathlib import Path
# Make this file's own directory importable so that sibling modules
# (`hub_mixin`, the `rdt/` package) resolve when run as a script.
current_file = Path(__file__)
sys.path.append(os.path.join(current_file.parent))
from hub_mixin import CompatiblePyTorchModelHubMixin
from rdt.model import RDT
class RDTRunner(nn.Module,
CompatiblePyTorchModelHubMixin,
repo_url="https://huggingface.co/robotics-diffusion-transformer/rdt-1b"):
    def __init__(self,
                 *,
                 action_dim,
                 pred_horizon,
                 config,
                 lang_token_dim,
                 img_token_dim,
                 state_token_dim,
                 max_lang_cond_len,
                 img_cond_len,
                 lang_pos_embed_config=None,
                 img_pos_embed_config=None,
                 dtype=torch.bfloat16):
        """Build the RDT diffusion backbone, condition adaptors and schedulers.

        action_dim: per-step action dimension (also the model output size).
        pred_horizon: number of future action steps to predict.
        config: dict with 'rdt', '{lang,img,state}_adaptor' and
            'noise_scheduler' sections.
        lang/img/state_token_dim: raw feature sizes of the incoming tokens.
        max_lang_cond_len / img_cond_len: token counts of the two condition
            streams (language is variable-length up to the max).
        """
        super(RDTRunner, self).__init__()
        # Create diffusion model
        hidden_size = config['rdt']['hidden_size']
        self.model = RDT(
            output_dim=action_dim,
            horizon=pred_horizon,
            hidden_size=hidden_size,
            depth=config['rdt']['depth'],
            num_heads=config['rdt']['num_heads'],
            max_lang_cond_len=max_lang_cond_len,
            img_cond_len=img_cond_len,
            lang_pos_embed_config=lang_pos_embed_config,
            img_pos_embed_config=img_pos_embed_config,
            dtype=dtype,
        )

        # Create adaptors projecting each conditional input to hidden_size
        self.lang_adaptor = self.build_condition_adapter(config['lang_adaptor'],
                                                         in_features=lang_token_dim,
                                                         out_features=hidden_size)
        self.img_adaptor = self.build_condition_adapter(config['img_adaptor'],
                                                        in_features=img_token_dim,
                                                        out_features=hidden_size)
        # A `state` refers to an action or a proprioception vector
        self.state_adaptor = self.build_condition_adapter(
            config['state_adaptor'],
            in_features=state_token_dim * 2,  # state + state mask (indicator)
            out_features=hidden_size)

        # Create the noise scheduler:
        # DDPM for the training forward process, multistep DPM-Solver for
        # faster inference-time sampling.
        noise_scheduler_config = config['noise_scheduler']
        self.noise_scheduler = DDPMScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
            clip_sample=noise_scheduler_config['clip_sample'],
        )
        self.noise_scheduler_sample = DPMSolverMultistepScheduler(
            num_train_timesteps=noise_scheduler_config['num_train_timesteps'],
            beta_schedule=noise_scheduler_config['beta_schedule'],
            prediction_type=noise_scheduler_config['prediction_type'],
        )

        self.num_train_timesteps = noise_scheduler_config['num_train_timesteps']
        self.num_inference_timesteps = noise_scheduler_config['num_inference_timesteps']
        self.prediction_type = noise_scheduler_config['prediction_type']

        self.pred_horizon = pred_horizon
        self.action_dim = action_dim

        print("Diffusion params: %e" %
              sum([p.numel() for p in self.model.parameters()] + [p.numel() for p in self.lang_adaptor.parameters()] +
                  [p.numel()
                   for p in self.img_adaptor.parameters()] + [p.numel() for p in self.state_adaptor.parameters()]))
def build_condition_adapter(self, projector_type, in_features, out_features):
projector = None
if projector_type == 'linear':
projector = nn.Linear(in_features, out_features)
else:
mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
if mlp_gelu_match:
mlp_depth = int(mlp_gelu_match.group(1))
modules = [nn.Linear(in_features, out_features)]
for _ in range(1, mlp_depth):
modules.append(nn.GELU(approximate="tanh"))
modules.append(nn.Linear(out_features, out_features))
projector = nn.Sequential(*modules)
if projector is None:
raise ValueError(f'Unknown projector type: {projector_type}')
return projector
def adapt_conditions(self, lang_tokens, img_tokens, state_tokens):
'''
lang_tokens: (batch_size, lang_len, lang_token_dim)
img_tokens: (batch_size, img_len, img_token_dim)
state_tokens: (batch_size, state_len, state_token_dim)
return: adpated (..., hidden_size) for all input tokens
'''
adpated_lang = self.lang_adaptor(lang_tokens)
adpated_img = self.img_adaptor(img_tokens)
adpated_state = self.state_adaptor(state_tokens)
return adpated_lang, adpated_img, adpated_state
    def conditional_sample(self, lang_cond, lang_attn_mask, img_cond, state_traj, action_mask, ctrl_freqs):
        '''
        Run the reverse diffusion process to sample an action trajectory.

        lang_cond: language conditional data, (batch_size, lang_len, hidden_size).
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_cond: image conditional data, (batch_size, img_len, hidden_size).
        state_traj: (batch_size, 1, hidden_size), state trajectory.
        action_mask: (batch_size, 1, action_dim), a 0-1 **float** tensor
            indicating the valid action dimensions.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: (batch_size, horizon, action_dim)
        '''
        device = state_traj.device
        dtype = state_traj.dtype
        # Start the reverse process from pure Gaussian noise.
        noisy_action = torch.randn(size=(state_traj.shape[0], self.pred_horizon, self.action_dim),
                                   dtype=dtype,
                                   device=device)
        # Broadcast the per-dimension mask over the whole horizon.
        action_mask = action_mask.expand(-1, self.pred_horizon, -1)

        # Set step values
        self.noise_scheduler_sample.set_timesteps(self.num_inference_timesteps)

        for t in self.noise_scheduler_sample.timesteps:
            # Prepare state-action trajectory: the mask is concatenated
            # feature-wise before adaptation (matching the state adaptor's
            # state+indicator input layout).
            action_traj = torch.cat([noisy_action, action_mask], dim=2)
            action_traj = self.state_adaptor(action_traj)
            state_action_traj = torch.cat([state_traj, action_traj], dim=1)

            # Predict the model output
            model_output = self.model(state_action_traj,
                                      ctrl_freqs,
                                      t.unsqueeze(-1).to(device),
                                      lang_cond,
                                      img_cond,
                                      lang_mask=lang_attn_mask)

            # Compute previous actions: x_t -> x_t-1
            noisy_action = self.noise_scheduler_sample.step(model_output, t, noisy_action).prev_sample
            # Scheduler output may be float32; cast back to the working dtype.
            noisy_action = noisy_action.to(state_traj.dtype)

        # Finally apply the action mask to mask invalid action dimensions
        noisy_action = noisy_action * action_mask
        return noisy_action
    # ========= Train ============
    def compute_loss(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_gt, action_mask,
                     ctrl_freqs) -> torch.Tensor:
        '''
        Diffusion training objective: noise the ground-truth actions at a
        random timestep and regress the model's prediction with MSE.

        lang_tokens: (batch_size, lang_len, lang_token_dim)
        lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
            which should be True-False bool tensor.
        img_tokens: (batch_size, img_len, img_token_dim)
        state_tokens: (batch_size, 1, state_token_dim)
        action_gt: (batch_size, horizon, state_token_dim), ground-truth actions for supervision
        action_mask: (batch_size, 1, state_token_dim), a 0-1 **float** tensor.
        ctrl_freqs: (batch_size,), control frequency for each sample.

        return: loss_value, a scalar tensor
        '''
        batch_size = lang_tokens.shape[0]
        device = lang_tokens.device
        # Sample noise that we'll add to the actions
        noise = torch.randn(action_gt.shape, dtype=action_gt.dtype, device=device)
        # Sample random diffusion timesteps
        timesteps = torch.randint(0, self.num_train_timesteps, (batch_size, ), device=device).long()
        # Add noise to the clean actions according to the noise magnitude at each timestep
        # (this is the forward diffusion process)
        noisy_action = self.noise_scheduler.add_noise(action_gt, noise, timesteps)
        # Concatenate the state and action tokens to form the input sequence
        state_action_traj = torch.cat([state_tokens, noisy_action], dim=1)
        # Append the action mask to the input sequence
        action_mask = action_mask.expand(-1, state_action_traj.shape[1], -1)
        state_action_traj = torch.cat([state_action_traj, action_mask], dim=2)
        # Align the dimension with the hidden size
        lang_cond, img_cond, state_action_traj = self.adapt_conditions(lang_tokens, img_tokens, state_action_traj)
        # Predict the denoised result
        pred = self.model(state_action_traj, ctrl_freqs, timesteps, lang_cond, img_cond, lang_mask=lang_attn_mask)
        # The regression target depends on the scheduler's parameterization.
        pred_type = self.prediction_type
        if pred_type == 'epsilon':
            target = noise
        elif pred_type == 'sample':
            target = action_gt
        else:
            raise ValueError(f"Unsupported prediction type {pred_type}")
        loss = F.mse_loss(pred, target)
        return loss
# ========= Inference ============
def predict_action(self, lang_tokens, lang_attn_mask, img_tokens, state_tokens, action_mask, ctrl_freqs):
'''
lang_tokens: (batch_size, lang_len, lang_token_dim)
lang_attn_mask: (batch_size, lang_len), a mask for valid language tokens,
which should be True-False bool tensor.
img_tokens: (batch_size, img_len, img_token_dim)
state_tokens: (batch_size, 1, state_token_dim)
action_mask: (batch_size, 1, action_dim),
which should be a 0-1 **float** tensor.
ctrl_freqs: (batch_size,), control frequency for each sample.
return: (batch_size, horizon, action_dim), predicted action sequence
'''
# Prepare the state and conditions
state_tokens = torch.cat([state_tokens, action_mask], dim=2)
lang_cond, img_cond, state_traj = self.adapt_conditions(lang_tokens, img_tokens, state_tokens)
# Run sampling
action_pred = self.conditional_sample(
lang_cond,
lang_attn_mask,
img_cond,
state_traj,
action_mask,
ctrl_freqs,
)
return action_pred
    def forward(self, *args, **kwargs) -> torch.Tensor:
        """Alias for :meth:`compute_loss` so the module can be called directly during training."""
        return self.compute_loss(*args, **kwargs)

View File

@ -0,0 +1,35 @@
numpy<2.0
packaging==24.0
wandb==0.17.0
deepspeed==0.14.2
accelerate==0.30.1
diffusers==0.27.2
timm==1.0.3
transformers==4.41.0
sentencepiece==0.2.0
h5py==3.11.0
opencv-python==4.9.0.80
imgaug==0.4.0
pytz==2022.1
huggingface_hub==0.23.0
torch==2.1.0
torchvision==0.16.0
pyzmq
msgpack
msgpack_numpy
zstandard
onnx
onnxruntime
onnxsim
# requirements_data.txt
# tfds-nightly==4.9.4.dev202402070044
gsutil==5.27
tensorflow==2.15.0.post1
pillow==10.2.0
pyyaml==6.0.1
tensorflow-graphics==2021.12.3
imageio==2.34.0
imageio-ffmpeg==0.4.9

View File

@ -0,0 +1,941 @@
#!/home/lin/software/miniconda3/envs/aloha/bin/python
# -- coding: UTF-8
"""
#!/usr/bin/python3
"""
import argparse
import sys
import threading
import time
import yaml
from collections import deque
import numpy as np
import rospy
import torch
from cv_bridge import CvBridge
from geometry_msgs.msg import Twist
from nav_msgs.msg import Odometry
from PIL import Image as PImage
from sensor_msgs.msg import Image, JointState
from std_msgs.msg import Header
import cv2
from scripts.agilex_model import create_model
# sys.path.append("./")
# Camera key order used throughout: [front/high, right wrist, left wrist].
CAMERA_NAMES = ["cam_high", "cam_right_wrist", "cam_left_wrist"]
observation_window = None  # deque(maxlen=2) of the two latest observations; built in update_observation_window
lang_embeddings = None  # pre-encoded language instruction, loaded in model_inference
# debug
preload_images = None
# Build the RDT policy from the YAML config referenced by the CLI arguments.
def make_policy(args):
    """Load args.config_path, stash it on ``args.config``, and create the model."""
    with open(args.config_path, "r") as fp:
        args.config = yaml.safe_load(fp)
    # pretrained_text_encoder_name_or_path = "google/t5-v1_1-xxl"
    vision_encoder_path = "google/siglip-so400m-patch14-384"
    return create_model(
        args=args.config,
        dtype=torch.bfloat16,
        pretrained=args.pretrained_model_name_or_path,
        # pretrained_text_encoder_name_or_path=pretrained_text_encoder_name_or_path,
        pretrained_vision_encoder_name_or_path=vision_encoder_path,
        control_frequency=args.ctrl_freq,
    )
def set_seed(seed):
    """Seed the NumPy and PyTorch RNGs for reproducible rollouts."""
    np.random.seed(seed)
    torch.manual_seed(seed)
# Densify large jumps between consecutive actions so the robot moves smoothly.
def interpolate_action(args, prev_action, cur_action):
    """Linearly interpolate from prev_action to cur_action.

    The number of intermediate steps is chosen so that no joint moves more
    than its per-step limit (args.arm_steps_length, duplicated for both arms).
    Returns an array of shape (n_steps, dim) ending exactly at cur_action.
    """
    per_joint_limit = np.tile(np.array(args.arm_steps_length), 2)
    n_steps = int(np.max(np.ceil(np.abs(cur_action - prev_action) / per_joint_limit)))
    if n_steps <= 1:
        # Already within limits: execute the target directly.
        return cur_action[np.newaxis, :]
    # Drop the first point (== prev_action, already executed).
    return np.linspace(prev_action, cur_action, n_steps + 1)[1:]
def get_config(args):
    """Collect the runtime settings the inference loop needs into one dict."""
    return {
        "episode_len": args.max_publish_step,
        "state_dim": 14,
        "chunk_size": args.chunk_size,
        "camera_names": CAMERA_NAMES,
    }
# Get the observation from the ROS topic
def get_ros_observation(args, ros_operator):
    """Block until a time-synchronized frame is available.

    Polls ros_operator.get_frame() at args.publish_rate until it succeeds,
    then returns (img_front, img_left, img_right, puppet_arm_left,
    puppet_arm_right); depth images and base odometry are discarded here.
    """
    rate = rospy.Rate(args.publish_rate)
    # print_flag throttles the failure message to once per sync outage.
    print_flag = True
    while True and not rospy.is_shutdown():
        result = ros_operator.get_frame()
        if not result:
            if print_flag:
                print("syn fail when get_ros_observation")
                print_flag = False
            rate.sleep()
            continue
        print_flag = True
        (
            img_front,
            img_left,
            img_right,
            img_front_depth,
            img_left_depth,
            img_right_depth,
            puppet_arm_left,
            puppet_arm_right,
            robot_base,
        ) = result
        # print(f"sync success when get_ros_observation")
        return (img_front, img_left, img_right, puppet_arm_left, puppet_arm_right)
# Update the observation window buffer
def update_observation_window(args, config, ros_operator):
    """Fetch one synchronized frame and append it to the 2-slot window.

    The policy consumes the two most recent observations; on the first call a
    dummy (all-None) entry is inserted so the window is immediately full.
    """
    # JPEG transformation
    # Align with training
    def jpeg_mapping(img):
        # Round-trip through JPEG so inference images carry the same
        # compression artifacts the model saw during training.
        img = cv2.imencode(".jpg", img)[1].tobytes()
        img = cv2.imdecode(np.frombuffer(img, np.uint8), cv2.IMREAD_COLOR)
        return img
    global observation_window
    if observation_window is None:
        observation_window = deque(maxlen=2)
        # Append the first dummy image
        observation_window.append({
            "qpos": None,
            "images": {
                config["camera_names"][0]: None,
                config["camera_names"][1]: None,
                config["camera_names"][2]: None,
            },
        })
    img_front, img_left, img_right, puppet_arm_left, puppet_arm_right = (get_ros_observation(args, ros_operator))
    img_front = jpeg_mapping(img_front)
    img_left = jpeg_mapping(img_left)
    img_right = jpeg_mapping(img_right)
    # 14-dim proprioception: left-arm joints followed by right-arm joints.
    qpos = np.concatenate(
        (np.array(puppet_arm_left.position), np.array(puppet_arm_right.position)),
        axis=0,
    )
    qpos = torch.from_numpy(qpos).float().cuda()
    # Key order matches CAMERA_NAMES: high/front, right wrist, left wrist.
    observation_window.append({
        "qpos": qpos,
        "images": {
            config["camera_names"][0]: img_front,
            config["camera_names"][1]: img_right,
            config["camera_names"][2]: img_left,
        },
    })
# RDT inference
def inference_fn(args, config, policy, t):
    """Run one policy forward pass on the current observation window.

    Returns a (chunk_size, 14) numpy action chunk. Note the while-loop always
    returns on its first successful iteration; it only re-loops on shutdown.
    """
    global observation_window
    global lang_embeddings
    # print(f"Start inference_thread_fn: t={t}")
    while True and not rospy.is_shutdown():
        time1 = time.time()
        # fetch images in sequence [front, right, left], previous then current frame
        image_arrs = [
            observation_window[-2]["images"][config["camera_names"][0]],
            observation_window[-2]["images"][config["camera_names"][1]],
            observation_window[-2]["images"][config["camera_names"][2]],
            observation_window[-1]["images"][config["camera_names"][0]],
            observation_window[-1]["images"][config["camera_names"][1]],
            observation_window[-1]["images"][config["camera_names"][2]],
        ]
        # fetch debug images in sequence [front, right, left]
        # image_arrs = [
        #     preload_images[config['camera_names'][0]][max(t - 1, 0)],
        #     preload_images[config['camera_names'][2]][max(t - 1, 0)],
        #     preload_images[config['camera_names'][1]][max(t - 1, 0)],
        #     preload_images[config['camera_names'][0]][t],
        #     preload_images[config['camera_names'][2]][t],
        #     preload_images[config['camera_names'][1]][t]
        # ]
        # # encode the images
        # for i in range(len(image_arrs)):
        #     image_arrs[i] = cv2.imdecode(np.frombuffer(image_arrs[i], np.uint8), cv2.IMREAD_COLOR)
        # proprio = torch.from_numpy(preload_images['qpos'][t]).float().cuda()
        # None entries (first dummy frame) are passed through unchanged.
        images = [PImage.fromarray(arr) if arr is not None else None for arr in image_arrs]
        # for i, pos in enumerate(['f', 'r', 'l'] * 2):
        #     images[i].save(f'{t}-{i}-{pos}.png')
        # get last qpos in shape [14, ]
        proprio = observation_window[-1]["qpos"]
        # unsqueeze to [1, 14]
        proprio = proprio.unsqueeze(0)
        # actions shaped as [1, 64, 14] in format [left, right]
        actions = (policy.step(proprio=proprio, images=images, text_embeds=lang_embeddings).squeeze(0).cpu().numpy())
        # print(f"inference_actions: {actions.squeeze()}")
        # print(f"Model inference time: {time.time() - time1} s")
        # print(f"Finish inference_thread_fn: t={t}")
        return actions
# Main loop for the manipulation task
def model_inference(args, config, ros_operator):
    """Load the policy, home the arms, then run the sense-infer-act loop.

    One model inference every ``chunk_size`` steps yields an action chunk;
    each step's action is (optionally) interpolated against the previous one
    and published at args.publish_rate.
    """
    global lang_embeddings
    # Load rdt model
    policy = make_policy(args)
    lang_dict = torch.load(args.lang_embeddings_path)
    print(f"Running with instruction: \"{lang_dict['instruction']}\" from \"{lang_dict['name']}\"")
    lang_embeddings = lang_dict["embeddings"]
    max_publish_step = config["episode_len"]
    chunk_size = config["chunk_size"]
    # Initialize position of the puppet arm
    # Seven values per arm; the last entry presumably drives the gripper
    # (3.55 ~ open, -0.34 ~ closed) — TODO confirm against the robot spec.
    left0 = [
        -0.00133514404296875,
        0.00209808349609375,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        3.557830810546875,
    ]
    right0 = [
        -0.00133514404296875,
        0.00438690185546875,
        0.034523963928222656,
        -0.053597450256347656,
        -0.00476837158203125,
        -0.00209808349609375,
        3.557830810546875,
    ]
    left1 = [
        -0.00133514404296875,
        0.00209808349609375,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3393220901489258,
    ]
    right1 = [
        -0.00133514404296875,
        0.00247955322265625,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3397035598754883,
    ]
    ros_operator.puppet_arm_publish_continuous(left0, right0)
    input("Press enter to continue")
    ros_operator.puppet_arm_publish_continuous(left1, right1)
    # Initialize the previous action to be the initial robot state
    pre_action = np.zeros(config["state_dim"])
    pre_action[:14] = np.array([
        -0.00133514404296875,
        0.00209808349609375,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3393220901489258,
    ] + [
        -0.00133514404296875,
        0.00247955322265625,
        0.01583099365234375,
        -0.032616615295410156,
        -0.00286102294921875,
        0.00095367431640625,
        -0.3397035598754883,
    ])
    action = None
    # Inference loop
    with torch.inference_mode():
        while True and not rospy.is_shutdown():
            # The current time step
            t = 0
            rate = rospy.Rate(args.publish_rate)
            action_buffer = np.zeros([chunk_size, config["state_dim"]])
            while t < max_publish_step and not rospy.is_shutdown():
                # Update observation window
                update_observation_window(args, config, ros_operator)
                # When coming to the end of the action chunk
                if t % chunk_size == 0:
                    # Start inference
                    action_buffer = inference_fn(args, config, policy, t).copy()
                raw_action = action_buffer[t % chunk_size]
                action = raw_action
                # Interpolate the original action sequence
                if args.use_actions_interpolation:
                    # print(f"Time {t}, pre {pre_action}, act {action}")
                    interp_actions = interpolate_action(args, pre_action, action)
                else:
                    interp_actions = action[np.newaxis, :]
                # Execute the interpolated actions one by one
                for act in interp_actions:
                    left_action = act[:7]
                    right_action = act[7:14]
                    if not args.disable_puppet_arm:
                        ros_operator.puppet_arm_publish(left_action,
                                                        right_action)  # puppet_arm_publish_continuous_thread
                    if args.use_robot_base:
                        vel_action = act[14:16]
                        ros_operator.robot_base_publish(vel_action)
                    rate.sleep()
                    # print(f"doing action: {act}")
                t += 1
                print("Published Step", t)
                pre_action = action.copy()
# ROS operator class
class RosOperator:
    """Buffers ROS sensor topics into bounded deques, time-aligns them into
    observation frames, and publishes arm / mobile-base commands."""
    def __init__(self, args):
        # All deques, publishers, and locks are created in init()/init_ros().
        self.robot_base_deque = None
        self.puppet_arm_right_deque = None
        self.puppet_arm_left_deque = None
        self.img_front_deque = None
        self.img_right_deque = None
        self.img_left_deque = None
        self.img_front_depth_deque = None
        self.img_right_depth_deque = None
        self.img_left_depth_deque = None
        self.bridge = None
        self.puppet_arm_left_publisher = None
        self.puppet_arm_right_publisher = None
        self.robot_base_publisher = None
        self.puppet_arm_publish_thread = None
        self.puppet_arm_publish_lock = None
        self.args = args
        self.init()
        self.init_ros()
    def init(self):
        """Create the message buffers and the publish lock (held by default)."""
        self.bridge = CvBridge()
        self.img_left_deque = deque()
        self.img_right_deque = deque()
        self.img_front_deque = deque()
        self.img_left_depth_deque = deque()
        self.img_right_depth_deque = deque()
        self.img_front_depth_deque = deque()
        self.puppet_arm_left_deque = deque()
        self.puppet_arm_right_deque = deque()
        self.robot_base_deque = deque()
        self.puppet_arm_publish_lock = threading.Lock()
        # Acquire immediately: puppet_arm_publish_continuous() treats a
        # successful non-blocking acquire as a stop signal, so the lock must
        # stay held until puppet_arm_publish_continuous_thread() releases it.
        self.puppet_arm_publish_lock.acquire()
    def puppet_arm_publish(self, left, right):
        """Publish one joint-state command to each arm (7 joints per arm)."""
        joint_state_msg = JointState()
        joint_state_msg.header = Header()
        joint_state_msg.header.stamp = rospy.Time.now()  # Set timestep
        joint_state_msg.name = [
            "joint0",
            "joint1",
            "joint2",
            "joint3",
            "joint4",
            "joint5",
            "joint6",
        ]  # Set joint names
        # NOTE: the same message object is reused for both arms; only
        # `position` differs between the two publishes.
        joint_state_msg.position = left
        self.puppet_arm_left_publisher.publish(joint_state_msg)
        joint_state_msg.position = right
        self.puppet_arm_right_publisher.publish(joint_state_msg)
    def robot_base_publish(self, vel):
        """Publish a base velocity command; vel = (linear_x, angular_z)."""
        vel_msg = Twist()
        vel_msg.linear.x = vel[0]
        vel_msg.linear.y = 0
        vel_msg.linear.z = 0
        vel_msg.angular.x = 0
        vel_msg.angular.y = 0
        vel_msg.angular.z = vel[1]
        self.robot_base_publisher.publish(vel_msg)
    def puppet_arm_publish_continuous(self, left, right):
        """Ramp both arms from their current poses to the targets, moving at
        most args.arm_steps_length per joint per tick."""
        rate = rospy.Rate(self.args.publish_rate)
        left_arm = None
        right_arm = None
        # Wait until at least one feedback message per arm has arrived.
        while True and not rospy.is_shutdown():
            if len(self.puppet_arm_left_deque) != 0:
                left_arm = list(self.puppet_arm_left_deque[-1].position)
            if len(self.puppet_arm_right_deque) != 0:
                right_arm = list(self.puppet_arm_right_deque[-1].position)
            if left_arm is None or right_arm is None:
                rate.sleep()
                continue
            else:
                break
        # Per-joint direction of travel (+1 / -1).
        left_symbol = [1 if left[i] - left_arm[i] > 0 else -1 for i in range(len(left))]
        right_symbol = [1 if right[i] - right_arm[i] > 0 else -1 for i in range(len(right))]
        flag = True
        step = 0
        while flag and not rospy.is_shutdown():
            # A successful non-blocking acquire means another thread asked us to stop.
            if self.puppet_arm_publish_lock.acquire(False):
                return
            left_diff = [abs(left[i] - left_arm[i]) for i in range(len(left))]
            right_diff = [abs(right[i] - right_arm[i]) for i in range(len(right))]
            # flag stays False once every joint has reached its target.
            flag = False
            for i in range(len(left)):
                if left_diff[i] < self.args.arm_steps_length[i]:
                    left_arm[i] = left[i]
                else:
                    left_arm[i] += left_symbol[i] * self.args.arm_steps_length[i]
                    flag = True
            for i in range(len(right)):
                if right_diff[i] < self.args.arm_steps_length[i]:
                    right_arm[i] = right[i]
                else:
                    right_arm[i] += right_symbol[i] * self.args.arm_steps_length[i]
                    flag = True
            joint_state_msg = JointState()
            joint_state_msg.header = Header()
            joint_state_msg.header.stamp = rospy.Time.now()  # Set the timestep
            joint_state_msg.name = [
                "joint0",
                "joint1",
                "joint2",
                "joint3",
                "joint4",
                "joint5",
                "joint6",
            ]  # Set joint names
            joint_state_msg.position = left_arm
            self.puppet_arm_left_publisher.publish(joint_state_msg)
            joint_state_msg.position = right_arm
            self.puppet_arm_right_publisher.publish(joint_state_msg)
            step += 1
            print("puppet_arm_publish_continuous:", step)
            rate.sleep()
    def puppet_arm_publish_linear(self, left, right):
        """Drive both arms to the targets along a fixed 100-point linear
        trajectory published at 200 Hz."""
        num_step = 100
        rate = rospy.Rate(200)
        left_arm = None
        right_arm = None
        # Wait until at least one feedback message per arm has arrived.
        while True and not rospy.is_shutdown():
            if len(self.puppet_arm_left_deque) != 0:
                left_arm = list(self.puppet_arm_left_deque[-1].position)
            if len(self.puppet_arm_right_deque) != 0:
                right_arm = list(self.puppet_arm_right_deque[-1].position)
            if left_arm is None or right_arm is None:
                rate.sleep()
                continue
            else:
                break
        traj_left_list = np.linspace(left_arm, left, num_step)
        traj_right_list = np.linspace(right_arm, right, num_step)
        for i in range(len(traj_left_list)):
            traj_left = traj_left_list[i]
            traj_right = traj_right_list[i]
            # Hold the last joint at its final target for the whole trajectory.
            traj_left[-1] = left[-1]
            traj_right[-1] = right[-1]
            joint_state_msg = JointState()
            joint_state_msg.header = Header()
            joint_state_msg.header.stamp = rospy.Time.now()  # Set the timestamp
            joint_state_msg.name = [
                "joint0",
                "joint1",
                "joint2",
                "joint3",
                "joint4",
                "joint5",
                "joint6",
            ]  # Set joint names
            joint_state_msg.position = traj_left
            self.puppet_arm_left_publisher.publish(joint_state_msg)
            joint_state_msg.position = traj_right
            self.puppet_arm_right_publisher.publish(joint_state_msg)
            rate.sleep()
    def puppet_arm_publish_continuous_thread(self, left, right):
        """Restart the background ramping thread toward new targets."""
        if self.puppet_arm_publish_thread is not None:
            # Release the lock so the running ramp loop sees the stop signal,
            # then re-acquire it (non-blocking) for the next thread.
            self.puppet_arm_publish_lock.release()
            self.puppet_arm_publish_thread.join()
            self.puppet_arm_publish_lock.acquire(False)
            self.puppet_arm_publish_thread = None
        self.puppet_arm_publish_thread = threading.Thread(target=self.puppet_arm_publish_continuous, args=(left, right))
        self.puppet_arm_publish_thread.start()
    def get_frame(self):
        """Pop one time-aligned observation frame from all buffers.

        Aligns every stream to the earliest of the newest timestamps across
        the required topics, drops stale messages, and returns the 9-tuple
        (img_front, img_left, img_right, img_front_depth, img_left_depth,
        img_right_depth, puppet_arm_left, puppet_arm_right, robot_base) with
        None for unused entries. Returns False when alignment is impossible.
        """
        if (len(self.img_left_deque) == 0 or len(self.img_right_deque) == 0 or len(self.img_front_deque) == 0 or
            (self.args.use_depth_image and (len(self.img_left_depth_deque) == 0 or len(self.img_right_depth_deque) == 0
                                            or len(self.img_front_depth_deque) == 0))):
            return False
        # Common frame time = earliest "latest" timestamp among required topics.
        if self.args.use_depth_image:
            frame_time = min([
                self.img_left_deque[-1].header.stamp.to_sec(),
                self.img_right_deque[-1].header.stamp.to_sec(),
                self.img_front_deque[-1].header.stamp.to_sec(),
                self.img_left_depth_deque[-1].header.stamp.to_sec(),
                self.img_right_depth_deque[-1].header.stamp.to_sec(),
                self.img_front_depth_deque[-1].header.stamp.to_sec(),
            ])
        else:
            frame_time = min([
                self.img_left_deque[-1].header.stamp.to_sec(),
                self.img_right_deque[-1].header.stamp.to_sec(),
                self.img_front_deque[-1].header.stamp.to_sec(),
            ])
        # Every stream must have a message at/after frame_time; otherwise bail.
        if (len(self.img_left_deque) == 0 or self.img_left_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if (len(self.img_right_deque) == 0 or self.img_right_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if (len(self.img_front_deque) == 0 or self.img_front_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if (len(self.puppet_arm_left_deque) == 0 or self.puppet_arm_left_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if (len(self.puppet_arm_right_deque) == 0
                or self.puppet_arm_right_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if self.args.use_depth_image and (len(self.img_left_depth_deque) == 0
                                          or self.img_left_depth_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if self.args.use_depth_image and (len(self.img_right_depth_deque) == 0
                                          or self.img_right_depth_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if self.args.use_depth_image and (len(self.img_front_depth_deque) == 0
                                          or self.img_front_depth_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        if self.args.use_robot_base and (len(self.robot_base_deque) == 0
                                         or self.robot_base_deque[-1].header.stamp.to_sec() < frame_time):
            return False
        # Discard messages older than frame_time, then pop the aligned one.
        while self.img_left_deque[0].header.stamp.to_sec() < frame_time:
            self.img_left_deque.popleft()
        img_left = self.bridge.imgmsg_to_cv2(self.img_left_deque.popleft(), "passthrough")
        while self.img_right_deque[0].header.stamp.to_sec() < frame_time:
            self.img_right_deque.popleft()
        img_right = self.bridge.imgmsg_to_cv2(self.img_right_deque.popleft(), "passthrough")
        while self.img_front_deque[0].header.stamp.to_sec() < frame_time:
            self.img_front_deque.popleft()
        img_front = self.bridge.imgmsg_to_cv2(self.img_front_deque.popleft(), "passthrough")
        while self.puppet_arm_left_deque[0].header.stamp.to_sec() < frame_time:
            self.puppet_arm_left_deque.popleft()
        puppet_arm_left = self.puppet_arm_left_deque.popleft()
        while self.puppet_arm_right_deque[0].header.stamp.to_sec() < frame_time:
            self.puppet_arm_right_deque.popleft()
        puppet_arm_right = self.puppet_arm_right_deque.popleft()
        img_left_depth = None
        if self.args.use_depth_image:
            while self.img_left_depth_deque[0].header.stamp.to_sec() < frame_time:
                self.img_left_depth_deque.popleft()
            img_left_depth = self.bridge.imgmsg_to_cv2(self.img_left_depth_deque.popleft(), "passthrough")
        img_right_depth = None
        if self.args.use_depth_image:
            while self.img_right_depth_deque[0].header.stamp.to_sec() < frame_time:
                self.img_right_depth_deque.popleft()
            img_right_depth = self.bridge.imgmsg_to_cv2(self.img_right_depth_deque.popleft(), "passthrough")
        img_front_depth = None
        if self.args.use_depth_image:
            while self.img_front_depth_deque[0].header.stamp.to_sec() < frame_time:
                self.img_front_depth_deque.popleft()
            img_front_depth = self.bridge.imgmsg_to_cv2(self.img_front_depth_deque.popleft(), "passthrough")
        robot_base = None
        if self.args.use_robot_base:
            while self.robot_base_deque[0].header.stamp.to_sec() < frame_time:
                self.robot_base_deque.popleft()
            robot_base = self.robot_base_deque.popleft()
        return (
            img_front,
            img_left,
            img_right,
            img_front_depth,
            img_left_depth,
            img_right_depth,
            puppet_arm_left,
            puppet_arm_right,
            robot_base,
        )
    # Subscriber callbacks: buffer the latest 2000 messages per topic.
    def img_left_callback(self, msg):
        if len(self.img_left_deque) >= 2000:
            self.img_left_deque.popleft()
        self.img_left_deque.append(msg)
    def img_right_callback(self, msg):
        if len(self.img_right_deque) >= 2000:
            self.img_right_deque.popleft()
        self.img_right_deque.append(msg)
    def img_front_callback(self, msg):
        if len(self.img_front_deque) >= 2000:
            self.img_front_deque.popleft()
        self.img_front_deque.append(msg)
    def img_left_depth_callback(self, msg):
        if len(self.img_left_depth_deque) >= 2000:
            self.img_left_depth_deque.popleft()
        self.img_left_depth_deque.append(msg)
    def img_right_depth_callback(self, msg):
        if len(self.img_right_depth_deque) >= 2000:
            self.img_right_depth_deque.popleft()
        self.img_right_depth_deque.append(msg)
    def img_front_depth_callback(self, msg):
        if len(self.img_front_depth_deque) >= 2000:
            self.img_front_depth_deque.popleft()
        self.img_front_depth_deque.append(msg)
    def puppet_arm_left_callback(self, msg):
        if len(self.puppet_arm_left_deque) >= 2000:
            self.puppet_arm_left_deque.popleft()
        self.puppet_arm_left_deque.append(msg)
    def puppet_arm_right_callback(self, msg):
        if len(self.puppet_arm_right_deque) >= 2000:
            self.puppet_arm_right_deque.popleft()
        self.puppet_arm_right_deque.append(msg)
    def robot_base_callback(self, msg):
        if len(self.robot_base_deque) >= 2000:
            self.robot_base_deque.popleft()
        self.robot_base_deque.append(msg)
    def init_ros(self):
        """Register all topic subscribers and the command publishers."""
        rospy.init_node("joint_state_publisher", anonymous=True)
        rospy.Subscriber(
            self.args.img_left_topic,
            Image,
            self.img_left_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        rospy.Subscriber(
            self.args.img_right_topic,
            Image,
            self.img_right_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        rospy.Subscriber(
            self.args.img_front_topic,
            Image,
            self.img_front_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        if self.args.use_depth_image:
            rospy.Subscriber(
                self.args.img_left_depth_topic,
                Image,
                self.img_left_depth_callback,
                queue_size=1000,
                tcp_nodelay=True,
            )
            rospy.Subscriber(
                self.args.img_right_depth_topic,
                Image,
                self.img_right_depth_callback,
                queue_size=1000,
                tcp_nodelay=True,
            )
            rospy.Subscriber(
                self.args.img_front_depth_topic,
                Image,
                self.img_front_depth_callback,
                queue_size=1000,
                tcp_nodelay=True,
            )
        rospy.Subscriber(
            self.args.puppet_arm_left_topic,
            JointState,
            self.puppet_arm_left_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        rospy.Subscriber(
            self.args.puppet_arm_right_topic,
            JointState,
            self.puppet_arm_right_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        rospy.Subscriber(
            self.args.robot_base_topic,
            Odometry,
            self.robot_base_callback,
            queue_size=1000,
            tcp_nodelay=True,
        )
        self.puppet_arm_left_publisher = rospy.Publisher(self.args.puppet_arm_left_cmd_topic, JointState, queue_size=10)
        self.puppet_arm_right_publisher = rospy.Publisher(self.args.puppet_arm_right_cmd_topic,
                                                          JointState,
                                                          queue_size=10)
        self.robot_base_publisher = rospy.Publisher(self.args.robot_base_cmd_topic, Twist, queue_size=10)
def get_arguments():
    """Parse the command-line options for the RDT-on-AgileX inference script.

    Returns:
        argparse.Namespace: publishing rates, ROS topic names, model and
        language-embedding paths, and behavior flags.
    """
    parser = argparse.ArgumentParser()
    # Stepping / reproducibility
    parser.add_argument("--max_publish_step", type=int, default=10000,
                        help="Maximum number of action publishing steps")
    parser.add_argument("--seed", type=int, default=None, help="Random seed")
    # Camera color topics
    parser.add_argument("--img_front_topic", type=str,
                        default="/camera_f/color/image_raw", help="img_front_topic")
    parser.add_argument("--img_left_topic", type=str,
                        default="/camera_l/color/image_raw", help="img_left_topic")
    parser.add_argument("--img_right_topic", type=str,
                        default="/camera_r/color/image_raw", help="img_right_topic")
    # Camera depth topics (only consumed with --use_depth_image)
    parser.add_argument("--img_front_depth_topic", type=str,
                        default="/camera_f/depth/image_raw", help="img_front_depth_topic")
    parser.add_argument("--img_left_depth_topic", type=str,
                        default="/camera_l/depth/image_raw", help="img_left_depth_topic")
    parser.add_argument("--img_right_depth_topic", type=str,
                        default="/camera_r/depth/image_raw", help="img_right_depth_topic")
    # Arm command (publish) and feedback (subscribe) topics
    parser.add_argument("--puppet_arm_left_cmd_topic", type=str,
                        default="/master/joint_left", help="puppet_arm_left_cmd_topic")
    parser.add_argument("--puppet_arm_right_cmd_topic", type=str,
                        default="/master/joint_right", help="puppet_arm_right_cmd_topic")
    parser.add_argument("--puppet_arm_left_topic", type=str,
                        default="/puppet/joint_left", help="puppet_arm_left_topic")
    parser.add_argument("--puppet_arm_right_topic", type=str,
                        default="/puppet/joint_right", help="puppet_arm_right_topic")
    # Mobile-base topics
    parser.add_argument("--robot_base_topic", type=str,
                        default="/odom_raw", help="robot_base_topic")
    # Fixed: help text previously read "robot_base_topic" (copy-paste error).
    parser.add_argument("--robot_base_cmd_topic", type=str,
                        default="/cmd_vel", help="robot_base_cmd_topic")
    parser.add_argument("--use_robot_base", action="store_true", default=False,
                        help="Whether to use the robot base to move around")
    # Rates and chunking
    parser.add_argument("--publish_rate", type=int, default=30,
                        help="The rate at which to publish the actions")
    parser.add_argument("--ctrl_freq", type=int, default=25,
                        help="The control frequency of the robot")
    parser.add_argument("--chunk_size", type=int, default=64,
                        help="Action chunk size")
    # nargs="+" added: previously type=float with a list default could not be
    # overridden from the CLI in a way consistent with the default's shape.
    parser.add_argument("--arm_steps_length", type=float, nargs="+",
                        default=[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.2],
                        help="The maximum change allowed for each joint per timestep")
    parser.add_argument("--use_actions_interpolation", action="store_true", default=False,
                        help="Whether to interpolate the actions if the difference is too large")
    parser.add_argument("--use_depth_image", action="store_true", default=False,
                        help="Whether to use depth images")
    parser.add_argument("--disable_puppet_arm", action="store_true", default=False,
                        help="Whether to disable the puppet arm. This is useful for safely debugging")
    parser.add_argument("--config_path", type=str, default="configs/base.yaml",
                        help="Path to the config file")
    # parser.add_argument('--cfg_scale', type=float, default=2.0,
    #                     help='the scaling factor used to modify the magnitude of the control features during denoising')
    parser.add_argument("--pretrained_model_name_or_path", type=str, required=True,
                        help="Name or path to the pretrained model")
    parser.add_argument("--lang_embeddings_path", type=str, required=True,
                        help="Path to the pre-encoded language instruction embeddings")
    args = parser.parse_args()
    return args
def main():
    """Script entry point: parse args, bring up ROS, and run the inference loop."""
    args = get_arguments()
    operator = RosOperator(args)
    if args.seed is not None:
        set_seed(args.seed)
    model_inference(args, get_config(args), operator)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,315 @@
import os, sys
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
from configs.state_vec import STATE_VEC_IDX_MAPPING
from pathlib import Path
# Make the repo's ``models`` package importable when this file runs as a script.
# (Previously this append was duplicated, polluting sys.path with the same entry twice.)
current_file = Path(__file__)
sys.path.append(os.path.join(current_file.parent.parent, "models"))
from multimodal_encoder.siglip_encoder import SiglipVisionTower
from multimodal_encoder.t5_encoder import T5Embedder
from rdt_runner import RDTRunner
# Indices in the unified state vector that hold the right-arm joint positions
# (6 DoF) for the AgileX setup; all other dimensions stay zero / masked out.
AGILEX_STATE_INDICES = [
    STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"] for i in range(6)
]
# Create the RDT model
def create_model(args, **kwargs):
    """Instantiate the RDT wrapper and, when given a local checkpoint file,
    load its weights into the policy."""
    model = RoboticDiffusionTransformerModel(args, **kwargs)
    ckpt_path = kwargs.get("pretrained", None)
    if ckpt_path is not None and os.path.isfile(ckpt_path):
        model.load_pretrained_weights(ckpt_path)
    return model
class RoboticDiffusionTransformerModel(object):
"""A wrapper for the RDT model, which handles
1. Model initialization
2. Encodings of instructions
3. Model inference
"""
    def __init__(
        self,
        args,
        device="cuda",
        dtype=torch.bfloat16,
        image_size=None,
        control_frequency=25,
        pretrained=None,
        pretrained_vision_encoder_name_or_path=None,
    ):
        """Store the config, build the vision encoder and policy, and move them to the device.

        Args:
            args: config dict with "common", "model", and "dataset" sections (see get_policy).
            device: torch device string used by reset().
            dtype: weight/activation dtype.
            image_size: stored but not used in this class — TODO confirm consumer.
            control_frequency: robot control frequency, stored for inference.
            pretrained: checkpoint path or model id, forwarded to get_policy.
            pretrained_vision_encoder_name_or_path: SigLIP model id or path.
        """
        self.args = args
        self.dtype = dtype
        self.image_size = image_size
        self.device = device
        self.control_frequency = control_frequency
        # We do not use the text encoder due to limited GPU memory
        # self.text_tokenizer, self.text_model = self.get_text_encoder(pretrained_text_encoder_name_or_path)
        self.image_processor, self.vision_model = self.get_vision_encoder(pretrained_vision_encoder_name_or_path)
        self.policy = self.get_policy(pretrained)
        self.reset()
    def get_policy(self, pretrained):
        """Initialize the model.

        Builds an RDTRunner from scratch when ``pretrained`` is None or a
        local checkpoint file (weights are loaded separately); otherwise
        ``pretrained`` is treated as an id/directory for from_pretrained.
        """
        # Initialize model with arguments
        if pretrained is None or os.path.isfile(pretrained):
            # Image-condition length = history x cameras x ViT patches.
            img_cond_len = (self.args["common"]["img_history_size"] * self.args["common"]["num_cameras"] *
                            self.vision_model.num_patches)
            _model = RDTRunner(
                action_dim=self.args["common"]["state_dim"],
                pred_horizon=self.args["common"]["action_chunk_size"],
                config=self.args["model"],
                lang_token_dim=self.args["model"]["lang_token_dim"],
                img_token_dim=self.args["model"]["img_token_dim"],
                state_token_dim=self.args["model"]["state_token_dim"],
                max_lang_cond_len=self.args["dataset"]["tokenizer_max_length"],
                img_cond_len=img_cond_len,
                img_pos_embed_config=[
                    # No initial pos embed in the last grid size
                    # since we've already done in ViT
                    (
                        "image",
                        (
                            self.args["common"]["img_history_size"],
                            self.args["common"]["num_cameras"],
                            # Negative size presumably signals "skip pos-embed
                            # init" for this axis — TODO confirm in RDTRunner.
                            -self.vision_model.num_patches,
                        ),
                    ),
                ],
                lang_pos_embed_config=[
                    # Similarly, no initial pos embed for language
                    ("lang", -self.args["dataset"]["tokenizer_max_length"]),
                ],
                dtype=self.dtype,
            )
        else:
            _model = RDTRunner.from_pretrained(pretrained)
        return _model
    def get_text_encoder(self, pretrained_text_encoder_name_or_path):
        """Build the T5 embedder and return its (tokenizer, encoder) pair.

        Unused by default — the call site in __init__ is commented out to save
        GPU memory; instructions are pre-encoded offline instead.
        """
        text_embedder = T5Embedder(
            from_pretrained=pretrained_text_encoder_name_or_path,
            model_max_length=self.args["dataset"]["tokenizer_max_length"],
            device=self.device,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
        return tokenizer, text_encoder
    def get_vision_encoder(self, pretrained_vision_encoder_name_or_path):
        """Build the SigLIP vision tower; returns (image_processor, vision_encoder)."""
        vision_encoder = SiglipVisionTower(vision_tower=pretrained_vision_encoder_name_or_path, args=None)
        image_processor = vision_encoder.image_processor
        return image_processor, vision_encoder
    def reset(self):
        """Set model to evaluation mode and move weights to the target device/dtype."""
        device = self.device
        weight_dtype = self.dtype
        self.policy.eval()
        # self.text_model.eval()
        self.vision_model.eval()
        self.policy = self.policy.to(device, dtype=weight_dtype)
        # self.text_model = self.text_model.to(device, dtype=weight_dtype)
        self.vision_model = self.vision_model.to(device, dtype=weight_dtype)
def load_pretrained_weights(self, pretrained=None):
if pretrained is None:
return
print(f"Loading weights from {pretrained}")
filename = os.path.basename(pretrained)
if filename.endswith(".pt"):
checkpoint = torch.load(pretrained)
self.policy.load_state_dict(checkpoint["module"])
elif filename.endswith(".safetensors"):
from safetensors.torch import load_model
load_model(self.policy, pretrained)
else:
raise NotImplementedError(f"Unknown checkpoint format: {pretrained}")
def encode_instruction(self, instruction, device="cuda"):
"""Encode string instruction to latent embeddings.
Args:
instruction: a string of instruction
device: a string of device
Returns:
pred: a tensor of latent embeddings of shape (text_max_length, 512)
"""
tokens = self.text_tokenizer(instruction, return_tensors="pt", padding="longest",
truncation=True)["input_ids"].to(device)
tokens = tokens.view(1, -1)
with torch.no_grad():
pred = self.text_model(tokens).last_hidden_state.detach()
return pred
def _format_joint_to_state(self, joints):
"""
Format the joint proprioception into the unified action vector.
Args:
joints (torch.Tensor): The joint proprioception to be formatted.
qpos ([B, N, 14]).
Returns:
state (torch.Tensor): The formatted vector for RDT ([B, N, 128]).
"""
# Rescale the gripper to the range of [0, 1]
joints = joints / torch.tensor(
[[[180, 180, 180, 180, 180, 180]]],
device=joints.device,
dtype=joints.dtype,
)
B, N, _ = joints.shape
state = torch.zeros(
(B, N, self.args["model"]["state_token_dim"]),
device=joints.device,
dtype=joints.dtype,
)
# Fill into the unified state vector
state[:, :, AGILEX_STATE_INDICES] = joints
# Assemble the mask indicating each dimension's availability
state_elem_mask = torch.zeros(
(B, self.args["model"]["state_token_dim"]),
device=joints.device,
dtype=joints.dtype,
)
state_elem_mask[:, AGILEX_STATE_INDICES] = 1
return state, state_elem_mask
def _unformat_action_to_joint(self, action):
"""
Unformat the unified action vector into the joint action to be executed.
Args:
action (torch.Tensor): The unified action vector to be unformatted.
([B, N, 128])
Returns:
joints (torch.Tensor): The unformatted robot joint action.
qpos ([B, N, 14]).
"""
action_indices = AGILEX_STATE_INDICES
joints = action[:, :, action_indices]
# Rescale the gripper back to the action range
# Note that the action range and proprioception range are different
# for Mobile ALOHA robot
joints = joints * torch.tensor(
[[[180, 180, 180, 180, 180, 180]]],
device=joints.device,
dtype=joints.dtype,
)
return joints
@torch.no_grad()
def step(self, proprio, images, text_embeds):
"""
Predict the next action chunk given the
proprioceptive states, images, and instruction embeddings.
Args:
proprio: proprioceptive states
images: RGB images, the order should be
[ext_{t-1}, right_wrist_{t-1}, left_wrist_{t-1},
ext_{t}, right_wrist_{t}, left_wrist_{t}]
text_embeds: instruction embeddings
Returns:
action: predicted action
"""
device = self.device
dtype = self.dtype
# The background image used for padding
background_color = np.array([int(x * 255) for x in self.image_processor.image_mean],
dtype=np.uint8).reshape(1, 1, 3)
background_image = (np.ones(
(
self.image_processor.size["height"],
self.image_processor.size["width"],
3,
),
dtype=np.uint8,
) * background_color)
# Preprocess the images by order and encode them
image_tensor_list = []
for image in images:
if image is None:
# Replace it with the background image
image = Image.fromarray(background_image)
if self.image_size is not None:
image = transforms.Resize(self.data_args.image_size)(image)
if self.args["dataset"].get("auto_adjust_image_brightness", False):
pixel_values = list(image.getdata())
average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
if average_brightness <= 0.15:
image = transforms.ColorJitter(brightness=(1.75, 1.75))(image)
if self.args["dataset"].get("image_aspect_ratio", "pad") == "pad":
def expand2square(pil_img, background_color):
width, height = pil_img.size
if width == height:
return pil_img
elif width > height:
result = Image.new(pil_img.mode, (width, width), background_color)
result.paste(pil_img, (0, (width - height) // 2))
return result
else:
result = Image.new(pil_img.mode, (height, height), background_color)
result.paste(pil_img, ((height - width) // 2, 0))
return result
image = expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean))
image = self.image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
image_tensor_list.append(image)
image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype)
image_embeds = self.vision_model(image_tensor).detach()
image_embeds = image_embeds.reshape(-1, self.vision_model.hidden_size).unsqueeze(0)
# Prepare the proprioception states and the control frequency
joints = proprio.to(device).unsqueeze(0) # (1, 1, 14)
states, state_elem_mask = self._format_joint_to_state(joints) # (1, 1, 128), (1, 128)
states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to(device, dtype=dtype)
states = states[:, -1:, :] # (1, 1, 128)
ctrl_freqs = torch.tensor([self.control_frequency]).to(device)
text_embeds = text_embeds.to(device, dtype=dtype)
# Predict the next action chunk given the inputs
trajectory = self.policy.predict_action(
lang_tokens=text_embeds,
lang_attn_mask=torch.ones(text_embeds.shape[:2], dtype=torch.bool, device=text_embeds.device),
img_tokens=image_embeds,
state_tokens=states,
action_mask=state_elem_mask.unsqueeze(1),
ctrl_freqs=ctrl_freqs,
)
trajectory = self._unformat_action_to_joint(trajectory).to(torch.float32)
return trajectory

View File

@ -0,0 +1,53 @@
import os
import torch
import yaml
from models.multimodal_encoder.t5_encoder import T5Embedder
# Target GPU index and model/config locations.
GPU = 0
MODEL_PATH = "google/t5-v1_1-xxl"
CONFIG_PATH = "configs/base.yaml"
SAVE_DIR = "outs/"

# Modify this to your task name and instruction
TASK_NAME = "handover_pan"
# NOTE(review): the instruction text does not obviously match the task name
# above — confirm before encoding.
INSTRUCTION = "Pick up the black marker on the right and put it into the packaging box on the left."

# Note: if your GPU VRAM is less than 24GB,
# it is recommended to enable offloading by specifying an offload directory.
OFFLOAD_DIR = (
    None  # Specify your offload directory here, ensuring the directory exists.
)
def main():
    """Encode INSTRUCTION with the T5 text encoder and save the embedding to SAVE_DIR."""
    with open(CONFIG_PATH, "r") as fp:
        config = yaml.safe_load(fp)

    device = torch.device(f"cuda:{GPU}")
    text_embedder = T5Embedder(
        from_pretrained=MODEL_PATH,
        model_max_length=config["dataset"]["tokenizer_max_length"],
        device=device,
        use_offload_folder=OFFLOAD_DIR,
    )
    tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model

    tokens = tokenizer(INSTRUCTION, return_tensors="pt", padding="longest", truncation=True)["input_ids"].to(device)
    tokens = tokens.view(1, -1)
    with torch.no_grad():
        pred = text_encoder(tokens).last_hidden_state.detach().cpu()

    # Fix: torch.save raises FileNotFoundError when SAVE_DIR does not exist yet.
    os.makedirs(SAVE_DIR, exist_ok=True)
    save_path = os.path.join(SAVE_DIR, f"{TASK_NAME}.pt")
    # We save the embeddings in a dictionary format
    torch.save({"name": TASK_NAME, "instruction": INSTRUCTION, "embeddings": pred}, save_path)
    print(
        f'"{INSTRUCTION}" from "{TASK_NAME}" is encoded by "{MODEL_PATH}" into shape {pred.shape} and saved to "{save_path}"'
    )
# Script entry point.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,57 @@
import os
import json
import argparse
import torch
import yaml
from tqdm import tqdm
from models.multimodal_encoder.t5_encoder import T5Embedder
def encode_lang(
    DATA_FILE_PATH,
    TARGET_DIR,
    GPU,
    desc_type="seen",
    tokenizer=None,
    text_encoder=None,
):
    """Encode the instructions in DATA_FILE_PATH with T5 and save one embedding
    file per instruction under TARGET_DIR/instructions.

    Args:
        DATA_FILE_PATH: JSON file mapping description types to instruction lists.
        TARGET_DIR: output directory; embeddings are written to TARGET_DIR/instructions.
        GPU: CUDA device index.
        desc_type: which instruction list to encode (e.g. "seen").
        tokenizer, text_encoder: optional pre-built encoder pair; built lazily
            when either is None so callers can reuse them across episodes.

    Returns:
        (tokenizer, text_encoder) so the caller can pass them back on the next call.
    """
    current_dir = os.path.dirname(__file__)
    with open(os.path.join(current_dir, "../configs/base.yaml"), "r") as fp:
        config = yaml.safe_load(fp)
    device = torch.device(f"cuda:{GPU}")
    if tokenizer is None or text_encoder is None:
        text_embedder = T5Embedder(
            from_pretrained=os.path.join(current_dir, "../../weights/RDT/t5-v1_1-xxl"),
            model_max_length=config["dataset"]["tokenizer_max_length"],
            device=device,
            use_offload_folder=None,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model

    with open(DATA_FILE_PATH, "r") as f_instr:
        instruction_dict = json.load(f_instr)
    instructions = instruction_dict[desc_type]

    # Encode the instructions
    tokenized_res = tokenizer(instructions, return_tensors="pt", padding="longest", truncation=True)
    tokens = tokenized_res["input_ids"].to(device)
    attn_mask = tokenized_res["attention_mask"].to(device)

    with torch.no_grad():
        text_embeds = (text_encoder(input_ids=tokens, attention_mask=attn_mask)["last_hidden_state"].detach().cpu())
    attn_mask = attn_mask.cpu().bool()

    # Fix: race-free directory creation (the exists-check + makedirs pair was racy).
    os.makedirs(os.path.join(TARGET_DIR, "instructions"), exist_ok=True)

    # Save the embeddings for training use; padding tokens are dropped via the mask.
    for i in range(len(instructions)):
        text_embed = text_embeds[i][attn_mask[i]]
        save_path = os.path.join(TARGET_DIR, f"instructions/lang_embed_{i}.pt")
        torch.save(text_embed, save_path)

    return tokenizer, text_encoder

View File

@ -0,0 +1,84 @@
import json
import os
import sys
import re
def extract_metrics_from_log(log_file_path):
    """Scan a training log for metric dicts and return the per-metric minima.

    Returns a 4-tuple (agilex_sample_mse, agilex_sample_l2err,
    overall_avg_sample_mse, overall_avg_sample_l2err), each being the best
    (smallest) value seen, or (None, None, None, None) when the file cannot be
    read or contains no metrics.
    """
    metric_re = re.compile(
        r"\{'agilex_sample_mse':\s*([0-9.eE+-]+),\s*'agilex_sample_l2err':\s*([0-9.eE+-]+),\s*'overall_avg_sample_mse':\s*([0-9.eE+-]+),\s*'overall_avg_sample_l2err':\s*([0-9.eE+-]+)\}"
    )
    found = []
    try:
        with open(log_file_path, 'r', encoding='utf-8') as fh:
            for raw_line in fh:
                match = metric_re.search(raw_line)
                if match is None:
                    continue
                values = tuple(float(group) for group in match.groups())
                found.append(values)
                print(f"Find Metrics: agilex_sample_mse={values[0]}, agilex_sample_l2err={values[1]}, "
                      f"overall_avg_sample_mse={values[2]}, overall_avg_sample_l2err={values[3]}")
    except Exception as e:
        print(f"Failed to read log: {e}")
        return (None, None, None, None)

    if not found:
        print("No metrics found in the log file")
        return (None, None, None, None)

    print(f"\nTotal {len(found)} metrics found in the log file")
    # Best value of each metric, taken independently column-by-column.
    best = tuple(min(column) for column in zip(*found))
    print(f"\nBest metrics:")
    print(f"  agilex_sample_mse: {best[0]}")
    print(f"  agilex_sample_l2err: {best[1]}")
    print(f"  overall_avg_sample_mse: {best[2]}")
    print(f"  overall_avg_sample_l2err: {best[3]}")
    return best
def generate_output_json(input_config_file, output_dir, runtime):
    """Assemble <output_dir>/output.json summarizing a run: config identity,
    runtime, output paths, and the best metrics parsed from output.log."""
    with open(input_config_file, 'r') as cfg_file:
        config = json.load(cfg_file)

    log_file = os.path.join(output_dir, 'output.log')
    best = extract_metrics_from_log(log_file)
    agilex_sample_mse, agilex_sample_l2err, overall_avg_sample_mse, overall_avg_sample_l2err = best
    if any(value is None for value in best):
        print("Warning: Some metrics are missing in the log file.")

    # Model name comes from the top-level key when present, else train.model.
    if "model_name" in config:
        model_name = config.get("model_name")
    else:
        model_name = config.get("train", {}).get("model")

    output_json = {
        "task_id": config.get("task_id"),
        "model_type": "RDT-170M",
        "model_name": model_name,
        "gpu_id": config.get("gpu_id"),
        "runtime": runtime,
        "log_path": log_file,
        "output_dir": output_dir,
        "model_path": os.path.join(output_dir, 'pytorch_model.bin'),
        "metrics": {
            "agilex_sample_mse": agilex_sample_mse,
            "agilex_sample_l2err": agilex_sample_l2err,
            "overall_avg_sample_mse": overall_avg_sample_mse,
            "overall_avg_sample_l2err": overall_avg_sample_l2err
        }
    }

    # Write output.json (pretty-printed, nulls preserved, standard JSON).
    output_json_path = os.path.join(output_dir, 'output.json')
    with open(output_json_path, 'w') as out_file:
        json.dump(output_json, out_file, indent=4, ensure_ascii=False)
# CLI: python generate_output_json.py <input_config_file> <output_dir> <runtime>
if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python generate_output_json.py <input_config_file> <output_dir> <runtime>")
        sys.exit(1)
    generate_output_json(sys.argv[1], sys.argv[2], sys.argv[3])

View File

@ -0,0 +1,325 @@
import os
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
from configs.state_vec import STATE_VEC_IDX_MAPPING
from models.multimodal_encoder.siglip_encoder import SiglipVisionTower
from models.multimodal_encoder.t5_encoder import T5Embedder
from models.rdt_runner import RDTRunner
# Indices of the single-arm ManiSkill observation inside the 128-dim unified
# state vector: 7 right-arm joint positions followed by the gripper opening.
MANISKILL_INDICES = [STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"]
                     for i in range(7)] + [STATE_VEC_IDX_MAPPING[f"right_gripper_open"]]
def create_model(args, pretrained, **kwargs):
    """Instantiate the RDT wrapper and optionally load pretrained policy weights."""
    wrapper = RoboticDiffusionTransformerModel(args, **kwargs)
    if pretrained is not None:
        wrapper.load_pretrained_weights(pretrained)
    return wrapper
# Per-dimension dataset statistics used to normalize states/actions to [-1, 1].
# Each list has 8 entries matching MANISKILL_INDICES: 7 arm joints + gripper.
DATA_STAT = {
    "state_min": [
        -0.7463043928146362,
        -0.0801204964518547,
        -0.4976441562175751,
        -2.657780647277832,
        -0.5742632150650024,
        1.8309762477874756,
        -2.2423808574676514,
        0.0,
    ],
    "state_max": [
        0.7645499110221863,
        1.4967026710510254,
        0.4650936424732208,
        -0.3866899907588959,
        0.5505855679512024,
        3.2900545597076416,
        2.5737812519073486,
        0.03999999910593033,
    ],
    "action_min": [
        -0.7472005486488342,
        -0.08631071448326111,
        -0.4995281398296356,
        -2.658363103866577,
        -0.5751323103904724,
        1.8290787935256958,
        -2.245187997817993,
        -1.0,
    ],
    "action_max": [
        0.7654682397842407,
        1.4984270334243774,
        0.46786263585090637,
        -0.38181185722351074,
        0.5517147779464722,
        3.291581630706787,
        2.575840711593628,
        1.0,
    ],
}
class RoboticDiffusionTransformerModel(object):
    """A wrapper for the RDT model, which handles
    1. Model initialization
    2. Encodings of instructions
    3. Model inference
    """

    def __init__(
        self,
        args,
        device="cuda",
        dtype=torch.bfloat16,
        image_size=None,
        control_frequency=25,
        pretrained_text_encoder_name_or_path=None,
        pretrained_vision_encoder_name_or_path=None,
    ):
        self.args = args
        self.dtype = dtype
        self.image_size = image_size
        self.device = device
        self.control_frequency = control_frequency
        self.text_tokenizer, self.text_model = self.get_text_encoder(pretrained_text_encoder_name_or_path)
        self.image_processor, self.vision_model = self.get_vision_encoder(pretrained_vision_encoder_name_or_path)
        self.policy = self.get_policy()
        # Per-dimension normalization bounds for the ManiSkill state/action spaces.
        self.state_min = torch.tensor(DATA_STAT["state_min"]).to(device)
        self.state_max = torch.tensor(DATA_STAT["state_max"]).to(device)
        self.action_min = torch.tensor(DATA_STAT["action_min"]).to(device)
        self.action_max = torch.tensor(DATA_STAT["action_max"]).to(device)
        self.reset()

    def get_policy(self):
        """Initialize the model."""
        # Initialize model with arguments
        img_cond_len = (self.args["common"]["img_history_size"] * self.args["common"]["num_cameras"] *
                        self.vision_model.num_patches)
        _model = RDTRunner(
            action_dim=self.args["common"]["state_dim"],
            pred_horizon=self.args["common"]["action_chunk_size"],
            config=self.args["model"],
            lang_token_dim=self.args["model"]["lang_token_dim"],
            img_token_dim=self.args["model"]["img_token_dim"],
            state_token_dim=self.args["model"]["state_token_dim"],
            max_lang_cond_len=self.args["dataset"]["tokenizer_max_length"],
            img_cond_len=img_cond_len,
            img_pos_embed_config=[
                # No initial pos embed in the last grid size
                # since we've already done in ViT
                (
                    "image",
                    (
                        self.args["common"]["img_history_size"],
                        self.args["common"]["num_cameras"],
                        -self.vision_model.num_patches,
                    ),
                ),
            ],
            lang_pos_embed_config=[
                # Similarly, no initial pos embed for language
                ("lang", -self.args["dataset"]["tokenizer_max_length"]),
            ],
            dtype=self.dtype,
        )
        return _model

    def get_text_encoder(self, pretrained_text_encoder_name_or_path):
        """Build and return the (tokenizer, T5 encoder) pair."""
        text_embedder = T5Embedder(
            from_pretrained=pretrained_text_encoder_name_or_path,
            model_max_length=self.args["dataset"]["tokenizer_max_length"],
            device=self.device,
        )
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
        return tokenizer, text_encoder

    def get_vision_encoder(self, pretrained_vision_encoder_name_or_path):
        """Build and return the (image processor, SigLIP vision tower) pair."""
        vision_encoder = SiglipVisionTower(vision_tower=pretrained_vision_encoder_name_or_path, args=None)
        image_processor = vision_encoder.image_processor
        return image_processor, vision_encoder

    def reset(self):
        """Set model to evaluation mode."""
        device = self.device
        weight_dtype = self.dtype
        self.policy.eval()
        self.text_model.eval()
        self.vision_model.eval()
        self.policy = self.policy.to(device, dtype=weight_dtype)
        self.text_model = self.text_model.to(device, dtype=weight_dtype)
        self.vision_model = self.vision_model.to(device, dtype=weight_dtype)

    def load_pretrained_weights(self, pretrained=None):
        """Load policy weights from a .pt or .safetensors checkpoint; no-op when None."""
        if pretrained is None:
            return
        print(f"Loading weights from {pretrained}")
        filename = os.path.basename(pretrained)
        if filename.endswith(".pt"):
            # Fix: map to CPU so GPU-saved checkpoints also load on CPU-only
            # hosts; reset() moves the policy to the target device afterwards.
            checkpoint = torch.load(pretrained, map_location="cpu")
            self.policy.load_state_dict(checkpoint["module"])
        elif filename.endswith(".safetensors"):
            from safetensors.torch import load_model
            load_model(self.policy, pretrained)
        else:
            raise NotImplementedError(f"Unknown checkpoint format: {pretrained}")

    def encode_instruction(self, instruction, device="cuda"):
        """Encode string instruction to latent embeddings.

        Args:
            instruction: a string of instruction
            device: a string of device

        Returns:
            pred: a tensor of latent embeddings of shape (text_max_length, 512)
        """
        tokens = self.text_tokenizer(instruction, return_tensors="pt", padding="longest",
                                     truncation=True)["input_ids"].to(device)
        tokens = tokens.view(1, -1)
        with torch.no_grad():
            pred = self.text_model(tokens).last_hidden_state.detach()
        return pred

    def _format_joint_to_state(self, joints):
        """
        Format the robot joint state into the unified state vector.

        Args:
            joints (torch.Tensor): The joint state to be formatted; the last
                dimension matches MANISKILL_INDICES (7 joints + gripper).

        Returns:
            state (torch.Tensor): The formatted state for RDT ([B, N, 128]),
            and the (B, 128) mask of populated dimensions.
        """
        # Normalize each dimension to [-1, 1] using the dataset statistics.
        joints = (joints - self.state_min) / (self.state_max - self.state_min) * 2 - 1
        B, N, _ = joints.shape
        state = torch.zeros(
            (B, N, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        # Assemble the unified state vector
        state[:, :, MANISKILL_INDICES] = joints
        state_elem_mask = torch.zeros(
            (B, self.args["model"]["state_token_dim"]),
            device=joints.device,
            dtype=joints.dtype,
        )
        state_elem_mask[:, MANISKILL_INDICES] = 1
        return state, state_elem_mask

    def _unformat_action_to_joint(self, action):
        """Extract the ManiSkill dims from the unified action and denormalize them."""
        action_indices = MANISKILL_INDICES
        joints = action[:, :, action_indices]
        # Denormalize from [-1, 1] back to the action space.
        joints = (joints + 1) / 2 * (self.action_max - self.action_min) + self.action_min
        return joints

    @torch.no_grad()
    def step(self, proprio, images, text_embeds):
        """
        Args:
            proprio: proprioceptive states
            images: RGB images
            text_embeds: instruction embeddings

        Returns:
            action: predicted action
        """
        device = self.device
        dtype = self.dtype

        # Background used when a camera image is missing.
        background_color = np.array([int(x * 255) for x in self.image_processor.image_mean],
                                    dtype=np.uint8).reshape(1, 1, 3)
        background_image = (np.ones(
            (
                self.image_processor.size["height"],
                self.image_processor.size["width"],
                3,
            ),
            dtype=np.uint8,
        ) * background_color)

        def expand2square(pil_img, background_color):
            # Pad the shorter side with the background color to make a square.
            width, height = pil_img.size
            if width == height:
                return pil_img
            elif width > height:
                result = Image.new(pil_img.mode, (width, width), background_color)
                result.paste(pil_img, (0, (width - height) // 2))
                return result
            else:
                result = Image.new(pil_img.mode, (height, height), background_color)
                result.paste(pil_img, ((height - width) // 2, 0))
                return result

        image_tensor_list = []
        for image in images:
            if image is None:
                # Replace it with the background image
                image = Image.fromarray(background_image)
            if self.image_size is not None:
                # Fix: the original referenced self.data_args.image_size, but no
                # data_args attribute is ever set on this class; __init__ stores
                # the resize target as self.image_size.
                image = transforms.Resize(self.image_size)(image)
            if self.args["dataset"].get("auto_adjust_image_brightness", False):
                pixel_values = list(image.getdata())
                average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
                if average_brightness <= 0.15:
                    image = transforms.ColorJitter(brightness=(1.75, 1.75))(image)
            if self.args["dataset"].get("image_aspect_ratio", "pad") == "pad":
                image = expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean))
            image = self.image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
            image_tensor_list.append(image)
        image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype)
        image_embeds = self.vision_model(image_tensor).detach()
        image_embeds = image_embeds.reshape(-1, self.vision_model.hidden_size).unsqueeze(0)

        # history of actions
        joints = proprio.to(device).unsqueeze(0)  # (1, 1, 14)
        states, state_elem_mask = self._format_joint_to_state(joints)  # (1, 1, 128), (1, 128)
        states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to(device, dtype=dtype)
        states = states[:, -1:, :]  # only the latest state is conditioned on  (1, 1, 128)
        ctrl_freqs = torch.tensor([self.control_frequency]).to(device)
        text_embeds = text_embeds.to(device, dtype=dtype)

        trajectory = self.policy.predict_action(
            lang_tokens=text_embeds,
            lang_attn_mask=torch.ones(text_embeds.shape[:2], dtype=torch.bool, device=text_embeds.device),
            img_tokens=image_embeds,
            state_tokens=states,
            action_mask=state_elem_mask.unsqueeze(1),
            ctrl_freqs=ctrl_freqs,
        )
        trajectory = self._unformat_action_to_joint(trajectory).to(torch.float32)
        return trajectory

View File

@ -0,0 +1,169 @@
import sys
sys.path.append("./")
import os
import h5py
import numpy as np
import pickle
import cv2
import argparse
import yaml
from scripts.encode_lang_batch_once import encode_lang
def load_hdf5(dataset_path):
    """Read one raw episode file and return the gripper/arm trajectories plus
    the per-camera RGB streams."""
    if not os.path.isfile(dataset_path):
        print(f"Dataset does not exist at \n{dataset_path}\n")
        exit()
    with h5py.File(dataset_path, "r") as root:
        left_gripper = root["/joint_action/left_gripper"][()]
        left_arm = root["/joint_action/left_arm"][()]
        right_gripper = root["/joint_action/right_gripper"][()]
        right_arm = root["/joint_action/right_arm"][()]
        image_dict = {
            cam_name: root[f"/observation/{cam_name}/rgb"][()]
            for cam_name in root["/observation/"].keys()
        }
    return left_gripper, left_arm, right_gripper, right_arm, image_dict
def images_encoding(imgs):
    """JPEG-encode a list of images and right-pad each byte string to a common length.

    Args:
        imgs: iterable of HxWx3 uint8 arrays (BGR, as produced by cv2).

    Returns:
        (padded_data, max_len): JPEG byte strings, each null-padded to max_len
        bytes, and that common length (used for the h5py "S{n}" dtype).

    Raises:
        ValueError: when cv2 fails to encode an image.
    """
    encoded = []
    max_len = 0
    for img in imgs:
        success, encoded_image = cv2.imencode(".jpg", img)
        if not success:
            # Fix: the original ignored the success flag and would crash later
            # with a confusing error on a failed encode.
            raise ValueError("cv2.imencode failed to encode an image as JPEG")
        jpeg_data = encoded_image.tobytes()
        encoded.append(jpeg_data)
        max_len = max(max_len, len(jpeg_data))
    # Fix: the original computed the padded list but returned the unpadded one,
    # leaving the padding loop as dead code. h5py null-pads "S{n}" data anyway,
    # so the stored bytes are identical; returning the padded list matches the
    # evident intent.
    padded_data = [data.ljust(max_len, b"\0") for data in encoded]
    return padded_data, max_len
def get_task_config(task_name):
    """Load ./task_config/<task_name>.yml and return the parsed dict."""
    config_path = f"./task_config/{task_name}.yml"
    with open(config_path, "r", encoding="utf-8") as cfg:
        return yaml.load(cfg.read(), Loader=yaml.FullLoader)
def data_transform(path, episode_num, save_path):
    """Convert the first `episode_num` raw episodes under `path` into the
    training layout: one HDF5 per episode containing qpos/action arrays and
    JPEG-encoded camera streams under observations/images.

    Returns the number of episodes processed.
    """
    begin = 0
    floders = os.listdir(path)  # NOTE(review): typo for "folders"; kept as-is
    assert episode_num <= len(floders), "data num not enough"
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    for i in range(episode_num):
        left_gripper_all, left_arm_all, right_gripper_all, right_arm_all, image_dict = (load_hdf5(
            os.path.join(path, f"episode{i}.hdf5")))
        qpos = []
        actions = []
        cam_high = []
        cam_right_wrist = []
        cam_left_wrist = []
        left_arm_dim = []
        right_arm_dim = []
        last_state = None  # unused; presumably left over from an earlier revision
        for j in range(0, left_gripper_all.shape[0]):
            left_gripper, left_arm, right_gripper, right_arm = (
                left_gripper_all[j],
                left_arm_all[j],
                right_gripper_all[j],
                right_arm_all[j],
            )
            # 14-dim joint state: left arm + left gripper + right arm + right gripper.
            state = np.concatenate((left_arm, [left_gripper], right_arm, [right_gripper]), axis=0)  # joint
            state = state.astype(np.float32)
            if j != left_gripper_all.shape[0] - 1:
                # Every frame except the last contributes an observation:
                # the joint state plus the three decoded, resized camera views.
                qpos.append(state)

                camera_high_bits = image_dict["head_camera"][j]
                camera_high = cv2.imdecode(np.frombuffer(camera_high_bits, np.uint8), cv2.IMREAD_COLOR)
                camera_high_resized = cv2.resize(camera_high, (640, 480))
                cam_high.append(camera_high_resized)

                camera_right_wrist_bits = image_dict["right_camera"][j]
                camera_right_wrist = cv2.imdecode(np.frombuffer(camera_right_wrist_bits, np.uint8), cv2.IMREAD_COLOR)
                camera_right_wrist_resized = cv2.resize(camera_right_wrist, (640, 480))
                cam_right_wrist.append(camera_right_wrist_resized)

                camera_left_wrist_bits = image_dict["left_camera"][j]
                camera_left_wrist = cv2.imdecode(np.frombuffer(camera_left_wrist_bits, np.uint8), cv2.IMREAD_COLOR)
                camera_left_wrist_resized = cv2.resize(camera_left_wrist, (640, 480))
                cam_left_wrist.append(camera_left_wrist_resized)
            if j != 0:
                # Every frame except the first contributes an action: the action
                # at step j-1 is the state reached at step j.
                action = state
                actions.append(action)
                left_arm_dim.append(left_arm.shape[0])
                right_arm_dim.append(right_arm.shape[0])

        if not os.path.exists(os.path.join(save_path, f"episode_{i}")):
            os.makedirs(os.path.join(save_path, f"episode_{i}"))
        hdf5path = os.path.join(save_path, f"episode_{i}/episode_{i}.hdf5")
        with h5py.File(hdf5path, "w") as f:
            f.create_dataset("action", data=np.array(actions))
            obs = f.create_group("observations")
            obs.create_dataset("qpos", data=np.array(qpos))
            obs.create_dataset("left_arm_dim", data=np.array(left_arm_dim))
            obs.create_dataset("right_arm_dim", data=np.array(right_arm_dim))
            image = obs.create_group("images")
            # Camera streams are stored as fixed-width JPEG byte strings.
            cam_high_enc, len_high = images_encoding(cam_high)
            cam_right_wrist_enc, len_right = images_encoding(cam_right_wrist)
            cam_left_wrist_enc, len_left = images_encoding(cam_left_wrist)
            image.create_dataset("cam_high", data=cam_high_enc, dtype=f"S{len_high}")
            image.create_dataset("cam_right_wrist", data=cam_right_wrist_enc, dtype=f"S{len_right}")
            image.create_dataset("cam_left_wrist", data=cam_left_wrist_enc, dtype=f"S{len_left}")
        begin += 1
        print(f"proccess {i} success!")
    return begin
# CLI: python process_data.py <task_name> <task_config> <expert_data_num>
# Converts raw episodes into the processed layout, then encodes the per-episode
# language instructions (reusing one T5 encoder across all episodes).
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process some episodes.")
    parser.add_argument("task_name", type=str)
    parser.add_argument("task_config", type=str)
    parser.add_argument("expert_data_num", type=int)
    args = parser.parse_args()

    task_name = args.task_name
    task_config = args.task_config
    expert_data_num = args.expert_data_num

    load_dir = os.path.join("../../data", str(task_name), str(task_config), "data")
    print(f"read data from path: {load_dir}")
    begin = data_transform(
        load_dir,
        expert_data_num,
        f"./processed_data/{task_name}-{task_config}-{expert_data_num}",
    )

    # tokenizer/text_encoder start as None and are built once by the first
    # encode_lang call, then reused for every subsequent episode.
    tokenizer, text_encoder = None, None
    for idx in range(expert_data_num):
        print(f"Processing Language: {idx}", end="\r")
        data_file_path = (f"../../data/{task_name}/{task_config}/instructions/episode{idx}.json")
        target_dir = (f"processed_data/{task_name}-{task_config}-{expert_data_num}/episode_{idx}")
        tokenizer, text_encoder = encode_lang(
            DATA_FILE_PATH=data_file_path,
            TARGET_DIR=target_dir,
            GPU=0,
            desc_type="seen",
            tokenizer=tokenizer,
            text_encoder=text_encoder,
        )

View File

@ -0,0 +1,42 @@
import json
import sys
def read_config(config_file, key_path):
    """
    Read a value from JSON config file.

    Args:
        config_file: Path to JSON config file
        key_path: Dot-separated path to the key (e.g., "evaluation.checkpoint_path")

    Returns:
        The value at the specified key path, or None when any segment is missing
        or a non-dict is reached before the path is exhausted.
    """
    with open(config_file, 'r') as fh:
        node = json.load(fh)

    for part in key_path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(part)
    return node
# CLI: print the value at <key_path> in <config_file>; exit 1 (with an empty
# stderr line) when the key is absent, so shell callers can test the status.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: python read_config.py <config_file> <key_path>", file=sys.stderr)
        sys.exit(1)
    config_file = sys.argv[1]
    key_path = sys.argv[2]
    value = read_config(config_file, key_path)
    if value is not None:
        print(value)
    else:
        print("", file=sys.stderr)
        sys.exit(1)

View File

@ -0,0 +1,22 @@
import sys
import yaml
def read_yaml_value(file_path, key):
    """Print the value stored under a top-level key of a YAML file, or a
    not-found message when the key is absent."""
    with open(file_path, "r") as fh:
        content = yaml.safe_load(fh)
    result = content.get(key)
    if result is None:
        print(f"Key '{key}' not found in {file_path}")
    else:
        print(result)
# CLI: python read_yaml.py <file_path> <key>
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python read_yaml.py <file_path> <key>")
        sys.exit(1)
    file_path = sys.argv[1]
    key = sys.argv[2]
    read_yaml_value(file_path, key)

View File

@ -0,0 +1,2 @@
input/
output/

14
RDT/rdt-quant/Dockerfile Normal file
View File

@ -0,0 +1,14 @@
# Quantization image built on the vendor AI-toolchain base (provides hb_compile).
FROM ai_toolchain_ubuntu_22_s100_gpu:v3.2.0

WORKDIR /app

# Non-interactive apt, unbuffered Python logs, Shanghai timezone.
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai

# Switch apt to the Tsinghua mirror for faster access from CN networks.
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
    sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list

COPY . /app/

# convert.sh drives the whole PTQ compile flow (see rdt-quant/convert.sh).
ENTRYPOINT ["bash", "convert.sh"]

30
RDT/rdt-quant/convert.sh Normal file
View File

@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Compile the RDT image adaptor and DiT policy with hb_compile, timing each stage.
# Fix: fail fast — without set -e a failed hb_compile stage was silently ignored
# and the script continued; unquoted expansions broke on paths with spaces.
set -euo pipefail

CONFIG=input/config.json
OUTPUT=/app/output/$(python3 read_json.py "$CONFIG" task_id)

# Materialize the per-model PTQ YAMLs from the config.
python3 load_config.py "$CONFIG"
echo "Convert PTQ YAML Haved been Prepared"

######### Img Adaptor
cd "$OUTPUT/Img_Adaptor"
BEGIN_IMG_ADAPTOR_TIME=$(date +%s)
echo -e "\033[44;37m===== Start Compiling Img Adaptor =====\033[0m"
hb_compile --config "$OUTPUT/img_adaptor.yaml"
echo -e "\033[44;37m===== End Compiling Img Adaptor =====\033[0m"
END_IMG_ADAPTOR_TIME=$(date +%s)
IMG_ADAPTOR_TIME=$((END_IMG_ADAPTOR_TIME - BEGIN_IMG_ADAPTOR_TIME))
echo -e "\033[44;37m===== Img Adaptor Time =====\033[0m"
echo -e "\033[44;37m===== $IMG_ADAPTOR_TIME seconds =====\033[0m"

########## DiT
cd "$OUTPUT/DiT_Policy"
BEGIN_DIT_TIME=$(date +%s)
echo -e "\033[44;37m===== Start Compiling DiT =====\033[0m"
hb_compile --config "$OUTPUT/dit.yaml"
echo -e "\033[44;37m===== End Compiling DiT =====\033[0m"
END_DIT_TIME=$(date +%s)
DIT_TIME=$((END_DIT_TIME - BEGIN_DIT_TIME))
echo -e "\033[44;37m===== DiT Time =====\033[0m"
echo -e "\033[44;37m===== $DIT_TIME seconds =====\033[0m"

View File

@ -0,0 +1,88 @@
import json
import yaml
import sys
import os
from dataclasses import dataclass
DIT = "DiT_Policy"
IMG_ADAPTOR = "Img_Adaptor"
@dataclass
class QuantConfig:
    """Flattened view of the "quant" section of the input config JSON."""

    # NOTE(review): every field defaults to None, so the `str` annotations are
    # effectively Optional[str]; kept as-is to avoid changing runtime behavior.
    task_id: str = None  # unique run id; also used as the output subdirectory name
    gpu_id: str = None
    march: str = None  # target BPU micro-architecture passed to hb_compile
    model_type: str = None  # selects the ptq_yaml/<model_type>/ template set
    output_path: str = None  # <quant.output_path>/<task_id>
    DiT_Policy_ONNX: str = None  # path to the DiT policy ONNX model
    DiT_Policy_CALIBRATION: str = None  # DiT calibration data directory
    Img_Adaptor_ONNX: str = None  # path to the image adaptor ONNX model
    Img_Adaptor_CALIBRATION: str = None  # image adaptor calibration data directory
def load_config(config_path):
    """Read the quantization config JSON and materialize per-model hb_compile YAMLs.

    Writes <output_path>/img_adaptor.yaml and <output_path>/dit.yaml based on
    the ptq_yaml/<model_type>/ templates, patched with paths from the config.
    Does nothing when the config has no "quant" section.
    """
    with open(config_path, "r") as file:
        config = json.load(file)

    if "quant" not in config:
        return

    quant_info = config["quant"]
    # Fix: the original only bound these names inside `if ... in quant_info`
    # checks, so a config missing either key raised NameError below.
    dit_policy = quant_info.get("DiT_Policy", {})
    img_adaptor = quant_info.get("Img_Adaptor", {})

    opt = QuantConfig(
        task_id=config.get("task_id"),
        gpu_id=config.get("gpu_id"),
        march=quant_info.get("march"),
        model_type=quant_info.get("model_type"),
        output_path=os.path.join(quant_info.get("output_path"), config.get("task_id")),
        DiT_Policy_ONNX=dit_policy.get("onnx_model"),
        DiT_Policy_CALIBRATION=dit_policy.get("calibration_data"),
        Img_Adaptor_ONNX=img_adaptor.get("onnx_model"),
        Img_Adaptor_CALIBRATION=img_adaptor.get("calibration_data"),
    )
    os.makedirs(opt.output_path, exist_ok=True)

    # Prepare the Img Adaptor convert YAML.
    with open(f"ptq_yaml/{opt.model_type}/img_adaptor.yaml", "r") as file:
        img_adaptor_yaml = yaml.safe_load(file)
    img_adaptor_yaml["model_parameters"]["onnx_model"] = opt.Img_Adaptor_ONNX
    img_adaptor_yaml["model_parameters"]["march"] = opt.march
    img_adaptor_yaml["model_parameters"]["output_model_file_prefix"] = "rdt_img_adaptor"
    img_adaptor_yaml["calibration_parameters"]["cal_data_dir"] = opt.Img_Adaptor_CALIBRATION
    img_adaptor_yaml["model_parameters"]["working_dir"] = IMG_ADAPTOR
    img_adaptor_yaml_path = os.path.join(opt.output_path, "img_adaptor.yaml")
    with open(img_adaptor_yaml_path, 'w') as f:
        yaml.safe_dump(img_adaptor_yaml, f, default_flow_style=False, allow_unicode=True)

    # Prepare the DiT convert YAML; substitute the calibration-dir placeholder.
    with open(f"ptq_yaml/{opt.model_type}/dit.yaml", "r") as file:
        dit_yaml = yaml.safe_load(file)
    for k, v in dit_yaml.get("calibration_parameters", {}).items():
        if isinstance(v, str) and "{dit_cal_name}" in v:
            if opt.DiT_Policy_CALIBRATION is not None:
                dit_yaml["calibration_parameters"][k] = v.replace("{dit_cal_name}", opt.DiT_Policy_CALIBRATION)
            else:
                raise ValueError(f"DiT_Policy_CALIBRATION is None, cannot replace {{dit_cal_name}} in {k}")
    dit_yaml["model_parameters"]["onnx_model"] = opt.DiT_Policy_ONNX
    dit_yaml["model_parameters"]["march"] = opt.march
    dit_yaml["model_parameters"]["working_dir"] = DIT
    # Embed the per-op quantization config JSON directly into the YAML.
    with open(f"ptq_yaml/{opt.model_type}/dit_op_config.json", "r") as file:
        dit_json = json.load(file)
    dit_yaml["calibration_parameters"]["quant_config"] = dit_json
    dit_yaml_path = os.path.join(opt.output_path, "dit.yaml")
    with open(dit_yaml_path, 'w') as f:
        yaml.safe_dump(dit_yaml, f, default_flow_style=False, allow_unicode=True)
# CLI: python load_config.py <config.json>
if __name__ == "__main__":
    config_path = sys.argv[1]
    # NOTE(review): load_config returns None; the assignment is kept as-is.
    config = load_config(config_path)

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

View File

@ -0,0 +1,29 @@
calibration_parameters:
cal_data_dir: '{dit_cal_name}/x/;{dit_cal_name}/freq/;{dit_cal_name}/t/;{dit_cal_name}/lang_c/;{dit_cal_name}/img_c/;{dit_cal_name}/lang_mask/;'
quant_config: dit_json_name
run_on_cpu: '/t_embedder/Cos;/t_embedder/Sin;/freq_embedder/Cos;/freq_embedder/Sin'
compiler_parameters:
compile_mode: latency
core_num: 1
debug: true
jobs: 8
max_time_per_fc: 0
optimize_level: O2
advice: 1
input_parameters:
input_layout_rt: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
input_layout_train: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
input_name: x;freq;t;lang_c;img_c;lang_mask;
input_shape: 1x65x1024;1;1;1x64x1024;1x4374x1024;1x64
input_space_and_range: ''
input_type_rt: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
input_type_train: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
norm_type: no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess
model_parameters:
layer_out_dump: false
debug_mode: "dump_calibration_data"
enable_vpu: True
march: {opt.march}
onnx_model: {dit_name}
output_model_file_prefix: rdt_dit
working_dir: bpu_output

View File

@ -0,0 +1,251 @@
{
"model_config": {
"all_node_type": "int16",
"model_output_type": "float32",
"activation": {
"calibration_type": ["max"],
"num_bin": [1024, 2048, 4096],
"max_num_bin": 16384,
"max_percentile": 1.0,
"per_channel": true,
"asymmetric": [true]
},
"weight": {
"bias_correction": {
"metric": "mae"
}
},
"modelwise_search": {
"metric": "mae"
}
},
"op_config": {
"ReduceMean": {"qtype": "int16"},
"Sub": {"qtype": "int16"},
"Softmax": {"qtype": "int16"}
},
"node_config": {
"/t_embedder/Mul": {"qtype": "float32"},
"/t_embedder/Cos": {"qtype": "float32"},
"/t_embedder/Sin": {"qtype": "float32"},
"/t_embedder/Concat": {"qtype": "float32"},
"/freq_embedder/Mul": {"qtype": "float32"},
"/freq_embedder/Cos": {"qtype": "float32"},
"/freq_embedder/Sin": {"qtype": "float32"},
"/freq_embedder/Concat": {"qtype": "float32"},
"/blocks.0/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.0/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.0/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.0/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.0/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.0/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.0/ffn/act/Mul": {"qtype": "int16"},
"/blocks.0/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.0/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.0/ffn/act/Add": {"qtype": "int16"},
"/blocks.0/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.0/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.0/norm1/Mul_2": {"qtype": "int16"},
"/blocks.0/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.0/Add": {"qtype": "int16"},
"/blocks.1/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.1/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.1/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.1/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.1/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.1/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.1/ffn/act/Mul": {"qtype": "int16"},
"/blocks.1/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.1/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.1/ffn/act/Add": {"qtype": "int16"},
"/blocks.1/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.1/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.1/norm1/Mul_2": {"qtype": "int16"},
"/blocks.1/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.1/Add": {"qtype": "int16"},
"/blocks.2/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.2/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.2/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.2/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.2/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.2/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.2/ffn/act/Mul": {"qtype": "int16"},
"/blocks.2/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.2/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.2/ffn/act/Add": {"qtype": "int16"},
"/blocks.2/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.2/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.2/norm1/Mul_2": {"qtype": "int16"},
"/blocks.2/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.2/Add": {"qtype": "int16"},
"/blocks.3/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.3/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.3/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.3/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.3/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.3/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.3/ffn/act/Mul": {"qtype": "int16"},
"/blocks.3/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.3/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.3/ffn/act/Add": {"qtype": "int16"},
"/blocks.3/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.3/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.3/norm1/Mul_2": {"qtype": "int16"},
"/blocks.3/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.3/Add": {"qtype": "int16"},
"/blocks.4/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.4/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.4/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.4/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.4/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.4/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.4/ffn/act/Mul": {"qtype": "int16"},
"/blocks.4/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.4/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.4/ffn/act/Add": {"qtype": "int16"},
"/blocks.4/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.4/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.4/norm1/Mul_2": {"qtype": "int16"},
"/blocks.4/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.4/Add": {"qtype": "int16"},
"/blocks.5/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.5/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.5/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.5/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.5/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.5/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.5/ffn/act/Mul": {"qtype": "int16"},
"/blocks.5/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.5/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.5/ffn/act/Add": {"qtype": "int16"},
"/blocks.5/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.5/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.5/norm1/Mul_2": {"qtype": "int16"},
"/blocks.5/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.5/Add": {"qtype": "int16"},
"/blocks.6/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.6/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.6/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.6/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.6/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.6/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.6/ffn/act/Mul": {"qtype": "int16"},
"/blocks.6/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.6/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.6/ffn/act/Add": {"qtype": "int16"},
"/blocks.6/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.6/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.6/norm1/Mul_2": {"qtype": "int16"},
"/blocks.6/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.6/Add": {"qtype": "int16"},
"/blocks.7/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.7/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.7/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.7/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.7/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.7/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.7/ffn/act/Mul": {"qtype": "int16"},
"/blocks.7/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.7/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.7/ffn/act/Add": {"qtype": "int16"},
"/blocks.7/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.7/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.7/norm1/Mul_2": {"qtype": "int16"},
"/blocks.7/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.7/Add": {"qtype": "int16"},
"/blocks.8/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.8/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.8/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.8/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.8/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.8/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.8/ffn/act/Mul": {"qtype": "int16"},
"/blocks.8/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.8/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.8/ffn/act/Add": {"qtype": "int16"},
"/blocks.8/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.8/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.8/norm1/Mul_2": {"qtype": "int16"},
"/blocks.8/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.8/Add": {"qtype": "int16"},
"/blocks.9/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.9/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.9/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.9/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.9/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.9/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.9/ffn/act/Mul": {"qtype": "int16"},
"/blocks.9/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.9/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.9/ffn/act/Add": {"qtype": "int16"},
"/blocks.9/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.9/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.9/norm1/Mul_2": {"qtype": "int16"},
"/blocks.9/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.9/Add": {"qtype": "int16"},
"/blocks.10/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.10/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.10/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.10/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.10/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.10/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.10/ffn/act/Mul": {"qtype": "int16"},
"/blocks.10/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.10/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.10/ffn/act/Add": {"qtype": "int16"},
"/blocks.10/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.10/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.10/norm1/Mul_2": {"qtype": "int16"},
"/blocks.10/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.10/Add": {"qtype": "int16"},
"/blocks.11/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.11/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.11/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.11/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.11/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.11/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.11/ffn/act/Mul": {"qtype": "int16"},
"/blocks.11/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.11/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.11/ffn/act/Add": {"qtype": "int16"},
"/blocks.11/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.11/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.11/norm1/Mul_2": {"qtype": "int16"},
"/blocks.11/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.11/Add": {"qtype": "int16"},
"/blocks.12/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.12/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.12/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.12/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.12/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.12/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.12/ffn/act/Mul": {"qtype": "int16"},
"/blocks.12/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.12/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.12/ffn/act/Add": {"qtype": "int16"},
"/blocks.12/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.12/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.12/norm1/Mul_2": {"qtype": "int16"},
"/blocks.12/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.12/Add": {"qtype": "int16"},
"/blocks.13/attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.13/attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.13/cross_attn/MatMul": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.13/cross_attn/MatMul_1": {"InputType0": "int16", "InputType1": "int16"},
"/blocks.13/cross_attn/k_norm/Mul_1": {"qtype": "int16"},
"/blocks.13/ffn/fc1/MatMul": {"qtype": "int16"},
"/blocks.13/ffn/act/Mul": {"qtype": "int16"},
"/blocks.13/ffn/act/Mul_1": {"qtype": "int16"},
"/blocks.13/ffn/act/Mul_2": {"qtype": "int16"},
"/blocks.13/ffn/act/Add": {"qtype": "int16"},
"/blocks.13/ffn/act/Mul_3": {"qtype": "int16"},
"/blocks.13/ffn/act/Tanh": {"qtype": "int16"},
"/blocks.13/norm1/Mul_2": {"qtype": "int16"},
"/blocks.13/cross_attn/k_norm/Div_1_reciprocal": {"qtype": "int16"},
"/blocks.13/Add": {"qtype": "int16"},
"/blocks.13/norm3/Div_1_reciprocal": {"qtype": "int16"},
"/final_layer/ffn_final/act/Mul_1": {"qtype": "int16"},
"/final_layer/ffn_final/act/Mul_2": {"qtype": "int16"},
"/final_layer/norm_final/Div_1_reciprocal": {"qtype": "float32"}
}
}

View File

@ -0,0 +1,33 @@
model_parameters:
onnx_model: '{img_adaptor_name}'
march: {opt.march}
layer_out_dump: False
working_dir: bpu_output
output_model_file_prefix: rdt_img_adaptor
enable_vpu: True
input_parameters:
input_name: ''
input_type_rt: 'featuremap;'
input_layout_rt: 'NCHW;'
input_type_train: 'featuremap;'
input_layout_train: 'NCHW;'
norm_type: 'no_preprocess;'
calibration_parameters:
cal_data_dir: '{img_adaptor_cal_name}'
cal_data_type: 'float32'
calibration_type: 'default'
quant_config:
model_config:
all_node_type: int16
model_output_type: int16
compiler_parameters:
extra_params:
input_no_padding: true
output_no_padding: true
jobs: 8
compile_mode: 'latency'
debug: True
advice: 1
optimize_level: 'O2'
core_num: 2

View File

@ -0,0 +1,29 @@
calibration_parameters:
cal_data_dir: '{dit_cal_name}/x/;{dit_cal_name}/freq/;{dit_cal_name}/t/;{dit_cal_name}/lang_c/;{dit_cal_name}/img_c/;{dit_cal_name}/lang_mask/;'
quant_config: dit_json_name
run_on_cpu: '/t_embedder/Cos;/t_embedder/Sin;/freq_embedder/Cos;/freq_embedder/Sin'
compiler_parameters:
compile_mode: latency
core_num: 1
debug: true
jobs: 8
max_time_per_fc: 0
optimize_level: O2
advice: 1
input_parameters:
input_layout_rt: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
input_layout_train: NCHW;NCHW;NCHW;NCHW;NCHW;NCHW
input_name: x;freq;t;lang_c;img_c;lang_mask;
input_shape: 1x65x2048;1;1;1x64x2048;1x4374x2048;1x64
input_space_and_range: ''
input_type_rt: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
input_type_train: featuremap;featuremap;featuremap;featuremap;featuremap;featuremap
norm_type: no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess;no_preprocess
model_parameters:
layer_out_dump: false
debug_mode: "dump_calibration_data"
enable_vpu: True
march: {opt.march}
onnx_model: {dit_name}
output_model_file_prefix: rdt_dit
working_dir: bpu_output

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,32 @@
model_parameters:
onnx_model: '{img_adaptor_name}'
march: {opt.march}
layer_out_dump: False
working_dir: bpu_output
output_model_file_prefix: rdt_img_adaptor
enable_vpu: True
input_parameters:
input_name: ''
input_type_rt: 'featuremap;'
input_layout_rt: 'NCHW;'
input_type_train: 'featuremap;'
input_layout_train: 'NCHW;'
norm_type: 'no_preprocess;'
calibration_parameters:
cal_data_dir: '{img_adaptor_cal_name}'
cal_data_type: 'float32'
calibration_type: 'default'
quant_config:
model_config:
all_node_type: int16
model_output_type: int16
compiler_parameters:
extra_params:
input_no_padding: true
output_no_padding: true
jobs: 8
compile_mode: 'latency'
debug: True
advice: 1
optimize_level: 'O2'
core_num: 2

View File

@ -0,0 +1,42 @@
import json
import sys
def read_config(config_file, key_path):
    """Look up a dotted key path inside a JSON configuration file.

    Args:
        config_file: Path to the JSON config file.
        key_path: Dot-separated path to the key
            (e.g., "evaluation.checkpoint_path").

    Returns:
        The value stored at the key path, or None when any path segment is
        missing or a non-dict value is reached before the path is exhausted.
    """
    with open(config_file, 'r') as f:
        node = json.load(f)
    # Walk the nested dicts one path segment at a time.
    for segment in key_path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(segment)
    return node
if __name__ == "__main__":
    # CLI entry point: print the value at <key_path> from <config_file>.
    # Exits 1 (with usage or an empty stderr line) on bad args or a missing key.
    if len(sys.argv) < 3:
        print("Usage: python read_config.py <config_file> <key_path>", file=sys.stderr)
        sys.exit(1)
    result = read_config(sys.argv[1], sys.argv[2])
    if result is None:
        print("", file=sys.stderr)
        sys.exit(1)
    print(result)

View File

@ -0,0 +1,2 @@
input/*
output/*

7
RDT/rdt170m-run/.gitignore vendored Normal file
View File

@ -0,0 +1,7 @@
processed_data/
training_data/
checkpoints/
model_config/*.yml
wandb/*
!models/
!data/

View File

@ -0,0 +1,48 @@
FROM registry.d-robotics.cc/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
# ccr-29eug8s3-pub.cnc.bj.baidubce.com/public/cuda:11.8.0-cudnn8-devel-ubuntu22.04
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV TZ=Asia/Shanghai
RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list && \
sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list
RUN apt-get update --allow-unauthenticated && apt-get install -y \
software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update \
&& apt-get install -y \
python3.10 \
python3.10-dev \
python3-pip \
python3.10-distutils \
libgl1-mesa-glx \
libglib2.0-0 \
wget \
ffmpeg \
libsm6 \
libxext6 \
&& rm -rf /var/lib/apt/lists/*
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
COPY . /app/
RUN python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
# RUN pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
# RUN pip install torch==2.1.0 torchvision==0.16.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install packaging==24.0
RUN pip install tfds-nightly==4.9.4.dev202402070044
RUN pip install flash_attn-2.7.2.post1+cu12torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# RUN mkdir -p /app/dataset/input /app/dataset/output
ENTRYPOINT ["bash", "deploy.sh"]

View File

@ -0,0 +1 @@
from .deploy_policy import *

Binary file not shown.

After

Width:  |  Height:  |  Size: 726 KiB

300
RDT/rdt170m-run/client.py Normal file
View File

@ -0,0 +1,300 @@
#!/usr/bin/env python3
"""
RDT 推理服务器测试客户端
使用模拟数据测试 get_actions 接口
"""
import numpy as np
import logging
import argparse
import time
from cloud_helper import Client
# 配置日志
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
def create_mock_observation(
    state_dim=6,
    img_history_size=2,
    img_height=480,
    img_width=640,
    num_cameras=3
):
    """Build a synthetic observation dict for exercising the inference server.

    Args:
        state_dim: Length of the proprioceptive state vector (joint count).
        img_history_size: Number of stacked frames per camera.
        img_height: Frame height in pixels.
        img_width: Frame width in pixels.
        num_cameras: How many of the predefined cameras to include.

    Returns:
        Dict with a random "state" vector (float32, uniform in [-180, 180])
        and, per camera, an "images.<cam_name>" array of shape
        (img_history_size, img_height, img_width, 3), dtype uint8.
        msgpack_numpy handles serialization of these arrays transparently.
    """
    obs = {
        "state": np.random.uniform(-180, 180, size=(state_dim,)).astype(np.float32)
    }
    cam_names = ["cam_high", "cam_left_wrist", "cam_right_wrist"][:num_cameras]
    for cam_idx, cam_name in enumerate(cam_names):
        frames = []
        for step in range(img_history_size):
            frame = np.zeros((img_height, img_width, 3), dtype=np.uint8)
            # Per-frame / per-camera colour shift gives every image distinct content.
            shift = (step * 50 + cam_idx * 100) % 255
            frame[:, :, 0] = np.linspace(shift, 255, img_width, dtype=np.uint8)         # R gradient
            frame[:, :, 1] = np.linspace(0, 255 - shift, img_height, dtype=np.uint8)[:, None]  # G gradient
            frame[:, :, 2] = 128                                                        # constant B
            frames.append(frame)
        # Stack to (IMG_HISTORY_SIZE, H, W, 3).
        obs[f"images.{cam_name}"] = np.stack(frames, axis=0)
    return obs
def create_test_batch(
    observation,
    instruction="pick up the bottle and place it in the box",
    use_instruction_index=False
):
    """Assemble a request payload for the `get_actions` endpoint.

    Args:
        observation: Observation dict (state + camera images).
        instruction: Natural-language command; ignored when
            use_instruction_index is True.
        use_instruction_index: When True, send instruction index 0 instead
            of the instruction string.

    Returns:
        Dict with "observation" and "instruction" keys, ready to send.
    """
    chosen = 0 if use_instruction_index else instruction
    return {"observation": observation, "instruction": chosen}
def test_single_request(client, args):
    """Run one end-to-end `get_actions` request with mock data and log the result.

    Returns True on success, False if the RPC raised.
    """
    logger.info("=" * 60)
    logger.info("开始单次请求测试")
    logger.info("=" * 60)
    # Build a mock observation matching the server's expected layout.
    observation = create_mock_observation(
        state_dim=args.state_dim,
        img_history_size=args.img_history_size,
        img_height=args.img_height,
        img_width=args.img_width,
        num_cameras=args.num_cameras
    )
    logger.info(f"模拟观测数据:")
    logger.info(f" - state shape: {observation['state'].shape}")
    for key in observation.keys():
        if key.startswith("images."):
            logger.info(f" - {key} shape: {observation[key].shape}")
    # Wrap observation + instruction into a request batch.
    batch = create_test_batch(
        observation,
        instruction=args.instruction,
        use_instruction_index=args.use_index
    )
    # Send the request and time the full round trip.
    logger.info(f"发送指令: {batch['instruction']}")
    start_time = time.time()
    try:
        action = client.call_endpoint("get_actions", batch)
        elapsed_time = time.time() - start_time
        logger.info(f"✓ 请求成功! 耗时: {elapsed_time*1000:.2f} ms")
        logger.info(f" - action shape: {action.shape}")
        logger.info(f" - action dtype: {action.dtype}")
        logger.info(f" - action range: [{action.min():.3f}, {action.max():.3f}]")
        logger.info(f" - action preview (前3个时间步的前3个维度):")
        # Preview at most the first 3 timesteps x 3 dims of the action chunk.
        preview_steps = min(3, action.shape[0])
        preview_dims = min(3, action.shape[1])
        for t in range(preview_steps):
            logger.info(f" t={t}: {action[t, :preview_dims]}")
        return True
    except Exception as e:
        logger.error(f"✗ 请求失败: {e}")
        return False
def test_multiple_requests(client, args):
    """Fire `args.num_requests` identical requests and report latency statistics."""
    logger.info("=" * 60)
    logger.info(f"开始连续请求测试 (共 {args.num_requests} 次)")
    logger.info("=" * 60)
    # Build one observation/batch up front so the loop measures only the RPC.
    observation = create_mock_observation(
        state_dim=args.state_dim,
        img_history_size=args.img_history_size,
        img_height=args.img_height,
        img_width=args.img_width,
        num_cameras=args.num_cameras
    )
    batch = create_test_batch(
        observation,
        instruction=args.instruction,
        use_instruction_index=args.use_index
    )
    success_count = 0
    total_time = 0
    latencies = []
    for i in range(args.num_requests):
        try:
            start_time = time.time()
            action = client.call_endpoint("get_actions", batch)
            elapsed_time = time.time() - start_time
            success_count += 1
            total_time += elapsed_time
            latencies.append(elapsed_time)
            # Progress log every 10 completed requests.
            if (i + 1) % 10 == 0:
                logger.info(f"已完成 {i + 1}/{args.num_requests} 次请求")
        except Exception as e:
            logger.error(f"第 {i+1} 次请求失败: {e}")
    # Summarize success rate and latency distribution.
    logger.info("=" * 60)
    logger.info("性能统计:")
    logger.info(f" - 总请求数: {args.num_requests}")
    logger.info(f" - 成功数: {success_count}")
    logger.info(f" - 失败数: {args.num_requests - success_count}")
    logger.info(f" - 成功率: {success_count/args.num_requests*100:.1f}%")
    if latencies:
        latencies = np.array(latencies)
        logger.info(f" - 平均延迟: {np.mean(latencies)*1000:.2f} ms")
        logger.info(f" - 中位数延迟: {np.median(latencies)*1000:.2f} ms")
        logger.info(f" - 最小延迟: {np.min(latencies)*1000:.2f} ms")
        logger.info(f" - 最大延迟: {np.max(latencies)*1000:.2f} ms")
        logger.info(f" - 吞吐量: {success_count/total_time:.2f} requests/s")
def test_different_instructions(client, args):
    """Send the same mock observation with several different language instructions."""
    logger.info("=" * 60)
    logger.info("测试不同指令")
    logger.info("=" * 60)
    instructions = [
        "pick up the red cube",
        "place the bottle on the table",
        "move to the left",
        "grasp the bottle",
        "open the drawer"
    ]
    # One shared observation; only the instruction varies between requests.
    observation = create_mock_observation(
        state_dim=args.state_dim,
        img_history_size=args.img_history_size,
        img_height=args.img_height,
        img_width=args.img_width,
        num_cameras=args.num_cameras
    )
    for i, instruction in enumerate(instructions):
        logger.info(f"\n测试指令 {i+1}/{len(instructions)}: '{instruction}'")
        batch = create_test_batch(observation, instruction=instruction)
        try:
            start_time = time.time()
            action = client.call_endpoint("get_actions", batch)
            elapsed_time = time.time() - start_time
            logger.info(f" ✓ 成功 | 耗时: {elapsed_time*1000:.2f} ms | action shape: {action.shape}")
        except Exception as e:
            logger.error(f" ✗ 失败: {e}")
def main():
    """Parse CLI arguments, connect to the RDT server, and run the chosen test mode."""
    parser = argparse.ArgumentParser(description="RDT 推理服务器测试客户端")
    # Connection options.
    parser.add_argument("--host", type=str, default="localhost", help="服务器地址")
    parser.add_argument("--port", type=int, default=8005, help="服务器端口")
    # Test mode selection.
    parser.add_argument("--mode", type=str, default="single",
                        choices=["single", "multiple", "instructions"],
                        help="测试模式: single(单次), multiple(多次), instructions(不同指令)")
    parser.add_argument("--num-requests", type=int, default=50,
                        help="多次测试的请求数量")
    # Mock-data dimensions (must match the server's configuration).
    parser.add_argument("--state-dim", type=int, default=6, help="状态向量维度")
    parser.add_argument("--img-history-size", type=int, default=2, help="图像历史长度")
    parser.add_argument("--img-height", type=int, default=480, help="图像高度")
    parser.add_argument("--img-width", type=int, default=640, help="图像宽度")
    parser.add_argument("--num-cameras", type=int, default=3, help="相机数量 (与服务器配置一致)")
    # Instruction options.
    parser.add_argument("--instruction", type=str,
                        default="pick up the bottle and place it in the box",
                        help="测试指令")
    parser.add_argument("--use-index", action="store_true",
                        help="使用指令索引而非字符串")
    args = parser.parse_args()
    # Connect to the inference server; bail out early if unreachable.
    logger.info(f"正在连接到 {args.host}:{args.port} ...")
    try:
        client = Client(host=args.host, port=args.port)
        logger.info("✓ 连接成功!")
    except Exception as e:
        logger.error(f"✗ 连接失败: {e}")
        return
    # Dispatch to the selected test routine.
    try:
        if args.mode == "single":
            test_single_request(client, args)
        elif args.mode == "multiple":
            test_multiple_requests(client, args)
        elif args.mode == "instructions":
            test_different_instructions(client, args)
    except KeyboardInterrupt:
        logger.info("\n测试被用户中断")
    except Exception as e:
        logger.error(f"测试过程中发生错误: {e}")
        import traceback
        traceback.print_exc()

View File

@ -0,0 +1,162 @@
import zmq
import msgpack
import msgpack_numpy as m
import logging
import time
from typing import Any, Callable
import zstandard as zstd
logger = logging.getLogger(__name__)
# Shared zstd (de)compressors; level 12 trades CPU for smaller payloads on the wire.
compresser = zstd.ZstdCompressor(level=12)
decompresser = zstd.ZstdDecompressor()
def _pack(data: Any) -> bytes:
    """Serialize `data` with msgpack (numpy-aware via msgpack_numpy) and zstd-compress it."""
    return compresser.compress(msgpack.packb(data, default=m.encode, use_bin_type=True))
def _unpack(data: bytes) -> Any:
    """Inverse of `_pack`: zstd-decompress then msgpack-decode (restoring numpy arrays)."""
    return msgpack.unpackb(
        decompresser.decompress(data), object_hook=m.decode, raw=False
    )
class Server:
    """Minimal ZeroMQ REP server that dispatches named commands to handlers.

    Wire protocol (compressed msgpack, see `_pack`/`_unpack`):
      request:  {"command": str, "data": Any}
      response: {"status": "ok" | "error", "data": result or error message}
    """
    def __init__(self, host: str = "*", port: int = 5555):
        self.host = host
        self.port = port
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REP)
        self.socket.bind(f"tcp://{self.host}:{self.port}")
        logger.info(f"Server started at tcp://{self.host}:{self.port}")
        # Maps command name -> handler callable.
        self.endpoints: dict[str, Callable[[Any], Any]] = {}
    def register_endpoint(self, command: str, func: Callable[[Any], Any]):
        """Register `func` as the handler for `command` (overwrites any existing one)."""
        self.endpoints[command] = func
        logger.info(f"Registered endpoint: {command} -> {func}")
    def return_error(self, message: str) -> None:
        # A REP socket must answer every request, even failures.
        self.socket.send(_pack({"status": "error", "data": message}))
    def return_ok(self, data: Any) -> None:
        self.socket.send(_pack({"status": "ok", "data": data}))
    def handle_once(self) -> None:
        """Receive one request, dispatch it, and send exactly one reply."""
        message = self.socket.recv()
        message = _unpack(message)
        cmd = message.get("command")
        data = message.get("data")
        logger.info("Received Command: %s", cmd)
        handler = self.endpoints.get(cmd)
        if handler is not None:
            try:
                # Handlers may be zero-arg; only pass data when it was provided.
                if data is None:
                    response = handler()
                else:
                    response = handler(data)
                self.return_ok(response)
            except Exception as e:
                # Handler failures are reported to the client, not fatal to the server.
                logger.error(f"Error handling command {cmd}: {e}")
                self.return_error(str(e))
        else:
            logger.warning(f"Unknown command: {cmd}")
            self.return_error(f"Unknown command: {cmd}")
    def loop_forever(self):
        """Serve requests until KeyboardInterrupt, then close the socket and context."""
        try:
            while True:
                self.handle_once()
        except KeyboardInterrupt:
            logger.info("Server shutting down...")
        finally:
            self.socket.close()
            self.context.term()
class Client:
    """Blocking ZeroMQ REQ client matching the Server's request/response protocol."""
    def __init__(self, host: str = "localhost", port: int = 5555):
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REQ)
        self.socket.connect(f"tcp://{host}:{port}")
        logger.info(f"Client connected to tcp://{host}:{port}")
    def call_endpoint(self, command: str, data=None):
        """Send `command` with an optional payload and return the server's data.

        Raises:
            Exception: when the server replies with a non-"ok" status; the
                exception message carries the server-side error text.
        """
        self.socket.send(_pack({"command": command, "data": data}))
        message = self.socket.recv()
        message = _unpack(message)
        if message.get("status") == "ok":
            return message.get("data")
        else:
            logger.error(f"Error from server: {message.get('data')}")
            raise Exception(f"Error from server: {message.get('data')}")
def freq_control(freq: int = 25):
    """Decorator factory that throttles calls to at most `freq` Hz.

    After the wrapped function returns, sleeps for whatever remains of the
    1/freq period, so back-to-back calls run at a steady rate. The wrapped
    function's return value is passed through unchanged.

    Args:
        freq: Target call frequency in Hz.

    Returns:
        A decorator that preserves the wrapped function's metadata.
    """
    import functools

    def decorator(func):
        # functools.wraps keeps func's __name__/__doc__ intact on the wrapper
        # (the original decorator lost them, which breaks logging/introspection).
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            result = func(*args, **kwargs)
            elapsed_time = time.time() - start_time
            # logger.info(f"'{func.__name__}' tooks {elapsed_time * 1000:.2f} ms")
            # Sleep off the remainder of the period; never a negative sleep.
            time.sleep(max(0, (1.0 / freq) - elapsed_time))
            return result
        return wrapper
    return decorator
if __name__ == "__main__":
    import sys
    from time import sleep
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )
    # Walrus assignment binds `mode` while validating argv in one expression.
    assert (len(sys.argv) == 2) and ((mode := sys.argv[1]) in ("server", "client")), (
        "Usage: python service.py [server|client]"
    )
    ## Protocol:
    # Request: { "command": str, "data": Any }
    # Response: { "status": "ok" | "error", "data": Any if status=="ok" else str (ErrorMsg) }
    if mode == "server":
        # Demo server exposing three trivial endpoints.
        server = Server()
        server.register_endpoint("ping", lambda: "pong")
        server.register_endpoint("echo", lambda x: x)
        server.register_endpoint("add", lambda data: data["a"] + data["b"])
        server.loop_forever()
    elif mode == "client":
        # Demo client: exercises each endpoint in a loop until an error occurs.
        client = Client()
        while True:
            try:
                response = client.call_endpoint("ping")
                print(f"Response from server: {response}")
                response = client.call_endpoint("echo", "Hello, World!")
                print(f"Response from server: {response}")
                response = client.call_endpoint("add", {"a": 5, "b": 10})
                print(f"Response from server: {response}")
                sleep(0.2)
            except Exception as e:
                print(f"Error: {e}")
                break

View File

@ -0,0 +1,71 @@
common:
# The number of historical images
img_history_size: 2
# The number of future actions to predict
action_chunk_size: 64
# The number of cameras to be used in the model
num_cameras: 3
# Dimension for state/action, we use the same space for both state and action
# This MUST be equal to configs/state_vec.py
state_dim: 128
dataset:
# We will extract the data from raw dataset
# and store them in the disk buffer by producer
# When training, we will read the data
# randomly from the buffer by consumer
# The producer will replace the data which has been
# read by the consumer with new data
# The path to the buffer (at least 400GB)
buf_path: /path/to/buffer
# The number of chunks in the buffer
buf_num_chunks: 512
# The number of samples (step rather than episode) in each chunk
buf_chunk_size: 512
# We will filter the episodes with length less than `epsd_len_thresh_low`
epsd_len_thresh_low: 32
# For those more than `epsd_len_thresh_high`,
# we will randomly sample `epsd_len_thresh_high` steps each time we load the episode
# to better balance the training datasets
epsd_len_thresh_high: 2048
# How to fit the image size
image_aspect_ratio: pad
# Maximum number of language tokens
tokenizer_max_length: 1024
model:
# Config for condition adaptors
lang_adaptor: mlp2x_gelu
img_adaptor: mlp2x_gelu
state_adaptor: mlp3x_gelu
lang_token_dim: 4096
img_token_dim: 1152
# Dim of action or proprioception vector
# A `state` refers to an action or a proprioception vector
state_token_dim: 128
# Config for RDT structure
rdt:
# 1B: num_head 32 hidden_size 2048
hidden_size: 2048
depth: 28
num_heads: 32
cond_pos_embed_type: multimodal
# For noise scheduler
noise_scheduler:
type: ddpm
num_train_timesteps: 1000
num_inference_timesteps: 5
beta_schedule: squaredcos_cap_v2 # Critical choice
prediction_type: sample
clip_sample: False
# For EMA (params averaging)
# We do not use EMA currently
ema:
update_after_step: 0
inv_gamma: 1.0
power: 0.75
min_value: 0.0
max_value: 0.9999

View File

@ -0,0 +1,50 @@
{
"A": [
[
-0.2691913843154907,
-0.21995729207992554,
-0.182277649641037
],
[
0.35127854347229004,
0.2769763469696045,
0.17159393429756165
]
],
"B": [
[
-0.2576896846294403,
-0.22244493663311005,
-0.20557966828346252
],
[
0.32854634523391724,
0.2922680974006653,
0.17373555898666382
]
],
"C": [
[
-0.29205888509750366,
-0.24688798189163208,
-0.17577645182609558
],
[
0.25053921341896057,
0.3277084231376648,
0.16431939601898193
]
],
"D": [
[
-0.25131964683532715,
-0.15233077108860016,
-0.13294968008995056
],
[
0.19209328293800354,
0.19344553351402283,
0.1370421051979065
]
]
}

View File

@ -0,0 +1,65 @@
{
"fractal20220817_data": 3,
"taco_play": 15,
"jaco_play": 10,
"berkeley_cable_routing": 10,
"nyu_door_opening_surprising_effectiveness": 3,
"viola": 20,
"berkeley_autolab_ur5": 5,
"toto": 30,
"kuka": 10,
"language_table": 10,
"columbia_cairlab_pusht_real": 10,
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 20,
"nyu_rot_dataset_converted_externally_to_rlds":3,
"stanford_hydra_dataset_converted_externally_to_rlds": 10,
"austin_buds_dataset_converted_externally_to_rlds": 20,
"nyu_franka_play_dataset_converted_externally_to_rlds": 3,
"maniskill_dataset_converted_externally_to_rlds": 20,
"furniture_bench_dataset_converted_externally_to_rlds": 10,
"ucsd_kitchen_dataset_converted_externally_to_rlds": 2,
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 3,
"austin_sailor_dataset_converted_externally_to_rlds": 20,
"austin_sirius_dataset_converted_externally_to_rlds": 20,
"bc_z": 10,
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 10,
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 10,
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
"utokyo_xarm_bimanual_converted_externally_to_rlds": 10,
"berkeley_mvp_converted_externally_to_rlds": 5,
"berkeley_rpt_converted_externally_to_rlds": 30,
"kaist_nonprehensile_converted_externally_to_rlds": 10,
"stanford_mask_vit_converted_externally_to_rlds": 0,
"tokyo_u_lsmo_converted_externally_to_rlds": 10,
"dlr_sara_pour_converted_externally_to_rlds": 10,
"dlr_sara_grid_clamp_converted_externally_to_rlds": 10,
"dlr_edan_shared_control_converted_externally_to_rlds": 5,
"asu_table_top_converted_externally_to_rlds": 12.5,
"stanford_robocook_converted_externally_to_rlds": 5,
"eth_agent_affordances": 66.6,
"imperialcollege_sawyer_wrist_cam": 10,
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 20,
"uiuc_d3field": 1,
"utaustin_mutex": 20,
"berkeley_fanuc_manipulation": 10,
"cmu_play_fusion": 5,
"cmu_stretch": 10,
"berkeley_gnm_recon": 3,
"berkeley_gnm_cory_hall": 5,
"berkeley_gnm_sac_son": 10,
"robo_net": 1,
"roboturk_real_towercreation": 10,
"roboturk_real_laundrylayout": 10,
"roboturk_real_objectsearch": 10,
"aloha_mobile": 50,
"aloha_static": 50,
"roboset": 5,
"droid": 15,
"fmb": 10,
"dobbe": 30,
"qut_dexterous_manpulation": 30,
"agilex": 25,
"rh20t": 10,
"calvin": 30,
"bridgev2": 5
}

View File

@ -0,0 +1,575 @@
{
"fractal20220817_data": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[
1,0,0,0
]
},
"taco_play": {
"image_keys": [
"rgb_static",
"rgb_gripper",
"rgb_static",
"rgb_static"
],
"image_mask":[
1,1,0,0
]
},
"jaco_play": {
"image_keys": [
"image",
"image_wrist",
"image_wrist",
"image_wrist"
],
"image_mask":[
1,1,0,0
]
},
"berkeley_cable_routing": {
"image_keys": [
"image",
"wrist45_image",
"wrist225_image",
"top_image"
],
"image_mask":[1,1,0,1]
},
"nyu_door_opening_surprising_effectiveness": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"viola": {
"image_keys": [
"agentview_rgb",
"eye_in_hand_rgb",
"eye_in_hand_rgb",
"eye_in_hand_rgb"
],
"image_mask":[1,1,0,0]
},
"berkeley_autolab_ur5": {
"image_keys": [
"image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[1,1,0,0]
},
"toto": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"kuka": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"language_table": {
"image_keys": [
"rgb",
"rgb",
"rgb",
"rgb"
],
"image_mask":[1,0,0,0]
},
"columbia_cairlab_pusht_real": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"nyu_rot_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_hydra_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"austin_buds_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"nyu_franka_play_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image_additional_view",
"image_additional_view",
"image_additional_view"
],
"image_mask":[1,0,0,1]
},
"maniskill_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"furniture_bench_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"ucsd_kitchen_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"austin_sailor_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"austin_sirius_dataset_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"bc_z": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": {
"image_keys": [
"image",
"hand_image",
"hand_image",
"image2"
],
"image_mask":[1,1,0,1]
},
"utokyo_xarm_bimanual_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_mvp_converted_externally_to_rlds": {
"image_keys": [
"hand_image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[0,1,0,0]
},
"berkeley_rpt_converted_externally_to_rlds": {
"image_keys": [
"hand_image",
"hand_image",
"hand_image",
"hand_image"
],
"image_mask":[0,1,0,0]
},
"kaist_nonprehensile_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_mask_vit_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"tokyo_u_lsmo_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_sara_pour_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_sara_grid_clamp_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"dlr_edan_shared_control_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"asu_table_top_converted_externally_to_rlds": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"stanford_robocook_converted_externally_to_rlds": {
"image_keys": [
"image_2",
"image_1",
"image_3",
"image_4"
],
"image_mask":[1,0,0,1]
},
"eth_agent_affordances": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"imperialcollege_sawyer_wrist_cam": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[0,1,0,0]
},
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"uiuc_d3field": {
"image_keys": [
"image_1",
"image_2",
"image_3",
"image_4"
],
"image_mask":[1,0,0,1]
},
"utaustin_mutex": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"berkeley_fanuc_manipulation": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"cmu_play_fusion": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"cmu_stretch": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_recon": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_cory_hall": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"berkeley_gnm_sac_son": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"robo_net": {
"image_keys": [
"image",
"image1",
"image2",
"image2"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_towercreation": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_laundrylayout": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"roboturk_real_objectsearch": {
"image_keys": [
"top_rgb_frame",
"front_rgb_frame",
"front_rgb_frame",
"front_rgb_frame"
],
"image_mask":[1,0,0,1]
},
"aloha_mobile": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_right_wrist"
],
"image_mask":[1,1,1,0]
},
"aloha_static": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_low"
],
"image_mask":[1,1,1,1]
},
"roboset": {
"image_keys": [
"rgb_top",
"rgb_right",
"rgb_left",
"rgb_right"
],
"image_mask":[1,1,1,0]
},
"droid": {
"image_keys": [
"exterior_image_1_left",
"wrist_image_left",
"wrist_image_left",
"exterior_image_2_left"
],
"image_mask":[1,1,0,1]
},
"fmb": {
"image_keys": [
"image_side_1",
"image_wrist_1",
"image_wrist_1",
"image_side_2"
],
"image_mask":[1,1,0,1]
},
"dobbe": {
"image_keys": [
"wrist_image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[0,1,0,0]
},
"qut_dexterous_manpulation": {
"image_keys": [
"image",
"wrist_image",
"wrist_image",
"wrist_image"
],
"image_mask":[1,1,0,0]
},
"agilex": {
"image_keys": [
"cam_high",
"cam_right_wrist",
"cam_left_wrist",
"cam_right_wrist"
],
"image_mask":[1,1,1,0]
},
"rh20t": {
"image_keys": [
"image",
"image",
"image",
"image"
],
"image_mask":[1,0,0,0]
},
"calvin": {
"image_keys": [
"rgb_static",
"rgb_gripper",
"rgb_gripper",
"rgb_gripper"
],
"image_mask":[1,1,0,0]
},
"bridgev2": {
"image_keys": [
"images0",
"images0",
"images0",
"images0"
],
"image_mask":[1,0,0,0]
}
}

View File

@ -0,0 +1,525 @@
{
"agilex": {
"dataset_name": "agilex",
"state_mean": [
-0.0036545392947090432,
-0.2773659935760079,
0.3147616748061523,
0.3813313179910183,
0.04028575944090457,
0.034888520819083294,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_std": [
0.05763674563578847,
0.2580181064167735,
0.19785840483767897,
0.05020347749331385,
0.054529239104671424,
0.05020521339363586,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_min": [
-0.17447535196940103,
-0.5522612677680121,
-0.3340397516886393,
0.21861712137858072,
-0.09725829230414497,
0.003396739231215583,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"state_max": [
0.21961932712131077,
0.30613206227620443,
0.5444545321994357,
0.4866888682047526,
0.31486290825737845,
0.3355223337809245,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
]
}
}

View File

@ -0,0 +1,3 @@
[
"agilex"
]

View File

@ -0,0 +1,3 @@
{
"agilex": 100
}

View File

@ -0,0 +1,48 @@
[
"fractal20220817_data",
"jaco_play",
"taco_play",
"berkeley_cable_routing",
"viola",
"berkeley_autolab_ur5",
"toto",
"nyu_door_opening_surprising_effectiveness",
"columbia_cairlab_pusht_real",
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds",
"austin_buds_dataset_converted_externally_to_rlds",
"kuka",
"utokyo_xarm_bimanual_converted_externally_to_rlds",
"stanford_hydra_dataset_converted_externally_to_rlds",
"maniskill_dataset_converted_externally_to_rlds",
"ucsd_kitchen_dataset_converted_externally_to_rlds",
"ucsd_pick_and_place_dataset_converted_externally_to_rlds",
"austin_sailor_dataset_converted_externally_to_rlds",
"austin_sirius_dataset_converted_externally_to_rlds",
"bc_z",
"utokyo_pr2_opening_fridge_converted_externally_to_rlds",
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds",
"utokyo_xarm_pick_and_place_converted_externally_to_rlds",
"berkeley_mvp_converted_externally_to_rlds",
"berkeley_rpt_converted_externally_to_rlds",
"kaist_nonprehensile_converted_externally_to_rlds",
"tokyo_u_lsmo_converted_externally_to_rlds",
"dlr_sara_grid_clamp_converted_externally_to_rlds",
"stanford_robocook_converted_externally_to_rlds",
"imperialcollege_sawyer_wrist_cam",
"iamlab_cmu_pickup_insert_converted_externally_to_rlds",
"utaustin_mutex",
"berkeley_fanuc_manipulation",
"cmu_play_fusion",
"language_table",
"furniture_bench_dataset_converted_externally_to_rlds",
"droid",
"fmb",
"dobbe",
"qut_dexterous_manpulation",
"aloha_mobile",
"aloha_static",
"roboset",
"rh20t",
"calvin",
"bridgev2"
]

View File

@ -0,0 +1,48 @@
{
"fractal20220817_data": 271,
"taco_play": 60,
"jaco_play": 33,
"berkeley_cable_routing": 8,
"nyu_door_opening_surprising_effectiveness": 10,
"viola": 12,
"berkeley_autolab_ur5": 32,
"toto": 32,
"kuka": 50,
"language_table": 100,
"columbia_cairlab_pusht_real": 12,
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 55,
"stanford_hydra_dataset_converted_externally_to_rlds": 24,
"austin_buds_dataset_converted_externally_to_rlds": 7,
"maniskill_dataset_converted_externally_to_rlds": 174,
"furniture_bench_dataset_converted_externally_to_rlds": 71,
"ucsd_kitchen_dataset_converted_externally_to_rlds": 12,
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": 37,
"austin_sailor_dataset_converted_externally_to_rlds": 15,
"austin_sirius_dataset_converted_externally_to_rlds": 24,
"bc_z": 208,
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": 9,
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 15,
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
"utokyo_xarm_bimanual_converted_externally_to_rlds": 1,
"berkeley_mvp_converted_externally_to_rlds": 22,
"berkeley_rpt_converted_externally_to_rlds": 30,
"kaist_nonprehensile_converted_externally_to_rlds": 14,
"tokyo_u_lsmo_converted_externally_to_rlds": 7,
"dlr_sara_grid_clamp_converted_externally_to_rlds": 1,
"stanford_robocook_converted_externally_to_rlds": 50,
"imperialcollege_sawyer_wrist_cam": 13,
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": 25,
"utaustin_mutex": 39,
"berkeley_fanuc_manipulation": 20,
"cmu_play_fusion": 24,
"droid": 303,
"fmb": 42,
"dobbe": 36,
"qut_dexterous_manpulation": 14,
"aloha_mobile": 150,
"aloha_static": 150,
"roboset": 135,
"rh20t": 331,
"calvin": 100,
"bridgev2": 224
}

View File

@ -0,0 +1,126 @@
# Name -> index table for the unified 128-dim state/action vector.
# Layout: right arm occupies [0, 45) (every right-arm entry also has an
# unprefixed alias for the single-arm convention), [45, 50) reserved,
# left arm occupies [50, 95) with the same internal layout, [95, 100)
# reserved, mobile base occupies [100, 103), and [103, 128) is reserved.


def _fill_arm(mapping, prefixes, base):
    """Populate one arm's slice [base, base+45) of the state vector.

    Each logical entry is written once per prefix in `prefixes`, so the
    right arm gets both an unprefixed and a "right_"-prefixed key while
    the left arm gets only "left_"-prefixed keys.
    """
    # [base, base+10): arm joint positions.
    for p in prefixes:
        for j in range(10):
            mapping[f"{p}arm_joint_{j}_pos"] = base + j
    # [base+10, base+15): gripper joint positions.
    for p in prefixes:
        for j in range(5):
            mapping[f"{p}gripper_joint_{j}_pos"] = base + 10 + j
    # Alias of gripper_joint_0_pos.
    for p in prefixes:
        mapping[f"{p}gripper_open"] = base + 10
    # [base+15, base+25): arm joint velocities.
    for p in prefixes:
        for j in range(10):
            mapping[f"{p}arm_joint_{j}_vel"] = base + 15 + j
    # [base+25, base+30): gripper joint velocities.
    for p in prefixes:
        for j in range(5):
            mapping[f"{p}gripper_joint_{j}_vel"] = base + 25 + j
    # Alias of gripper_joint_0_vel.
    for p in prefixes:
        mapping[f"{p}gripper_open_vel"] = base + 25
    # [base+30, base+33): end-effector position.
    for off, axis in enumerate("xyz"):
        for p in prefixes:
            mapping[f"{p}eef_pos_{axis}"] = base + 30 + off
    # [base+33, base+39): end-effector 6D pose.
    for k in range(6):
        for p in prefixes:
            mapping[f"{p}eef_angle_{k}"] = base + 33 + k
    # [base+39, base+42): end-effector linear velocities.
    for off, axis in enumerate("xyz"):
        for p in prefixes:
            mapping[f"{p}eef_vel_{axis}"] = base + 39 + off
    # [base+42, base+45): end-effector angular velocities.
    for off, axis in enumerate(("roll", "pitch", "yaw")):
        for p in prefixes:
            mapping[f"{p}eef_angular_vel_{axis}"] = base + 42 + off


STATE_VEC_IDX_MAPPING = {}
# Right arm at [0, 45); unprefixed names alias the right-side entries.
_fill_arm(STATE_VEC_IDX_MAPPING, ("", "right_"), 0)
# Left arm at [50, 95); [45, 50) and [95, 100) stay reserved.
_fill_arm(STATE_VEC_IDX_MAPPING, ("left_",), 50)
# [100, 102): base linear velocities; [102, 103): base angular velocity.
STATE_VEC_IDX_MAPPING["base_vel_x"] = 100
STATE_VEC_IDX_MAPPING["base_vel_y"] = 101
STATE_VEC_IDX_MAPPING["base_angular_vel"] = 102
# [103, 128): reserved.
STATE_VEC_LEN = 128

View File

@ -0,0 +1,14 @@
{
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 2,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9
}
}

2
RDT/rdt170m-run/data/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
# Ignore data files
datasets

Some files were not shown because too many files have changed in this diff Show More