#!/usr/bin/env python3
"""
LeRobot-to-RDT data conversion script.

LeRobot robot structure:
- 5 joints (shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll)
- 1 gripper
- total: 6 degrees of freedom (6DOF)

Dimension mapping (matches the RDT training code):
- left_arm_dim = 0 (single-arm robot, the left arm does not exist)
- right_arm_dim = 6 (5 joints + 1 gripper, mapped onto RDT's right_arm slots)
- state vector: 6-dim [joint1, joint2, joint3, joint4, joint5, gripper]
- RDT index mapping: right_arm_joint_0_pos .. right_arm_joint_5_pos (indices 0-5)
"""

import argparse
import json
import os
import subprocess
import sys
from pathlib import Path

import cv2
import h5py
import numpy as np
import pandas as pd
import torch
import yaml

# Make the parent package importable so the shared T5 encoder can be used.
current_dir = os.path.dirname(__file__)
sys.path.append(os.path.join(current_dir, ".."))
from models.multimodal_encoder.t5_encoder import T5Embedder
def extract_frames_from_video(video_path, output_dir, episode_idx):
    """Extract all frames from a video file using ffmpeg.

    Frames are decoded at 30 fps into a temporary JPEG directory under
    `output_dir`, read back with OpenCV, resized to 640x480, and returned
    as a list of BGR image arrays. Returns [] when the video is missing
    or extraction fails.
    """
    if not os.path.exists(video_path):
        print(f" No video file: {video_path}")
        return []

    temp_dir = os.path.join(output_dir, f"temp_frames_{episode_idx}")
    os.makedirs(temp_dir, exist_ok=True)

    output_pattern = os.path.join(temp_dir, "frame_%04d.jpg")

    try:
        # Fix: '-y' must precede the output file; as a trailing option ffmpeg
        # ignores it, so stale frames from a previous run were never overwritten.
        cmd = [
            'ffmpeg', '-y',
            '-i', video_path,
            '-vf', 'fps=30',
            '-q:v', '2',
            output_pattern,
        ]

        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print(f" Failed to extract frames with ffmpeg: {result.stderr}")
            return []

        frames = []
        frame_files = sorted(f for f in os.listdir(temp_dir) if f.endswith('.jpg'))

        for frame_file in frame_files:
            frame = cv2.imread(os.path.join(temp_dir, frame_file))
            if frame is not None:
                frames.append(cv2.resize(frame, (640, 480)))

        print(f" Successfully extracted {len(frames)} frames")
        return frames

    except Exception as e:
        print(f" Error extracting frames: {e}")
        return []
    finally:
        # Clean up the scratch directory on every path (the original leaked it
        # when ffmpeg failed or an exception was raised mid-extraction).
        if os.path.isdir(temp_dir):
            for leftover in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, leftover))
            os.rmdir(temp_dir)
def load_lerobot_episode(data_dir, episode_idx, output_dir, cam_high_key="high", cam_right_wrist_key="arm"):
    """Load a single LeRobot episode.

    LeRobot data layout:
    - action: 6-dim [shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll, gripper]
    - observation.state: 6-dim, same ordering as action
    - images: one overhead camera plus one arm-mounted camera

    Returns a dict with the action/qpos arrays, both camera frame lists
    aligned to the episode length, and the episode length itself — or None
    when the parquet file for this episode does not exist.
    """
    parquet_path = os.path.join(data_dir, "data/chunk-000", f"episode_{episode_idx:06d}.parquet")
    if not os.path.exists(parquet_path):
        print(f"Episode {episode_idx} parquet file does not exist: {parquet_path}")
        return None

    df = pd.read_parquet(parquet_path)

    def _to_f32(value):
        # Normalize a parquet cell (ndarray or sequence) to a float32 vector.
        if isinstance(value, np.ndarray):
            return value.astype(np.float32)
        return np.array(value, dtype=np.float32)

    actions = [_to_f32(df['action'].iloc[row]) for row in range(len(df))]
    qpos = [_to_f32(df['observation.state'].iloc[row]) for row in range(len(df))]

    high_cam_path = os.path.join(data_dir, f"videos/chunk-000/observation.images.{cam_high_key}", f"episode_{episode_idx:06d}.mp4")
    arm_cam_path = os.path.join(data_dir, f"videos/chunk-000/observation.images.{cam_right_wrist_key}", f"episode_{episode_idx:06d}.mp4")

    print(f" Extracting high camera frames...")
    high_images = extract_frames_from_video(high_cam_path, output_dir, episode_idx)

    print(f" Extracting arm camera frames...")
    arm_images = extract_frames_from_video(arm_cam_path, output_dir, episode_idx)

    # Align both camera streams with the state/action timeline: drop any
    # extra frames, then repeat the final frame if the video came up short
    # (an empty stream stays empty).
    target_frames = len(df)
    high_images = high_images[:target_frames]
    arm_images = arm_images[:target_frames]
    while high_images and len(high_images) < target_frames:
        high_images.append(high_images[-1])
    while arm_images and len(arm_images) < target_frames:
        arm_images.append(arm_images[-1])

    return {
        'actions': np.array(actions),
        'qpos': np.array(qpos),
        'high_images': high_images,
        'arm_images': arm_images,
        'episode_length': len(df),
    }
def images_encoding(imgs):
    """JPEG-encode a list of images for fixed-length HDF5 storage.

    Each image is encoded with cv2.imencode; all byte strings are then
    null-padded to the longest encoding so they can be stored in an HDF5
    dataset of dtype S{max_len}. Images that fail to encode become empty
    (all-padding) entries so indices stay aligned with `imgs`.

    Returns (padded_jpeg_bytes, max_len); ([], 0) for an empty input list.
    """
    if not imgs:
        return [], 0

    encode_data = []
    max_len = 0

    for i in range(len(imgs)):
        success, encoded_image = cv2.imencode(".jpg", imgs[i])
        if success:
            jpeg_data = encoded_image.tobytes()
            encode_data.append(jpeg_data)
            max_len = max(max_len, len(jpeg_data))
        else:
            # Placeholder keeps positions aligned with the input frames.
            print(f" Image encoding failed: {i}")
            encode_data.append(b"")

    # Fix: return the padded byte strings. The original built `padded_data`
    # and then returned the unpadded `encode_data`, discarding the padding
    # it had just computed.
    padded_data = [data.ljust(max_len, b"\0") for data in encode_data]

    return padded_data, max_len
def load_task_instructions(data_dir):
    """Read the task instruction strings from meta/tasks.jsonl.

    Each non-blank line is a JSON object with a "task" field. Returns the
    list of instruction strings, or None when the file is missing.
    """
    tasks_file = os.path.join(data_dir, "meta/tasks.jsonl")
    if not os.path.exists(tasks_file):
        print(f"Warning: tasks file not found: {tasks_file}")
        return None

    instructions = []
    with open(tasks_file, 'r') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if stripped:
                instructions.append(json.loads(stripped)["task"])

    print(f" 加载了 {len(instructions)} 个任务指令")
    return instructions
def encode_language_instruction(instruction_text, t5_embedder, device):
    """Encode one instruction string into per-token T5 embeddings.

    Keeps only the tokens marked valid by the attention mask and returns
    them as a float numpy array; on any failure, falls back to a zero
    (1, 4096) array so downstream saving still works.
    """
    try:
        embeds, mask = t5_embedder.get_text_embeddings([instruction_text])
        return embeds[0][mask[0]].float().cpu().numpy()
    except Exception as e:
        print(f" Language encoding failed: {e}")
        return np.zeros((1, 4096))
def convert_lerobot_to_rdt(data_dir, output_dir, episode_num, gpu=0, no_language=False, t5_path=None, cam_high_key="high", cam_right_wrist_key="arm"):
    """Convert `episode_num` LeRobot episodes under `data_dir` into RDT-format
    HDF5 files under `output_dir`.

    For each episode i this writes `output_dir/episode_i/episode_i.hdf5`
    containing actions, qpos, JPEG-encoded camera streams, and per-timestep
    arm-dimension metadata. Unless `no_language` is set (or T5 setup fails),
    it also saves a T5 embedding of the first task instruction as
    `episode_i/instructions/lang_embed_0.pt`.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print(f"Start converting LeRobot data to RDT format...")
    print(f"Data source: {data_dir}")
    print(f"Output directory: {output_dir}")
    print(f"Processing episode number: {episode_num}")
    print(f"GPU device: {gpu}")

    # NOTE(review): scene_name is computed but never used below.
    scene_name = os.path.basename(data_dir)

    # Task instructions are optional; language processing is skipped entirely
    # when --no_language is given or the tasks file is absent.
    instructions = None
    if not no_language:
        instructions = load_task_instructions(data_dir)

    # Initialize the T5 text encoder lazily; on failure fall back to a
    # no-language conversion rather than aborting the whole run.
    t5_embedder = None
    if not no_language and instructions:
        try:
            print(f" Initializing T5 encoder...")
            t5_embedder = T5Embedder(
                from_pretrained=t5_path,
                device=f"cuda:{gpu}" if torch.cuda.is_available() else "cpu",
                model_max_length=1024,
                use_offload_folder=None,
            )
            print(f" T5 encoder initialized successfully")
        except Exception as e:
            print(f" T5 encoder initialization failed: {e}")
            print(f" Will skip language processing")
            no_language = True

    for i in range(episode_num):
        print(f"Processing episode {i}...")

        episode_data = load_lerobot_episode(data_dir, i, output_dir, cam_high_key=cam_high_key, cam_right_wrist_key=cam_right_wrist_key)
        if episode_data is None:
            # Missing parquet file: skip this episode and continue with the rest.
            print(f"Skipping episode {i}")
            continue

        episode_output_dir = os.path.join(output_dir, f"episode_{i}")
        if not os.path.exists(episode_output_dir):
            os.makedirs(episode_output_dir)

        hdf5_path = os.path.join(episode_output_dir, f"episode_{i}.hdf5")

        with h5py.File(hdf5_path, "w") as f:
            f.create_dataset("action", data=episode_data['actions'])

            obs = f.create_group("observations")
            obs.create_dataset("qpos", data=episode_data['qpos'])

            image = obs.create_group("images")

            # Camera streams are stored as fixed-length byte strings
            # (dtype S{max_len}) of JPEG data, one entry per frame.
            if episode_data['high_images']:
                print(f" Encoding high camera images...")
                high_enc, len_high = images_encoding(episode_data['high_images'])
                if high_enc and len_high > 0:
                    image.create_dataset("cam_high", data=high_enc, dtype=f"S{len_high}")
                    print(f" Saved high camera images: {len(episode_data['high_images'])} frames")
                else:
                    print(f" Warning: High camera images encoding failed")

            if episode_data['arm_images']:
                print(f" Encoding arm camera images...")
                arm_enc, len_arm = images_encoding(episode_data['arm_images'])
                if arm_enc and len_arm > 0:
                    image.create_dataset("cam_right_wrist", data=arm_enc, dtype=f"S{len_arm}")
                    print(f" Saved arm camera images: {len(episode_data['arm_images'])} frames")
                else:
                    print(f" Warning: Arm camera images encoding failed")

            # Robot dimension metadata (LeRobot: 5 joints + 1 gripper).
            # Following the process_data.py convention, the dimension info is
            # recorded once per timestep. LeRobot is a single-arm robot mapped
            # to RDT's right arm: right arm = 6 dims (5 joints + gripper),
            # left arm = 0 dims (absent).
            left_arm_dim = [0] * len(episode_data['actions'])
            right_arm_dim = [6] * len(episode_data['actions'])

            obs.create_dataset("left_arm_dim", data=np.array(left_arm_dim))
            obs.create_dataset("right_arm_dim", data=np.array(right_arm_dim))

        print(f" Episode {i} converted successfully: {hdf5_path}")
        print(f" Data length: {episode_data['episode_length']}")
        print(f" Action shape: {episode_data['actions'].shape}")
        print(f" Qpos shape: {episode_data['qpos'].shape}")
        print(f" High camera frames: {len(episode_data['high_images'])}")
        print(f" Arm camera frames: {len(episode_data['arm_images'])}")

        if not no_language and t5_embedder and instructions:
            print(f" Processing language instructions...")
            try:
                # NOTE(review): only the first instruction is embedded, even if
                # tasks.jsonl holds several — confirm this is intended.
                instruction = instructions[0]

                language_features = encode_language_instruction(instruction, t5_embedder, f"cuda:{gpu}")

                instructions_dir = os.path.join(episode_output_dir, "instructions")
                if not os.path.exists(instructions_dir):
                    os.makedirs(instructions_dir)

                lang_embed_path = os.path.join(instructions_dir, "lang_embed_0.pt")
                torch.save(torch.from_numpy(language_features), lang_embed_path)

                print(f" Language instruction encoded successfully: {instruction}")
                print(f" Language features saved to: {lang_embed_path}")
                print(f" Language features shape: {language_features.shape}, data type: {language_features.dtype}")

            except Exception as e:
                print(f" Language instruction processing failed: {e}")

    print(f"\nConversion completed! Processed {episode_num} episodes")
    print(f"Output directory: {output_dir}")
def main():
    """CLI entry point: validate inputs, check ffmpeg, then run the conversion.

    Returns None in all cases; validation failures print an error and return
    early instead of raising.
    """
    parser = argparse.ArgumentParser(description="Convert LeRobot data to RDT format")
    parser.add_argument("--data_dir", type=str, required=True,
                        help="LeRobot data directory path")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="Output directory path")
    parser.add_argument("--episode_num", type=int, default=10,
                        help="Number of episodes to process")
    parser.add_argument("--gpu", type=int, default=0,
                        help="GPU device ID")
    parser.add_argument("--no_language", action="store_true",
                        help="Skip language processing")
    parser.add_argument("--cam_high_key", type=str, default="cam_high",
                        help="High camera key")
    parser.add_argument("--cam_right_wrist_key", type=str, default="cam_right_wrist",
                        help="Right wrist camera key")
    # NOTE(review): accepted for CLI compatibility but never forwarded to the
    # converter — LeRobot is single-arm, so there is no left-wrist stream.
    parser.add_argument("--cam_left_wrist_key", type=str, default="cam_left_wrist",
                        help="Left wrist camera key")
    parser.add_argument("--t5_path", type=str, required=True,
                        help="T5 model path")

    args = parser.parse_args()

    if not os.path.exists(args.data_dir):
        print(f"Error: Data directory does not exist: {args.data_dir}")
        return

    meta_file = os.path.join(args.data_dir, "meta/info.json")
    if not os.path.exists(meta_file):
        print(f"Error: Meta information file not found: {meta_file}")
        return

    # Frame extraction shells out to ffmpeg, so fail fast if it is missing.
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        print("ffmpeg is available, will use ffmpeg to extract video frames")
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Warning: ffmpeg is not available, image data may not be extracted correctly")
        print("Please install ffmpeg: conda install -c conda-forge ffmpeg=6.1")
        return

    # Fix: info.json is a JSON file — parse it with json.load instead of
    # yaml.safe_load (which only happened to work because YAML is a superset
    # of JSON).
    with open(meta_file, 'r') as f:
        meta_info = json.load(f)

    # Clamp the requested episode count to what the dataset actually contains.
    total_episodes = meta_info.get('total_episodes', 10)
    if args.episode_num > total_episodes:
        print(f"Warning: Requested episode number ({args.episode_num}) exceeds available number ({total_episodes})")
        args.episode_num = total_episodes

    convert_lerobot_to_rdt(
        args.data_dir,
        args.output_dir,
        args.episode_num,
        args.gpu,
        args.no_language,
        args.t5_path,
        args.cam_high_key,
        args.cam_right_wrist_key,
    )


if __name__ == "__main__":
    main()