d-robotics-rdt/lerobot2rdt/lerobot2rdt.py
2025-11-02 12:25:30 +08:00

369 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
LeRobot到RDT数据转换脚本
LeRobot机器人结构
- 5个关节 (shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll)
- 1个夹爪 (gripper)
- 总计6个自由度 (6DOF)
维度映射匹配RDT训练代码
- left_arm_dim = 0 (单臂机器人,左臂不存在)
- right_arm_dim = 6 (5关节 + 1夹爪映射到RDT的right_arm部分)
- 状态向量6维 [joint1, joint2, joint3, joint4, joint5, gripper]
- RDT索引映射right_arm_joint_0_pos到right_arm_joint_5_pos (索引0-5)
"""
import sys
import os
import h5py
import numpy as np
import cv2
import argparse
import yaml
import json
import subprocess
from pathlib import Path
import pandas as pd
import torch
current_dir = os.path.dirname(__file__)
sys.path.append(os.path.join(current_dir, ".."))
from models.multimodal_encoder.t5_encoder import T5Embedder
def extract_frames_from_video(video_path, output_dir, episode_idx):
if not os.path.exists(video_path):
print(f" No video file: {video_path}")
return []
temp_dir = os.path.join(output_dir, f"temp_frames_{episode_idx}")
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)
output_pattern = os.path.join(temp_dir, "frame_%04d.jpg")
try:
cmd = [
'ffmpeg', '-i', video_path,
'-vf', 'fps=30',
'-q:v', '2',
output_pattern,
'-y'
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f" Failed to extract frames with ffmpeg: {result.stderr}")
return []
frames = []
frame_files = sorted([f for f in os.listdir(temp_dir) if f.endswith('.jpg')])
for frame_file in frame_files:
frame_path = os.path.join(temp_dir, frame_file)
frame = cv2.imread(frame_path)
if frame is not None:
frame_resized = cv2.resize(frame, (640, 480))
frames.append(frame_resized)
print(f" Successfully extracted {len(frames)} frames")
for frame_file in frame_files:
os.remove(os.path.join(temp_dir, frame_file))
os.rmdir(temp_dir)
return frames
except Exception as e:
print(f" Error extracting frames: {e}")
return []
def load_lerobot_episode(data_dir, episode_idx, output_dir, cam_high_key="high", cam_right_wrist_key="arm"):
"""加载LeRobot的单个episode数据
LeRobot数据结构
- action: 6维 [shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll, gripper]
- observation.state: 6维 [shoulder_pan, shoulder_lift, elbow_flex, wrist_flex, wrist_roll, gripper]
- 图像: 高位相机 + 手臂相机
"""
parquet_path = os.path.join(data_dir, "data/chunk-000", f"episode_{episode_idx:06d}.parquet")
if not os.path.exists(parquet_path):
print(f"Episode {episode_idx} parquet file does not exist: {parquet_path}")
return None
df = pd.read_parquet(parquet_path)
actions = []
qpos = []
for i in range(len(df)):
action = df['action'].iloc[i]
state = df['observation.state'].iloc[i]
if isinstance(action, np.ndarray):
actions.append(action.astype(np.float32))
else:
actions.append(np.array(action, dtype=np.float32))
if isinstance(state, np.ndarray):
qpos.append(state.astype(np.float32))
else:
qpos.append(np.array(state, dtype=np.float32))
high_cam_path = os.path.join(data_dir, f"videos/chunk-000/observation.images.{cam_high_key}", f"episode_{episode_idx:06d}.mp4")
arm_cam_path = os.path.join(data_dir, f"videos/chunk-000/observation.images.{cam_right_wrist_key}", f"episode_{episode_idx:06d}.mp4")
print(f" Extracting high camera frames...")
high_images = extract_frames_from_video(high_cam_path, output_dir, episode_idx)
print(f" Extracting arm camera frames...")
arm_images = extract_frames_from_video(arm_cam_path, output_dir, episode_idx)
target_frames = len(df)
if len(high_images) > target_frames:
high_images = high_images[:target_frames]
if len(arm_images) > target_frames:
arm_images = arm_images[:target_frames]
while len(high_images) < target_frames and high_images:
high_images.append(high_images[-1])
while len(arm_images) < target_frames and arm_images:
arm_images.append(arm_images[-1])
return {
'actions': np.array(actions),
'qpos': np.array(qpos),
'high_images': high_images,
'arm_images': arm_images,
'episode_length': len(df)
}
def images_encoding(imgs):
if not imgs:
return [], 0
encode_data = []
padded_data = []
max_len = 0
for i in range(len(imgs)):
success, encoded_image = cv2.imencode(".jpg", imgs[i])
if success:
jpeg_data = encoded_image.tobytes()
encode_data.append(jpeg_data)
max_len = max(max_len, len(jpeg_data))
else:
print(f" Image encoding failed: {i}")
empty_data = b""
encode_data.append(empty_data)
for i in range(len(imgs)):
padded_data.append(encode_data[i].ljust(max_len, b"\0"))
return encode_data, max_len
def load_task_instructions(data_dir):
tasks_file = os.path.join(data_dir, "meta/tasks.jsonl")
if not os.path.exists(tasks_file):
print(f"Warning: tasks file not found: {tasks_file}")
return None
instructions = []
with open(tasks_file, 'r') as f:
for line in f:
if line.strip():
task_data = json.loads(line.strip())
instructions.append(task_data["task"])
print(f" 加载了 {len(instructions)} 个任务指令")
return instructions
def encode_language_instruction(instruction_text, t5_embedder, device):
try:
text_embeds, attn_mask = t5_embedder.get_text_embeddings([instruction_text])
valid_embeds = text_embeds[0][attn_mask[0]].float()
return valid_embeds.cpu().numpy()
except Exception as e:
print(f" Language encoding failed: {e}")
return np.zeros((1, 4096))
def convert_lerobot_to_rdt(data_dir, output_dir, episode_num, gpu=0, no_language=False, t5_path=None, cam_high_key="high", cam_right_wrist_key="arm"):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
print(f"Start converting LeRobot data to RDT format...")
print(f"Data source: {data_dir}")
print(f"Output directory: {output_dir}")
print(f"Processing episode number: {episode_num}")
print(f"GPU device: {gpu}")
scene_name = os.path.basename(data_dir)
instructions = None
if not no_language:
instructions = load_task_instructions(data_dir)
t5_embedder = None
if not no_language and instructions:
try:
print(f" Initializing T5 encoder...")
t5_embedder = T5Embedder(
from_pretrained=t5_path,
device=f"cuda:{gpu}" if torch.cuda.is_available() else "cpu",
model_max_length=1024,
use_offload_folder=None,
)
print(f" T5 encoder initialized successfully")
except Exception as e:
print(f" T5 encoder initialization failed: {e}")
print(f" Will skip language processing")
no_language = True
for i in range(episode_num):
print(f"Processing episode {i}...")
episode_data = load_lerobot_episode(data_dir, i, output_dir, cam_high_key=cam_high_key, cam_right_wrist_key=cam_right_wrist_key)
if episode_data is None:
print(f"Skipping episode {i}")
continue
episode_output_dir = os.path.join(output_dir, f"episode_{i}")
if not os.path.exists(episode_output_dir):
os.makedirs(episode_output_dir)
hdf5_path = os.path.join(episode_output_dir, f"episode_{i}.hdf5")
with h5py.File(hdf5_path, "w") as f:
f.create_dataset("action", data=episode_data['actions'])
obs = f.create_group("observations")
obs.create_dataset("qpos", data=episode_data['qpos'])
image = obs.create_group("images")
if episode_data['high_images']:
print(f" Encoding high camera images...")
high_enc, len_high = images_encoding(episode_data['high_images'])
if high_enc and len_high > 0:
image.create_dataset("cam_high", data=high_enc, dtype=f"S{len_high}")
print(f" Saved high camera images: {len(episode_data['high_images'])} frames")
else:
print(f" Warning: High camera images encoding failed")
if episode_data['arm_images']:
print(f" Encoding arm camera images...")
arm_enc, len_arm = images_encoding(episode_data['arm_images'])
if arm_enc and len_arm > 0:
image.create_dataset("cam_right_wrist", data=arm_enc, dtype=f"S{len_arm}")
print(f" Saved arm camera images: {len(episode_data['arm_images'])} frames")
else:
print(f" Warning: Arm camera images encoding failed")
# 添加机器人维度信息LeRobot: 5个关节 + 1个夹爪
# 根据process_data.py的逻辑每个时间步都需要记录维度信息
# LeRobot是单臂机器人只有右臂5个关节 + 1个夹爪 = 6维
# 左臂0维单臂机器人
# 为每个时间步记录维度信息
left_arm_dim = [0] * len(episode_data['actions']) # 左臂0维单臂机器人
right_arm_dim = [6] * len(episode_data['actions']) # 右臂6维5关节+1夹爪
obs.create_dataset("left_arm_dim", data=np.array(left_arm_dim))
obs.create_dataset("right_arm_dim", data=np.array(right_arm_dim))
print(f" Episode {i} converted successfully: {hdf5_path}")
print(f" Data length: {episode_data['episode_length']}")
print(f" Action shape: {episode_data['actions'].shape}")
print(f" Qpos shape: {episode_data['qpos'].shape}")
print(f" High camera frames: {len(episode_data['high_images'])}")
print(f" Arm camera frames: {len(episode_data['arm_images'])}")
if not no_language and t5_embedder and instructions:
print(f" Processing language instructions...")
try:
instruction = instructions[0]
language_features = encode_language_instruction(instruction, t5_embedder, f"cuda:{gpu}")
instructions_dir = os.path.join(episode_output_dir, "instructions")
if not os.path.exists(instructions_dir):
os.makedirs(instructions_dir)
lang_embed_path = os.path.join(instructions_dir, "lang_embed_0.pt")
torch.save(torch.from_numpy(language_features), lang_embed_path)
print(f" Language instruction encoded successfully: {instruction}")
print(f" Language features saved to: {lang_embed_path}")
print(f" Language features shape: {language_features.shape}, data type: {language_features.dtype}")
except Exception as e:
print(f" Language instruction processing failed: {e}")
print(f"\nConversion completed! Processed {episode_num} episodes")
print(f"Output directory: {output_dir}")
def main():
parser = argparse.ArgumentParser(description="Convert LeRobot data to RDT format")
parser.add_argument("--data_dir", type=str, required=True,
help="LeRobot data directory path")
parser.add_argument("--output_dir", type=str, required=True,
help="Output directory path")
parser.add_argument("--episode_num", type=int, default=10,
help="Number of episodes to process")
parser.add_argument("--gpu", type=int, default=0,
help="GPU device ID")
parser.add_argument("--no_language", action="store_true",
help="Skip language processing")
parser.add_argument("--cam_high_key", type=str, default="cam_high",
help="High camera key")
parser.add_argument("--cam_right_wrist_key", type=str, default="cam_right_wrist",
help="Right wrist camera key")
parser.add_argument("--cam_left_wrist_key", type=str, default="cam_left_wrist",
help="Left wrist camera key")
parser.add_argument("--t5_path", type=str, required=True,
help="T5 model path")
args = parser.parse_args()
if not os.path.exists(args.data_dir):
print(f"Error: Data directory does not exist: {args.data_dir}")
return
meta_file = os.path.join(args.data_dir, "meta/info.json")
if not os.path.exists(meta_file):
print(f"Error: Meta information file not found: {meta_file}")
return
try:
subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
print("ffmpeg is available, will use ffmpeg to extract video frames")
except (subprocess.CalledProcessError, FileNotFoundError):
print("Warning: ffmpeg is not available, image data may not be extracted correctly")
print("Please install ffmpeg: conda install -c conda-forge ffmpeg=6.1")
return
with open(meta_file, 'r') as f:
meta_info = yaml.safe_load(f)
total_episodes = meta_info.get('total_episodes', 10)
if args.episode_num > total_episodes:
print(f"Warning: Requested episode number ({args.episode_num}) exceeds available number ({total_episodes})")
args.episode_num = total_episodes
convert_lerobot_to_rdt(
args.data_dir,
args.output_dir,
args.episode_num,
args.gpu,
args.no_language,
args.t5_path,
args.cam_high_key,
args.cam_right_wrist_key,
)
if __name__ == "__main__":
main()