From 1e7bb40565dc49146890ab15db5fb6bee5d96d24 Mon Sep 17 00:00:00 2001 From: GH Date: Tue, 21 Oct 2025 14:32:25 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BD=BF=E7=94=A8v0.3.3=E6=9E=84=E5=BB=BA?= =?UTF-8?q?=E5=85=B7=E8=BA=AB=E5=B9=B3=E5=8F=B0=E9=95=9C=E5=83=8F=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 8 + clean_build.sh | 33 ++++ .../{Dockerfile.internal => Dockerfile.merge} | 61 +++----- docker/{Dockerfile.user => Dockerfile.train} | 22 +-- docker/cloud_helper.py | 143 ++++++++++++++++++ docker/merge.py | 108 +++++++++++++ docker/merge_task.json | 7 + docker/smolvla_executor.py | 131 ++++++++++++++++ docker/smolvla_server.py | 66 ++++++++ docker/train.py | 53 +++++++ docker/train_task.json | 12 ++ pyproject.toml | 6 +- 12 files changed, 600 insertions(+), 50 deletions(-) create mode 100755 clean_build.sh rename docker/{Dockerfile.internal => Dockerfile.merge} (51%) rename docker/{Dockerfile.user => Dockerfile.train} (83%) create mode 100644 docker/cloud_helper.py create mode 100644 docker/merge.py create mode 100644 docker/merge_task.json create mode 100644 docker/smolvla_executor.py create mode 100644 docker/smolvla_server.py create mode 100644 docker/train.py create mode 100644 docker/train_task.json diff --git a/.gitignore b/.gitignore index c4d1f76..5793aa4 100644 --- a/.gitignore +++ b/.gitignore @@ -173,3 +173,11 @@ outputs/ # Dev folders .cache/* + +datasets +20250901 +s100 + +huggingface_models +docker/inputs +docker/outputs \ No newline at end of file diff --git a/clean_build.sh b/clean_build.sh new file mode 100755 index 0000000..20faf07 --- /dev/null +++ b/clean_build.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -e + +cd "$(dirname "$0")" + +VERSION=$(date +%Y%m%d)-latest +echo Building: ${VERSION} + + +###### Training Image ###### +# docker build -t dcloud/lerobot-train:${VERSION} -f docker/Dockerfile.train . --build-arg http_proxy=http://192.168.16.68:18000 --build-arg https_proxy=http://192.168.16.68:18000 +# docker run -it --rm --gpus '"device=7"' \ +# -v ${PWD}/docker/inputs:/workspace/inputs \ +# -v ${PWD}/docker/outputs:/workspace/outputs/checkpoints \ +# -v ${PWD}/docker/train_task.json:/workspace/inputs/task.json \ +# --shm-size=128G \ +# dcloud/lerobot-train:${VERSION} + + +###### Merge Image ###### +docker build -t dcloud/lerobot-merge:${VERSION} -f docker/Dockerfile.merge . --build-arg http_proxy=http://192.168.16.68:18000 --build-arg https_proxy=http://192.168.16.68:18000 +docker run -it --rm \ + -v ${PWD}/docker/inputs:/workspace/inputs \ + -v ${PWD}/docker/outputs:/workspace/outputs \ + -v ${PWD}/docker/merge_task.json:/workspace/inputs/task.json \ + --shm-size=128G \ + dcloud/lerobot-merge:${VERSION} + + +# # Remove dangling images +docker rmi $(docker images -f "dangling=true" -q) +docker images | grep lerobot | grep -v ${VERSION} | awk '{print $1":"$2}' | xargs docker rmi diff --git a/docker/Dockerfile.internal b/docker/Dockerfile.merge similarity index 51% rename from docker/Dockerfile.internal rename to docker/Dockerfile.merge index 8c77fe4..ab2e259 100644 --- a/docker/Dockerfile.internal +++ b/docker/Dockerfile.merge @@ -12,39 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This Dockerfile is designed for HuggingFace internal CI environments -# that require GPU access. It starts from an NVIDIA CUDA base image. +# This Dockerfile is designed for a lerobot user who wants to +# experiment with the project. It starts from an Python Slim base image. -# docker build -f docker/Dockerfile.internal -t lerobot-internal . +# docker build -f docker/Dockerfile.user -t lerobot-user . +# docker run -it --rm lerobot-user -# Configure the base image for CI with GPU access -# TODO(Steven): Bump these versions -ARG CUDA_VERSION=12.4.1 -ARG OS_VERSION=22.04 -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION} - -# Define Python version argument +# Configure the base image ARG PYTHON_VERSION=3.10 +FROM python:${PYTHON_VERSION}-slim # Configure environment variables ENV DEBIAN_FRONTEND=noninteractive \ - MUJOCO_GL=egl \ - PATH=/lerobot/.venv/bin:$PATH \ - CUDA_VISIBLE_DEVICES=0 \ - TEST_TYPE=single_gpu \ - DEVICE=cuda + PATH=/lerobot/.venv/bin:$PATH -# Install Python, system dependencies, and uv (as root) +# Install system dependencies and uv (as root) RUN apt-get update && apt-get install -y --no-install-recommends \ - software-properties-common build-essential git curl \ - libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \ - libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \ - && add-apt-repository -y ppa:deadsnakes/ppa \ - && apt-get update \ - && apt-get install -y --no-install-recommends \ - python${PYTHON_VERSION} \ - python${PYTHON_VERSION}-venv \ - python${PYTHON_VERSION}-dev \ + build-essential git curl ffmpeg \ && curl -LsSf https://astral.sh/uv/install.sh | sh \ && mv /root/.local/bin/uv /usr/local/bin/uv \ && useradd --create-home --shell /bin/bash user_lerobot \ @@ -52,9 +36,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && apt-get clean && rm -rf /var/lib/apt/lists/* # Create application directory and set permissions +RUN git clone https://github.com/huggingface/lerobot.git -b v0.3.3 /lerobot WORKDIR /lerobot RUN chown -R user_lerobot:user_lerobot /lerobot +ADD ./huggingface_models /home/user_lerobot/.cache/huggingface +RUN chown -R user_lerobot:user_lerobot /home/user_lerobot/.cache + + # Switch to the non-root user USER user_lerobot @@ -67,18 +56,18 @@ ENV HOME=/home/user_lerobot \ # Create the virtual environment # We use a virtual environment inside the container—even though the container itself \ -# provides isolation—to ensure compatibility with the cluster and to prevent \ -# issues with MuJoCo and OpenGL drivers. -RUN uv venv --python python${PYTHON_VERSION} +# provides isolation—to closely resemble local development and allow users to \ +# run other Python projects in the same container without dependency conflicts. +RUN uv venv # Install Python dependencies for caching -COPY --chown=user_lerobot:user_lerobot pyproject.toml README.md MANIFEST.in ./ -COPY --chown=user_lerobot:user_lerobot src/ src/ -RUN uv pip install --no-cache ".[all]" +COPY --chown=user_lerobot:user_lerobot pyproject.toml ./ +# COPY --chown=user_lerobot:user_lerobot src/ src/ +RUN uv pip install --no-cache ".[smolvla]" -# Copy the rest of the application source code -# Make sure to have the git-LFS files for testing -COPY --chown=user_lerobot:user_lerobot . . +# Cloud Helper +# RUN uv pip install pyzmq msgpack msgpack_numpy zstandard -# Set the default command -CMD ["/bin/bash"] +# Set the default command - Online Inference Mode +ADD docker/merge.py /workspace/merge.py +CMD ["python", "/workspace/merge.py"] diff --git a/docker/Dockerfile.user b/docker/Dockerfile.train similarity index 83% rename from docker/Dockerfile.user rename to docker/Dockerfile.train index 4cfbb43..4ba4137 100644 --- a/docker/Dockerfile.user +++ b/docker/Dockerfile.train @@ -24,13 +24,11 @@ FROM python:${PYTHON_VERSION}-slim # Configure environment variables ENV DEBIAN_FRONTEND=noninteractive \ - MUJOCO_GL=egl \ PATH=/lerobot/.venv/bin:$PATH # Install system dependencies and uv (as root) RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential git curl libglib2.0-0 libegl1-mesa ffmpeg \ - libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \ + build-essential git curl ffmpeg \ && curl -LsSf https://astral.sh/uv/install.sh | sh \ && mv /root/.local/bin/uv /usr/local/bin/uv \ && useradd --create-home --shell /bin/bash user_lerobot \ @@ -38,9 +36,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && apt-get clean && rm -rf /var/lib/apt/lists/* # Create application directory and set permissions +RUN git clone https://github.com/huggingface/lerobot.git -b v0.3.3 /lerobot WORKDIR /lerobot RUN chown -R user_lerobot:user_lerobot /lerobot +ADD ./huggingface_models /home/user_lerobot/.cache/huggingface +RUN chown -R user_lerobot:user_lerobot /home/user_lerobot/.cache + + # Switch to the non-root user USER user_lerobot @@ -59,12 +62,9 @@ RUN uv venv # Install Python dependencies for caching COPY --chown=user_lerobot:user_lerobot pyproject.toml README.md MANIFEST.in ./ -COPY --chown=user_lerobot:user_lerobot src/ src/ -RUN uv pip install --no-cache ".[all]" +# COPY --chown=user_lerobot:user_lerobot src/ src/ +RUN uv pip install --no-cache ".[smolvla]" -# Copy the rest of the application code -# Make sure to have the git-LFS files for testing -COPY --chown=user_lerobot:user_lerobot . . - -# Set the default command -CMD ["/bin/bash"] +# Set the default command - Training mode +ADD docker/train.py /workspace/train.py +CMD ["python", "/workspace/train.py"] diff --git a/docker/cloud_helper.py b/docker/cloud_helper.py new file mode 100644 index 0000000..6b425ee --- /dev/null +++ b/docker/cloud_helper.py @@ -0,0 +1,143 @@ +import zmq +import msgpack +import msgpack_numpy as m + +from typing import Any, Callable +import logging + +logger = logging.getLogger(__name__) + +import zstandard as zstd + +compresser = zstd.ZstdCompressor(level=3) +decompresser = zstd.ZstdDecompressor() + + +def _pack(data: Any) -> bytes: + return compresser.compress(msgpack.packb(data, default=m.encode, use_bin_type=True)) + + +def _unpack(data: bytes) -> Any: + return msgpack.unpackb( + decompresser.decompress(data), object_hook=m.decode, raw=False + ) + + +class Server: + def __init__(self, host: str = "*", port: int = 5555): + self.host = host + self.port = port + + self.context = zmq.Context() + self.socket = self.context.socket(zmq.REP) + self.socket.bind(f"tcp://{self.host}:{self.port}") + logger.info(f"Server started at tcp://{self.host}:{self.port}") + + self.endpoints: dict[str, Callable[[Any], Any]] = {} + + def register_endpoint(self, command: str, func: Callable[[Any], Any]): + self.endpoints[command] = func + logger.info(f"Registered endpoint: {command} -> {func}") + + def return_error(self, message: str) -> None: + self.socket.send(_pack({"status": "error", "data": message})) + + def return_ok(self, data: Any) -> None: + self.socket.send(_pack({"status": "ok", "data": data})) + + def handle_once(self) -> None: + message = self.socket.recv() + message = _unpack(message) + + cmd = message.get("command") + data = message.get("data") + + logger.info("Received Command: %s", cmd) + + handler = self.endpoints.get(cmd) + + if handler is not None: + try: + if data is None: + response = handler() + else: + response = handler(data) + self.return_ok(response) + except Exception as e: + logger.error(f"Error handling command {cmd}: {e}") + self.return_error(str(e)) + else: + logger.warning(f"Unknown command: {cmd}") + self.return_error(f"Unknown command: {cmd}") + + def loop_forever(self): + try: + while True: + self.handle_once() + + except KeyboardInterrupt: + logger.info("Server shutting down...") + + finally: + self.socket.close() + self.context.term() + + +class Client: + def __init__(self, host: str = "localhost", port: int = 5555): + self.context = zmq.Context() + self.socket = self.context.socket(zmq.REQ) + self.socket.connect(f"tcp://{host}:{port}") + logger.info(f"Client connected to tcp://{host}:{port}") + + def call_endpoint(self, command: str, data=None): + self.socket.send(_pack({"command": command, "data": data})) + message = self.socket.recv() + message = _unpack(message) + + if message.get("status") == "ok": + return message.get("data") + else: + logger.error(f"Error from server: {message.get('data')}") + raise Exception(f"Error from server: {message.get('data')}") + + +if __name__ == "__main__": + import sys + from time import sleep + + logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" + ) + + assert (len(sys.argv) == 2) and ((mode := sys.argv[1]) in ("server", "client")), ( + "Usage: python service.py [server|client]" + ) + + ## Protocol: + # Request: { "command": str, "data": Any } + # Response: { "status": "ok" | "error", "data": Any if status=="ok" else str (ErrorMsg) } + + if mode == "server": + server = Server() + server.register_endpoint("ping", lambda: "pong") + server.register_endpoint("echo", lambda x: x) + server.register_endpoint("add", lambda data: data["a"] + data["b"]) + server.loop_forever() + + elif mode == "client": + client = Client() + while True: + try: + response = client.call_endpoint("ping") + print(f"Response from server: {response}") + response = client.call_endpoint("echo", "Hello, World!") + print(f"Response from server: {response}") + response = client.call_endpoint("add", {"a": 5, "b": 10}) + print(f"Response from server: {response}") + + sleep(0.2) + + except Exception as e: + print(f"Error: {e}") + break diff --git a/docker/merge.py b/docker/merge.py new file mode 100644 index 0000000..be619ea --- /dev/null +++ b/docker/merge.py @@ -0,0 +1,108 @@ +import json + +from pathlib import Path +from tqdm import tqdm + +from lerobot.datasets.lerobot_dataset import LeRobotDataset + + +with open("/workspace/inputs/task.json", "r") as f: + task_config = json.load(f) + +src_dataset_paths = [i for i in Path(task_config["train"]["input_data_path"]).iterdir() if i.is_dir()] + +EPS = 1e-2 + +# Feature Check +features = {} +keys_to_check = ["action", "observation.state", "observation.images"] +for p in src_dataset_paths: + dataset = LeRobotDataset(repo_id="O24H/Src", root=p) + if not features: + features = { + k: v for k, v in dataset.features.items() if any(k.startswith(prefix) for prefix in keys_to_check) + } + else: + for k in features.keys(): + assert k in dataset.features, f"Feature key {k} not found in dataset {p}" + # pprint(dataset.features[k]) + # pprint(features[k]) + # assert dataset.features[k] == features[k], f"Feature key {k} mismatch in dataset {p}" + +# Initialize Target Dataset +target_path = Path(task_config["train"]["output_data_path"]) +# assert not target_path.exists(), f"Output path {target_path} already exists!" +if target_path.exists(): + import os + + os.system(f"rm -rf {target_path}") + +### using images to store all data rather than videos: +### 35s per episode -> 20s per episode but size will be ~40x larger 6M -> 260M +# for i in features.keys(): +# if i.startswith("observation.images"): +# if not features[i]["dtype"] == "image": +# features[i]["dtype"] = "image" +# try: +# features[i].pop("info") +# except KeyError: +# pass +# target = LeRobotDataset.create( +# repo_id="O24H/Target", +# fps=30, +# root=target_path, +# robot_type="so101_follower", +# features=features, +# image_writer_processes=8, +# image_writer_threads=16, +# use_videos=False +# ) + +# [TODO] use the largest dataset as the base rather than creating a new one +target = LeRobotDataset.create( + repo_id="O24H/Target", + fps=30, + root=target_path, + robot_type="so101_follower", + features=features, + image_writer_processes=8, + image_writer_threads=16, +) + +for p in src_dataset_paths: + src = LeRobotDataset(repo_id="O24H/Src", root=p) + + for eps_idx in tqdm(range(src.num_episodes), desc=f"Processing episode in {p.name}"): + frame_idx = range( + src.episode_data_index["from"][eps_idx].item(), + src.episode_data_index["to"][eps_idx].item(), + ) + + eps_data = [src.__getitem__(i) for i in frame_idx] + + diff_actions = [eps_data[i]["action"] - eps_data[i - 1]["action"] for i in range(1, len(eps_data))] + keep_idx = [i + 1 for i, a in enumerate(diff_actions) if (a.abs() > EPS).any()] + + compress_ratio = len(keep_idx) / len(frame_idx) + print(f"Episode {eps_idx}: compress ratio {compress_ratio:.2f}") + + if len(keep_idx) < 32: + continue + # Skip too short episodes after compression + + for o in keep_idx: + batch = eps_data[o] + + image_keys = [k for k in batch.keys() if k.startswith("observation.images.")] + + frame = { + "action": batch["action"], + "observation.state": batch["observation.state"], + } + + for k in image_keys: + frame[k] = batch[k].permute(1, 2, 0).contiguous() # CHW -> HWC + + target.add_frame(frame, task=batch["task"]) + + target.save_episode() diff --git a/docker/merge_task.json b/docker/merge_task.json new file mode 100644 index 0000000..a75eea4 --- /dev/null +++ b/docker/merge_task.json @@ -0,0 +1,7 @@ +{ + "task_id": "b5c75014c1142feab3ee395b4a0bcc0", + "train": { + "input_data_path": "/workspace/inputs/", + "output_data_path": "/workspace/outputs/pick_orange_mixed" + } +} \ No newline at end of file diff --git a/docker/smolvla_executor.py b/docker/smolvla_executor.py new file mode 100644 index 0000000..5f27238 --- /dev/null +++ b/docker/smolvla_executor.py @@ -0,0 +1,131 @@ +from cloud_helper import Client + +from collections import deque +from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig +from lerobot.robots import Robot +from lerobot.robots.so101_follower.so101_follower import SO101Follower +from lerobot.robots.so101_follower.config_so101_follower import SO101FollowerConfig +import numpy as np + +import logging +import time + +from lerobot.utils import buffer + +logger = logging.getLogger(__name__) + + +def freq_control(func, freq: int = 25): + def wrapper(*args, **kwargs): + start_time = time.time() + result = func(*args, **kwargs) + end_time = time.time() + elapsed_time = end_time - start_time + # logger.info(f"'{func.__name__}' tooks {elapsed_time * 1000:.2f} ms") + sleep_time = max(0, (1.0 / freq) - elapsed_time) + time.sleep(sleep_time) + return result + + return wrapper + + +class SmolVLAExecutor: + def __init__(self, robot: Robot, runtime: Client, task: str, control_freq: int = 25): + self.robot = robot + self.runtime = runtime + self._action_queue = deque() + self._cache = {} + self.task = task + self.joint_names = [ + "shoulder_pan", + "shoulder_lift", + "elbow_flex", + "wrist_flex", + "wrist_roll", + "gripper", + ] + + def get_actions(self, instruction: str = ""): + observation = self.robot.get_observation() + + batch = { + "observation": { + "images.front": observation["front"], + "images.wrist": observation["wrist"], + "state": np.array([observation[key + ".pos"] for key in self.joint_names], dtype="float32"), + }, + "instruction": instruction if instruction else self.task, + } + + actions_array = self.runtime.call_endpoint("get_actions", batch) # (B, chunk_size, action_dim) + + if actions_array is None: + logger.warning("Server returned None") + raise ConnectionError("Failed to receive response from RDT server") + + actions_array = ( + actions_array.squeeze(0) if len(actions_array.shape) == 3 else actions_array + ) # (chunk_size, action_dim) + + return actions_array + + def apply_filter(self, window_size: int = 3): + action_buffer = np.array(self._action_queue) # (n_steps, action_dim) + n_steps, batch_size, action_dim = action_buffer.shape + + for b in range(batch_size): + for d in range(action_dim): + series = action_buffer[:, b, d] + + if window_size > 1: + # Apply a simple moving average filter + padded_series = np.pad(series, (window_size // 2, window_size // 2), mode="edge") + smoothed_series = np.convolve( + padded_series, np.ones(window_size) / window_size, mode="valid" + ) + series[:] = smoothed_series + + action_buffer = self._action_queue = deque(action_buffer.tolist()) + + @freq_control(25) + def loop_once(self): + if len(self._action_queue) <= 1: + new_actions = self.get_actions() + self._action_queue.extend(new_actions.transpose(0, 1)) + + # Apply the filter + self.apply_filter() + + action_values = self._action_queue.popleft() + + action_dict = {f"{joint}.pos": float(action_values[i]) for i, joint in enumerate(self.joint_names)} + self.robot.send_action(action_dict) + + def run(self): + while True: + self.loop_once() + +if __name__ == "__main__": + + logging.basicConfig(level=logging.INFO) + + robot = SO101Follower( + SO101FollowerConfig( + port="/dev/ttyACM1", + cameras={ + "wrist": OpenCVCameraConfig(index_or_path=8, width=640, height=480, fps=25), + "front": OpenCVCameraConfig(index_or_path=4, width=640, height=480, fps=30), + }, + ) + ) + robot.connect() + + client = Client(host="120.48.81.132", port=50000) + + executor = SmolVLAExecutor( + robot=robot, + runtime=client, + task="pick the red marker to the bin", + control_freq=25, + ) + executor.run() diff --git a/docker/smolvla_server.py b/docker/smolvla_server.py new file mode 100644 index 0000000..3480a40 --- /dev/null +++ b/docker/smolvla_server.py @@ -0,0 +1,66 @@ +import torch +import os + +from cloud_helper import Server +from lerobot.policies.factory import get_policy_class + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["HF_HUB_OFFLINE"] = "1" + + +class LerobotInferenceServer: + def __init__( + self, + checkpoint: str, + policy_type: str = "smolvla", + host: str = "localhost", + port: int = 5555, + device="cuda", + ): + self.server = Server(host, port) + self.policy_type = policy_type + policy_class = get_policy_class(self.policy_type) + self.policy = policy_class.from_pretrained(checkpoint) + self.device = device + self.policy.to(self.device) + print(f"Loaded {self.policy_type.upper()} policy from {checkpoint}") + + def get_actions(self, batch): + # batch = { + # "observation": { + # "state": ..., + # "images.front": ..., HWC uint8 + # "images.wrist": ..., + # }, + # "instruction": ..., + # } + + obs = {} + + for k, v in batch["observation"].items(): + if k.startswith("images.") and v is not None: + img = v.astype("float32") / 255.0 + img = img.transpose(2, 0, 1) # HWC -> CHW + img = torch.from_numpy(img).unsqueeze(0).to(self.device) + obs[f"observation.{k}"] = img + elif k == "state": + tensor = torch.from_numpy(v.astype("float32")).unsqueeze(0).to(self.device) + obs[f"observation.{k}"] = tensor + obs["task"] = batch["instruction"] + + action_chunk = self.policy.predict_action_chunk(obs) + + return action_chunk.cpu().numpy() # (B, chunk_size, action_dim) + + def run(self): + self.server.register_endpoint("get_actions", self.get_actions) + print(f"Lerobot {self.policy_type.upper()} Server is running...") + self.server.loop_forever() + + +if __name__ == "__main__": + smolvla_checkpoint = "./20250901/pick_red_marker_smolvla/checkpoints/last/pretrained_model" + server = LerobotInferenceServer( + checkpoint=smolvla_checkpoint, policy_type="smolvla", host="0.0.0.0", port=50000 + ) + server.run() diff --git a/docker/train.py b/docker/train.py new file mode 100644 index 0000000..b142619 --- /dev/null +++ b/docker/train.py @@ -0,0 +1,53 @@ +import json +import os + +with open("/workspace/inputs/task.json") as f: + task_configs = json.load(f) + + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Lerobot supports only one GPU for training +os.environ["HF_HUB_OFFLINE"] = "1" +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +assert "train" in task_configs, "Not a validate train config" +assert task_configs["train"]["model"] in ["act", "smolvla"], "Only act and smolvla are supported for training" + +use_policy = ( + "--policy.path=lerobot/smolvla_base" + if task_configs["train"]["model"] == "smolvla" + else "--policy.type=act" +) +task_id = task_configs["task_id"] +data_path = task_configs["train"]["input_data_path"] +ckpt_path = task_configs["train"]["checkpoint_path"] +bs = task_configs["train"]["batch_size"] +epochs = task_configs["train"]["epochs"] + +use_resume = task_configs["train"].get("resume", False) +if use_resume: + resume_path = f'--policy.path="{task_configs["train"]["checkpoint_path"]}/pretrained_model"' + # eg: ${checkpoint_path}/checkpoints/last + +with open(data_path + "/meta/info.json", "r") as f: + dataset_info = json.load(f) + total_frames = dataset_info["total_frames"] + +steps_per_epoch = total_frames // bs + 1 +steps = steps_per_epoch * epochs +print( + "Lerobot only support steps, calculating steps from epochs...", + f"Steps per epoch: {steps_per_epoch}, Total steps: {steps}", +) + +train_cmd = f"""lerobot-train \ + {resume_path if use_resume else use_policy} \ + --policy.push_to_hub=false \ + --dataset.repo_id=D-Robotics/{task_id} \ + --dataset.root={data_path} \ + --batch_size={bs} \ + --output_dir={ckpt_path} \ + --steps={steps} --save_freq={steps_per_epoch} \ +""" + +print("Executing command:\n", train_cmd) +os.system(train_cmd) diff --git a/docker/train_task.json b/docker/train_task.json new file mode 100644 index 0000000..6db760f --- /dev/null +++ b/docker/train_task.json @@ -0,0 +1,12 @@ +{ + "task_id": "b5c75014c1142feab3ee395b4a0bcc0", + "gpu_id": "0", + "train": { + "model":"smolvla", + "epochs":20, + "batch_size":64, + "log_path": "/workspace/logs", + "checkpoint_path": "/workspace/outputs/checkpoints", + "input_data_path": "/workspace/inputs/pick_red_marker" + } +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 4696a2a..ffb9795 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,9 +74,9 @@ dependencies = [ "pyserial>=3.5", "wandb>=0.20.0", - "torch>=2.2.1,<2.8.0", # TODO: Bumb dependency - "torchcodec>=0.2.1,<0.6.0; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')", # TODO: Bumb dependency - "torchvision>=0.21.0,<0.23.0", # TODO: Bumb dependency + "torch==2.6.0", # TODO: Bumb dependency + "torchcodec==0.2.1; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')", # TODO: Bumb dependency + "torchvision==0.21.0", # TODO: Bumb dependency "draccus==0.10.0", # TODO: Remove == "gymnasium>=0.29.1,<1.0.0", # TODO: Bumb dependency