From 1e7bb40565dc49146890ab15db5fb6bee5d96d24 Mon Sep 17 00:00:00 2001
From: GH <guannan.he@d-robotics.cc>
Date: Tue, 21 Oct 2025 14:32:25 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BD=BF=E7=94=A8v0.3.3=E6=9E=84=E5=BB=BA?=
 =?UTF-8?q?=E5=85=B7=E8=BA=AB=E5=B9=B3=E5=8F=B0=E9=95=9C=E5=83=8F=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                    |   8 +
 clean_build.sh                                |  33 ++++
 .../{Dockerfile.internal => Dockerfile.merge} |  61 +++-----
 docker/{Dockerfile.user => Dockerfile.train}  |  22 +--
 docker/cloud_helper.py                        | 143 ++++++++++++++++++
 docker/merge.py                               | 108 +++++++++++++
 docker/merge_task.json                        |   7 +
 docker/smolvla_executor.py                    | 131 ++++++++++++++++
 docker/smolvla_server.py                      |  66 ++++++++
 docker/train.py                               |  53 +++++++
 docker/train_task.json                        |  12 ++
 pyproject.toml                                |   6 +-
 12 files changed, 600 insertions(+), 50 deletions(-)
 create mode 100755 clean_build.sh
 rename docker/{Dockerfile.internal => Dockerfile.merge} (51%)
 rename docker/{Dockerfile.user => Dockerfile.train} (83%)
 create mode 100644 docker/cloud_helper.py
 create mode 100644 docker/merge.py
 create mode 100644 docker/merge_task.json
 create mode 100644 docker/smolvla_executor.py
 create mode 100644 docker/smolvla_server.py
 create mode 100644 docker/train.py
 create mode 100644 docker/train_task.json

diff --git a/.gitignore b/.gitignore
index c4d1f76..5793aa4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,3 +173,11 @@ outputs/
 
 # Dev folders
 .cache/*
+
+datasets
+20250901
+s100
+
+huggingface_models
+docker/inputs
+docker/outputs
\ No newline at end of file
diff --git a/clean_build.sh b/clean_build.sh
new file mode 100755
index 0000000..20faf07
--- /dev/null
+++ b/clean_build.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+set -e
+
+cd "$(dirname "$0")"
+
+VERSION=$(date +%Y%m%d)-latest
+echo Building: ${VERSION}
+
+
+###### Training Image ######
+# docker build -t dcloud/lerobot-train:${VERSION} -f docker/Dockerfile.train . --build-arg http_proxy=http://192.168.16.68:18000 --build-arg https_proxy=http://192.168.16.68:18000
+# docker run -it --rm --gpus '"device=7"' \
+#     -v ${PWD}/docker/inputs:/workspace/inputs \
+#     -v ${PWD}/docker/outputs:/workspace/outputs/checkpoints \
+#     -v ${PWD}/docker/train_task.json:/workspace/inputs/task.json \
+#     --shm-size=128G \
+#     dcloud/lerobot-train:${VERSION}
+
+
+###### Merge Image ######
+docker build -t dcloud/lerobot-merge:${VERSION} -f docker/Dockerfile.merge . --build-arg http_proxy=http://192.168.16.68:18000 --build-arg https_proxy=http://192.168.16.68:18000
+docker run -it --rm \
+    -v ${PWD}/docker/inputs:/workspace/inputs \
+    -v ${PWD}/docker/outputs:/workspace/outputs \
+    -v ${PWD}/docker/merge_task.json:/workspace/inputs/task.json \
+    --shm-size=128G \
+    dcloud/lerobot-merge:${VERSION}
+
+
+# Remove dangling images and lerobot images left over from older builds
+docker image prune -f
+docker images | grep lerobot | grep -v "${VERSION}" | awk '{print $1":"$2}' | xargs -r docker rmi
diff --git a/docker/Dockerfile.internal b/docker/Dockerfile.merge
similarity index 51%
rename from docker/Dockerfile.internal
rename to docker/Dockerfile.merge
index 8c77fe4..ab2e259 100644
--- a/docker/Dockerfile.internal
+++ b/docker/Dockerfile.merge
@@ -12,39 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# This Dockerfile is designed for HuggingFace internal CI environments
-# that require GPU access. It starts from an NVIDIA CUDA base image.
+# This Dockerfile builds the dataset-merge image: its default command runs
+# docker/merge.py. It starts from a Python Slim base image.
 
-# docker build -f docker/Dockerfile.internal -t lerobot-internal .
+# docker build -f docker/Dockerfile.merge -t lerobot-merge .
+# docker run -it --rm -v <inputs>:/workspace/inputs -v <outputs>:/workspace/outputs lerobot-merge
 
-# Configure the base image for CI with GPU access
-# TODO(Steven): Bump these versions
-ARG CUDA_VERSION=12.4.1
-ARG OS_VERSION=22.04
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}
-
-# Define Python version argument
+# Configure the base image
 ARG PYTHON_VERSION=3.10
+FROM python:${PYTHON_VERSION}-slim
 
 # Configure environment variables
 ENV DEBIAN_FRONTEND=noninteractive \
-    MUJOCO_GL=egl \
-    PATH=/lerobot/.venv/bin:$PATH \
-    CUDA_VISIBLE_DEVICES=0 \
-    TEST_TYPE=single_gpu \
-    DEVICE=cuda
+    PATH=/lerobot/.venv/bin:$PATH
 
-# Install Python, system dependencies, and uv (as root)
+# Install system dependencies and uv (as root)
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    software-properties-common build-essential git curl \
-    libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
-    libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \
-    && add-apt-repository -y ppa:deadsnakes/ppa \
-    && apt-get update \
-    && apt-get install -y --no-install-recommends \
-       python${PYTHON_VERSION} \
-       python${PYTHON_VERSION}-venv \
-       python${PYTHON_VERSION}-dev \
+    build-essential git curl ffmpeg \
     && curl -LsSf https://astral.sh/uv/install.sh | sh \
     && mv /root/.local/bin/uv /usr/local/bin/uv \
     && useradd --create-home --shell /bin/bash user_lerobot \
@@ -52,9 +36,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && apt-get clean && rm -rf /var/lib/apt/lists/*
 
 # Create application directory and set permissions
+RUN git clone https://github.com/huggingface/lerobot.git -b v0.3.3 /lerobot
 WORKDIR /lerobot
 RUN chown -R user_lerobot:user_lerobot /lerobot
 
+ADD ./huggingface_models /home/user_lerobot/.cache/huggingface
+RUN chown -R user_lerobot:user_lerobot /home/user_lerobot/.cache
+
+
 # Switch to the non-root user
 USER user_lerobot
 
@@ -67,18 +56,18 @@ ENV HOME=/home/user_lerobot \
 
 # Create the virtual environment
 # We use a virtual environment inside the container—even though the container itself \
-# provides isolation—to ensure compatibility with the cluster and to prevent \
-# issues with MuJoCo and OpenGL drivers.
-RUN uv venv --python python${PYTHON_VERSION}
+# provides isolation—to closely resemble local development and allow users to \
+# run other Python projects in the same container without dependency conflicts.
+RUN uv venv
 
 # Install Python dependencies for caching
-COPY --chown=user_lerobot:user_lerobot pyproject.toml README.md MANIFEST.in ./
-COPY --chown=user_lerobot:user_lerobot src/ src/
-RUN uv pip install --no-cache ".[all]"
+COPY --chown=user_lerobot:user_lerobot pyproject.toml ./
+# COPY --chown=user_lerobot:user_lerobot src/ src/
+RUN uv pip install --no-cache ".[smolvla]"
 
-# Copy the rest of the application source code
-# Make sure to have the git-LFS files for testing
-COPY --chown=user_lerobot:user_lerobot . .
+# Cloud Helper
+# RUN uv pip install pyzmq msgpack msgpack_numpy zstandard 
 
-# Set the default command
-CMD ["/bin/bash"]
+# Set the default command - Dataset merge mode
+ADD docker/merge.py /workspace/merge.py
+CMD ["python", "/workspace/merge.py"]
diff --git a/docker/Dockerfile.user b/docker/Dockerfile.train
similarity index 83%
rename from docker/Dockerfile.user
rename to docker/Dockerfile.train
index 4cfbb43..4ba4137 100644
--- a/docker/Dockerfile.user
+++ b/docker/Dockerfile.train
@@ -24,13 +24,11 @@ FROM python:${PYTHON_VERSION}-slim
 
 # Configure environment variables
 ENV DEBIAN_FRONTEND=noninteractive \
-    MUJOCO_GL=egl \
     PATH=/lerobot/.venv/bin:$PATH
 
 # Install system dependencies and uv (as root)
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential git curl libglib2.0-0 libegl1-mesa ffmpeg \
-    libusb-1.0-0-dev speech-dispatcher libgeos-dev portaudio19-dev \
+    build-essential git curl ffmpeg \
     && curl -LsSf https://astral.sh/uv/install.sh | sh \
     && mv /root/.local/bin/uv /usr/local/bin/uv \
     && useradd --create-home --shell /bin/bash user_lerobot \
@@ -38,9 +36,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && apt-get clean && rm -rf /var/lib/apt/lists/*
 
 # Create application directory and set permissions
+RUN git clone https://github.com/huggingface/lerobot.git -b v0.3.3 /lerobot
 WORKDIR /lerobot
 RUN chown -R user_lerobot:user_lerobot /lerobot
 
+ADD ./huggingface_models /home/user_lerobot/.cache/huggingface
+RUN chown -R user_lerobot:user_lerobot /home/user_lerobot/.cache
+
+
 # Switch to the non-root user
 USER user_lerobot
 
@@ -59,12 +62,9 @@ RUN uv venv
 
 # Install Python dependencies for caching
 COPY --chown=user_lerobot:user_lerobot pyproject.toml README.md MANIFEST.in ./
-COPY --chown=user_lerobot:user_lerobot src/ src/
-RUN uv pip install --no-cache ".[all]"
+# COPY --chown=user_lerobot:user_lerobot src/ src/
+RUN uv pip install --no-cache ".[smolvla]"
 
-# Copy the rest of the application code
-# Make sure to have the git-LFS files for testing
-COPY --chown=user_lerobot:user_lerobot . .
-
-# Set the default command
-CMD ["/bin/bash"]
+# Set the default command - Training mode
+ADD docker/train.py /workspace/train.py
+CMD ["python", "/workspace/train.py"]
diff --git a/docker/cloud_helper.py b/docker/cloud_helper.py
new file mode 100644
index 0000000..6b425ee
--- /dev/null
+++ b/docker/cloud_helper.py
@@ -0,0 +1,143 @@
+import zmq
+import msgpack
+import msgpack_numpy as m
+
+from typing import Any, Callable
+import logging
+
+logger = logging.getLogger(__name__)
+
+import zstandard as zstd
+
+compresser = zstd.ZstdCompressor(level=3)
+decompresser = zstd.ZstdDecompressor()
+
+
+def _pack(data: Any) -> bytes:
+    return compresser.compress(msgpack.packb(data, default=m.encode, use_bin_type=True))
+
+
+def _unpack(data: bytes) -> Any:
+    return msgpack.unpackb(
+        decompresser.decompress(data), object_hook=m.decode, raw=False
+    )
+
+
+class Server:
+    def __init__(self, host: str = "*", port: int = 5555):
+        self.host = host
+        self.port = port
+
+        self.context = zmq.Context()
+        self.socket = self.context.socket(zmq.REP)
+        self.socket.bind(f"tcp://{self.host}:{self.port}")
+        logger.info(f"Server started at tcp://{self.host}:{self.port}")
+
+        self.endpoints: dict[str, Callable[[Any], Any]] = {}
+
+    def register_endpoint(self, command: str, func: Callable[[Any], Any]):
+        self.endpoints[command] = func
+        logger.info(f"Registered endpoint: {command} -> {func}")
+
+    def return_error(self, message: str) -> None:
+        self.socket.send(_pack({"status": "error", "data": message}))
+
+    def return_ok(self, data: Any) -> None:
+        self.socket.send(_pack({"status": "ok", "data": data}))
+
+    def handle_once(self) -> None:
+        """Receive one request, dispatch it, and always send exactly one reply."""
+        raw = self.socket.recv()
+        try:
+            message = _unpack(raw)
+            cmd, data = message.get("command"), message.get("data")
+        except Exception as e:
+            # Reply even to undecodable/non-dict requests instead of crashing the loop.
+            logger.error(f"Malformed request: {e}")
+            self.return_error(f"Malformed request: {e}")
+            return
+
+        logger.info("Received Command: %s", cmd)
+        handler = self.endpoints.get(cmd)
+        if handler is None:
+            logger.warning(f"Unknown command: {cmd}")
+            self.return_error(f"Unknown command: {cmd}")
+            return
+        try:
+            response = handler() if data is None else handler(data)
+            self.return_ok(response)
+        except Exception as e:
+            logger.error(f"Error handling command {cmd}: {e}")
+            self.return_error(str(e))
+
+    def loop_forever(self):
+        try:
+            while True:
+                self.handle_once()
+
+        except KeyboardInterrupt:
+            logger.info("Server shutting down...")
+
+        finally:
+            self.socket.close()
+            self.context.term()
+
+
+class Client:
+    def __init__(self, host: str = "localhost", port: int = 5555):
+        self.context = zmq.Context()
+        self.socket = self.context.socket(zmq.REQ)
+        self.socket.connect(f"tcp://{host}:{port}")
+        logger.info(f"Client connected to tcp://{host}:{port}")
+
+    def call_endpoint(self, command: str, data=None):
+        self.socket.send(_pack({"command": command, "data": data}))
+        message = self.socket.recv()
+        message = _unpack(message)
+
+        if message.get("status") == "ok":
+            return message.get("data")
+        else:
+            logger.error(f"Error from server: {message.get('data')}")
+            raise Exception(f"Error from server: {message.get('data')}")
+
+
+if __name__ == "__main__":
+    import sys
+    from time import sleep
+
+    logging.basicConfig(
+        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+    )
+
+    if len(sys.argv) != 2 or sys.argv[1] not in ("server", "client"):
+        sys.exit("Usage: python cloud_helper.py [server|client]")  # explicit: asserts vanish under -O
+    mode = sys.argv[1]
+
+    ## Protocol:
+    # Request: { "command": str, "data": Any }
+    # Response: { "status": "ok" | "error", "data": Any if status=="ok" else str (ErrorMsg) }
+
+    if mode == "server":
+        server = Server()
+        server.register_endpoint("ping", lambda: "pong")
+        server.register_endpoint("echo", lambda x: x)
+        server.register_endpoint("add", lambda data: data["a"] + data["b"])
+        server.loop_forever()
+
+    elif mode == "client":
+        client = Client()
+        while True:
+            try:
+                response = client.call_endpoint("ping")
+                print(f"Response from server: {response}")
+                response = client.call_endpoint("echo", "Hello, World!")
+                print(f"Response from server: {response}")
+                response = client.call_endpoint("add", {"a": 5, "b": 10})
+                print(f"Response from server: {response}")
+
+                sleep(0.2)
+
+            except Exception as e:
+                print(f"Error: {e}")
+                break
diff --git a/docker/merge.py b/docker/merge.py
new file mode 100644
index 0000000..be619ea
--- /dev/null
+++ b/docker/merge.py
@@ -0,0 +1,108 @@
+import json
+
+from pathlib import Path
+from tqdm import tqdm
+
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+
+
+with open("/workspace/inputs/task.json", "r") as f:
+    task_config = json.load(f)
+
+src_dataset_paths = [i for i in Path(task_config["train"]["input_data_path"]).iterdir() if i.is_dir()]
+
+EPS = 1e-2
+
+# Feature Check
+features = {}
+keys_to_check = ["action", "observation.state", "observation.images"]
+for p in src_dataset_paths:
+    dataset = LeRobotDataset(repo_id="O24H/Src", root=p)
+    if not features:
+        features = {
+            k: v for k, v in dataset.features.items() if any(k.startswith(prefix) for prefix in keys_to_check)
+        }
+    else:
+        for k in features.keys():
+            assert k in dataset.features, f"Feature key {k} not found in dataset {p}"
+            # pprint(dataset.features[k])
+            # pprint(features[k])
+            # assert dataset.features[k] == features[k], f"Feature key {k} mismatch in dataset {p}"
+
+# Initialize Target Dataset
+target_path = Path(task_config["train"]["output_data_path"])
+# Start from a clean output directory: any previous run's output is discarded.
+if target_path.exists():
+    import shutil
+
+    shutil.rmtree(target_path)
+
+### using images to store all data rather than videos:
+### 35s per episode -> 20s per episode but size will be ~40x larger 6M -> 260M
+# for i in features.keys():
+#     if i.startswith("observation.images"):
+#         if not features[i]["dtype"] == "image":
+#             features[i]["dtype"] = "image"
+#             try:
+#                 features[i].pop("info")
+#             except KeyError:
+#                 pass
+# target = LeRobotDataset.create(
+#     repo_id="O24H/Target",
+#     fps=30,
+#     root=target_path,
+#     robot_type="so101_follower",
+#     features=features,
+#     image_writer_processes=8,
+#     image_writer_threads=16,
+#     use_videos=False
+# )
+
+# [TODO] use the largest dataset as the base rather than creating a new one
+target = LeRobotDataset.create(
+    repo_id="O24H/Target",
+    fps=30,
+    root=target_path,
+    robot_type="so101_follower",
+    features=features,
+    image_writer_processes=8,
+    image_writer_threads=16,
+)
+
+for p in src_dataset_paths:
+    src = LeRobotDataset(repo_id="O24H/Src", root=p)
+
+    for eps_idx in tqdm(range(src.num_episodes), desc=f"Processing episode in {p.name}"):
+        frame_idx = range(
+            src.episode_data_index["from"][eps_idx].item(),
+            src.episode_data_index["to"][eps_idx].item(),
+        )
+
+        eps_data = [src.__getitem__(i) for i in frame_idx]
+
+        diff_actions = [eps_data[i]["action"] - eps_data[i - 1]["action"] for i in range(1, len(eps_data))]
+        keep_idx = [i + 1 for i, a in enumerate(diff_actions) if (a.abs() > EPS).any()]
+
+        compress_ratio = len(keep_idx) / len(frame_idx)
+        print(f"Episode {eps_idx}: compress ratio {compress_ratio:.2f}")
+
+        if len(keep_idx) < 32:
+            continue
+            # Skip too short episodes after compression
+
+        for o in keep_idx:
+            batch = eps_data[o]
+
+            image_keys = [k for k in batch.keys() if k.startswith("observation.images.")]
+
+            frame = {
+                "action": batch["action"],
+                "observation.state": batch["observation.state"],
+            }
+
+            for k in image_keys:
+                frame[k] = batch[k].permute(1, 2, 0).contiguous()  # CHW -> HWC
+
+            target.add_frame(frame, task=batch["task"])
+
+        target.save_episode()
diff --git a/docker/merge_task.json b/docker/merge_task.json
new file mode 100644
index 0000000..a75eea4
--- /dev/null
+++ b/docker/merge_task.json
@@ -0,0 +1,7 @@
+{
+    "task_id": "b5c75014c1142feab3ee395b4a0bcc0",
+    "train": {
+        "input_data_path": "/workspace/inputs/",
+        "output_data_path": "/workspace/outputs/pick_orange_mixed"
+    }
+}
\ No newline at end of file
diff --git a/docker/smolvla_executor.py b/docker/smolvla_executor.py
new file mode 100644
index 0000000..5f27238
--- /dev/null
+++ b/docker/smolvla_executor.py
@@ -0,0 +1,131 @@
+from cloud_helper import Client
+
+from collections import deque
+from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig
+from lerobot.robots import Robot
+from lerobot.robots.so101_follower.so101_follower import SO101Follower
+from lerobot.robots.so101_follower.config_so101_follower import SO101FollowerConfig
+import numpy as np
+
+import logging
+import time
+
+from lerobot.utils import buffer
+
+logger = logging.getLogger(__name__)
+
+
+def freq_control(func=None, freq: int = 25):
+    """Rate-limit ``func`` to ``freq`` Hz; usable as ``@freq_control`` or ``@freq_control(25)``."""
+    if not callable(func):  # decorator-factory form: the first argument is the rate
+        return lambda f: freq_control(f, freq if func is None else func)
+
+    def wrapper(*args, **kwargs):
+        start_time = time.perf_counter()
+        result = func(*args, **kwargs)
+        time.sleep(max(0.0, 1.0 / freq - (time.perf_counter() - start_time)))
+        return result
+
+    return wrapper
+
+
+class SmolVLAExecutor:
+    def __init__(self, robot: Robot, runtime: Client, task: str, control_freq: int = 25):
+        self.robot = robot
+        self.runtime = runtime
+        self._action_queue = deque()
+        self._cache = {}
+        self.task = task
+        self.joint_names = [
+            "shoulder_pan",
+            "shoulder_lift",
+            "elbow_flex",
+            "wrist_flex",
+            "wrist_roll",
+            "gripper",
+        ]
+
+    def get_actions(self, instruction: str = ""):
+        observation = self.robot.get_observation()
+
+        batch = {
+            "observation": {
+                "images.front": observation["front"],
+                "images.wrist": observation["wrist"],
+                "state": np.array([observation[key + ".pos"] for key in self.joint_names], dtype="float32"),
+            },
+            "instruction": instruction if instruction else self.task,
+        }
+
+        actions_array = self.runtime.call_endpoint("get_actions", batch)  # (B, chunk_size, action_dim)
+
+        if actions_array is None:
+            logger.warning("Server returned None")
+            raise ConnectionError("Failed to receive response from RDT server")
+
+        actions_array = (
+            actions_array.squeeze(0) if len(actions_array.shape) == 3 else actions_array
+        )  # (chunk_size, action_dim)
+
+        return actions_array
+
+    def apply_filter(self, window_size: int = 3):
+        """Smooth the queued actions in place with a moving average per joint.
+
+        The queue holds one ``(action_dim,)`` vector per control step, so the
+        buffer is 2-D ``(n_steps, action_dim)``; each column is edge-padded and
+        averaged over ``window_size`` steps.
+        """
+        if window_size <= 1 or not self._action_queue:
+            return
+        action_buffer = np.asarray(list(self._action_queue), dtype=np.float64)  # (n_steps, action_dim)
+        half, kernel = window_size // 2, np.ones(window_size) / window_size
+        for d in range(action_buffer.shape[1]):
+            padded_series = np.pad(action_buffer[:, d], (half, half), mode="edge")
+            # Slice so even window sizes (one extra output sample) still fit.
+            action_buffer[:, d] = np.convolve(padded_series, kernel, mode="valid")[: len(action_buffer)]
+
+        self._action_queue = deque(action_buffer.tolist())
+
+    @freq_control(25)
+    def loop_once(self):
+        if len(self._action_queue) <= 1:
+            new_actions = self.get_actions()
+            self._action_queue.extend(new_actions.transpose(0, 1))
+
+            # Apply the filter
+            self.apply_filter()
+
+        action_values = self._action_queue.popleft()
+
+        action_dict = {f"{joint}.pos": float(action_values[i]) for i, joint in enumerate(self.joint_names)}
+        self.robot.send_action(action_dict)
+
+    def run(self):
+        while True:
+            self.loop_once()
+
+if __name__ == "__main__":
+
+    logging.basicConfig(level=logging.INFO)
+
+    robot = SO101Follower(
+        SO101FollowerConfig(
+            port="/dev/ttyACM1",
+            cameras={
+                "wrist": OpenCVCameraConfig(index_or_path=8, width=640, height=480, fps=25),
+                "front": OpenCVCameraConfig(index_or_path=4, width=640, height=480, fps=30),
+            },
+        )
+    )
+    robot.connect()
+
+    client = Client(host="120.48.81.132", port=50000)
+
+    executor = SmolVLAExecutor(
+        robot=robot,
+        runtime=client,
+        task="pick the red marker to the bin",
+        control_freq=25,
+    )
+    executor.run()
diff --git a/docker/smolvla_server.py b/docker/smolvla_server.py
new file mode 100644
index 0000000..3480a40
--- /dev/null
+++ b/docker/smolvla_server.py
@@ -0,0 +1,66 @@
+import os
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+os.environ["HF_HUB_OFFLINE"] = "1"  # must be set before huggingface_hub is imported
+
+import torch
+from cloud_helper import Server
+from lerobot.policies.factory import get_policy_class
+
+
+class LerobotInferenceServer:
+    def __init__(
+        self,
+        checkpoint: str,
+        policy_type: str = "smolvla",
+        host: str = "localhost",
+        port: int = 5555,
+        device="cuda",
+    ):
+        self.server = Server(host, port)
+        self.policy_type = policy_type
+        policy_class = get_policy_class(self.policy_type)
+        self.policy = policy_class.from_pretrained(checkpoint)
+        self.device = device
+        self.policy.to(self.device)
+        print(f"Loaded {self.policy_type.upper()} policy from {checkpoint}")
+
+    def get_actions(self, batch):
+        # batch = {
+        #     "observation": {
+        #         "state": ...,
+        #         "images.front": ..., HWC uint8
+        #         "images.wrist": ...,
+        #     },
+        #     "instruction": ...,
+        # }
+
+        obs = {}
+
+        for k, v in batch["observation"].items():
+            if k.startswith("images.") and v is not None:
+                img = v.astype("float32") / 255.0
+                img = img.transpose(2, 0, 1)  # HWC -> CHW
+                img = torch.from_numpy(img).unsqueeze(0).to(self.device)
+                obs[f"observation.{k}"] = img
+            elif k == "state":
+                tensor = torch.from_numpy(v.astype("float32")).unsqueeze(0).to(self.device)
+                obs[f"observation.{k}"] = tensor
+        obs["task"] = batch["instruction"]
+
+        action_chunk = self.policy.predict_action_chunk(obs)
+
+        return action_chunk.cpu().numpy() # (B, chunk_size, action_dim)
+
+    def run(self):
+        self.server.register_endpoint("get_actions", self.get_actions)
+        print(f"Lerobot {self.policy_type.upper()} Server is running...")
+        self.server.loop_forever()
+
+
+if __name__ == "__main__":
+    smolvla_checkpoint = "./20250901/pick_red_marker_smolvla/checkpoints/last/pretrained_model"
+    server = LerobotInferenceServer(
+        checkpoint=smolvla_checkpoint, policy_type="smolvla", host="0.0.0.0", port=50000
+    )
+    server.run()
diff --git a/docker/train.py b/docker/train.py
new file mode 100644
index 0000000..b142619
--- /dev/null
+++ b/docker/train.py
@@ -0,0 +1,53 @@
+import json
+import os
+
+with open("/workspace/inputs/task.json") as f:
+    task_configs = json.load(f)
+
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Lerobot supports only one GPU for training
+os.environ["HF_HUB_OFFLINE"] = "1"
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+assert "train" in task_configs, "Not a validate train config"
+assert task_configs["train"]["model"] in ["act", "smolvla"], "Only act and smolvla are supported for training"
+
+use_policy = (
+    "--policy.path=lerobot/smolvla_base"
+    if task_configs["train"]["model"] == "smolvla"
+    else "--policy.type=act"
+)
+task_id = task_configs["task_id"]
+data_path = task_configs["train"]["input_data_path"]
+ckpt_path = task_configs["train"]["checkpoint_path"]
+bs = task_configs["train"]["batch_size"]
+epochs = task_configs["train"]["epochs"]
+
+use_resume = task_configs["train"].get("resume", False)
+if use_resume:
+    resume_path = f'--policy.path="{task_configs["train"]["checkpoint_path"]}/pretrained_model"'
+    # eg: ${checkpoint_path}/checkpoints/last
+
+with open(data_path + "/meta/info.json", "r") as f:
+    dataset_info = json.load(f)
+    total_frames = dataset_info["total_frames"]
+
+steps_per_epoch = total_frames // bs + 1
+steps = steps_per_epoch * epochs
+print(
+    "Lerobot only support steps, calculating steps from epochs...",
+    f"Steps per epoch: {steps_per_epoch}, Total steps: {steps}",
+)
+
+train_cmd = f"""lerobot-train \
+    {resume_path if use_resume else use_policy} \
+    --policy.push_to_hub=false \
+    --dataset.repo_id=D-Robotics/{task_id} \
+    --dataset.root={data_path} \
+    --batch_size={bs} \
+    --output_dir={ckpt_path} \
+    --steps={steps} --save_freq={steps_per_epoch} \
+"""
+# Propagate the trainer's exit status so a failed run fails the container.
+print("Executing command:\n", train_cmd)
+raise SystemExit(os.waitstatus_to_exitcode(os.system(train_cmd)))
diff --git a/docker/train_task.json b/docker/train_task.json
new file mode 100644
index 0000000..6db760f
--- /dev/null
+++ b/docker/train_task.json
@@ -0,0 +1,12 @@
+{
+    "task_id": "b5c75014c1142feab3ee395b4a0bcc0",
+    "gpu_id": "0",
+    "train": {
+        "model":"smolvla",
+        "epochs":20,
+        "batch_size":64,
+        "log_path": "/workspace/logs",
+        "checkpoint_path": "/workspace/outputs/checkpoints",
+        "input_data_path": "/workspace/inputs/pick_red_marker"
+    }
+}
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 4696a2a..ffb9795 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,9 +74,9 @@ dependencies = [
     "pyserial>=3.5",
     "wandb>=0.20.0",
 
-    "torch>=2.2.1,<2.8.0", # TODO: Bumb dependency
-    "torchcodec>=0.2.1,<0.6.0; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')", # TODO: Bumb dependency
-    "torchvision>=0.21.0,<0.23.0", # TODO: Bumb dependency
+    "torch==2.6.0", # TODO: Bumb dependency
+    "torchcodec==0.2.1; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')", # TODO: Bumb dependency
+    "torchvision==0.21.0", # TODO: Bumb dependency
 
     "draccus==0.10.0", # TODO: Remove ==
     "gymnasium>=0.29.1,<1.0.0", # TODO: Bumb dependency