chore(layout): Expose the camera setting for interface parallel_sim. (#39)

* chore(layout): Expose the camera setting for interface parallel_sim and update layout file.
2025-09-10 20:58:35 +08:00 · 2025-09-10 20:58:35 +08:00 · 1272b80926
commit 1272b80926
parent cf3b919b65
10 changed files with 87 additions and 36 deletions
--- a/README.md
+++ b/README.md
@ -258,7 +258,7 @@ Example: generate multiple parallel simulation envs with `gym.make` and record s
 python embodied_gen/scripts/parallel_sim.py \
 --layout_file "outputs/layouts_gen/task_0000/layout.json" \
 --output_dir "outputs/parallel_sim/task_0000" \
--num_envs 20
+--num_envs 16
 ```

 ### 🖼️ Real-to-Sim Digital Twin
--- a/embodied_gen/envs/pick_embodiedgen.py
+++ b/embodied_gen/envs/pick_embodiedgen.py
@ -16,7 +16,6 @@

 import json
 import os
-from copy import deepcopy

 import numpy as np
 import sapien
@ -26,6 +25,7 @@ from mani_skill.envs.sapien_env import BaseEnv
 from mani_skill.sensors.camera import CameraConfig
 from mani_skill.utils import sapien_utils
 from mani_skill.utils.building import actors
+from mani_skill.utils.building.ground import build_ground
 from mani_skill.utils.registration import register_env
 from mani_skill.utils.structs.actor import Actor
 from mani_skill.utils.structs.pose import Pose
@ -78,6 +78,14 @@ class PickEmbodiedGen(BaseEnv):
        # Add small offset in z-axis to avoid collision.
        self.objs_z_offset = kwargs.pop("objs_z_offset", 0.002)
        self.robot_z_offset = kwargs.pop("robot_z_offset", 0.002)
+        self.camera_cfg = kwargs.pop("camera_cfg", None)
+        if self.camera_cfg is None:
+            self.camera_cfg = dict(
+                camera_eye=[0.9, 0.0, 1.1],
+                camera_target_pt=[0.0, 0.0, 0.9],
+                image_hw=[256, 256],
+                fovy_deg=75,
+            )

        self.layouts = self.init_env_layouts(
            layout_file, num_envs, replace_objs
@ -106,22 +114,30 @@ class PickEmbodiedGen(BaseEnv):
    def init_env_layouts(
        layout_file: str, num_envs: int, replace_objs: bool
    ) -> list[LayoutInfo]:
-        layout = LayoutInfo.from_dict(json.load(open(layout_file, "r")))
        layouts = []
        for env_idx in range(num_envs):
            if replace_objs and env_idx > 0:
-                layout = bfs_placement(deepcopy(layout))
-            layouts.append(layout)
+                layout_info = bfs_placement(layout_file)
+            else:
+                layout_info = json.load(open(layout_file, "r"))
+                layout_info = LayoutInfo.from_dict(layout_info)
+
+            layout_path = layout_file.replace(".json", f"_env{env_idx}.json")
+            with open(layout_path, "w") as f:
+                json.dump(layout_info.to_dict(), f, indent=4)
+
+            layouts.append(layout_path)

        return layouts

    @staticmethod
    def compute_robot_init_pose(
-        layouts: list[LayoutInfo], num_envs: int, z_offset: float = 0.0
+        layouts: list[str], num_envs: int, z_offset: float = 0.0
    ) -> list[list[float]]:
        robot_pose = []
        for env_idx in range(num_envs):
-            layout = layouts[env_idx]
+            layout = json.load(open(layouts[env_idx], "r"))
+            layout = LayoutInfo.from_dict(layout)
            robot_node = layout.relation[Scene3DItemEnum.ROBOT.value]
            x, y, z, qx, qy, qz, qw = layout.position[robot_node]
            robot_pose.append([x, y, z + z_offset, qw, qx, qy, qz])
@ -154,19 +170,27 @@ class PickEmbodiedGen(BaseEnv):
    @property
    def _default_human_render_camera_configs(self):
        pose = sapien_utils.look_at(
-            eye=[0.9, 0.0, 1.1], target=[0.0, 0.0, 0.9]
+            eye=self.camera_cfg["camera_eye"],
+            target=self.camera_cfg["camera_target_pt"],
        )

        return CameraConfig(
-            "render_camera", pose, 256, 256, np.deg2rad(75), 0.01, 100
+            "render_camera",
+            pose,
+            self.camera_cfg["image_hw"][1],
+            self.camera_cfg["image_hw"][0],
+            np.deg2rad(self.camera_cfg["fovy_deg"]),
+            0.01,
+            100,
        )

    def _load_agent(self, options: dict):
+        self.ground = build_ground(self.scene)
        super()._load_agent(options, sapien.Pose(p=[-10, 0, 10]))

    def _load_scene(self, options: dict):
        all_objects = []
-        logger.info(f"Loading assets and decomposition mesh collisions...")
+        logger.info(f"Loading EmbodiedGen assets...")
        for env_idx in range(self.num_envs):
            env_actors = load_assets_from_layout_file(
                self.scene,
@ -229,7 +253,7 @@ class PickEmbodiedGen(BaseEnv):
            self.agent.controller.controllers["gripper"].reset()

    def render_gs3d_images(
-        self, layouts: list[LayoutInfo], num_envs: int, init_quat: list[float]
+        self, layouts: list[str], num_envs: int, init_quat: list[float]
    ) -> dict[str, np.ndarray]:
        sim_coord_align = (
            torch.tensor(SIM_COORD_ALIGN).to(torch.float32).to(self.device)
@ -237,12 +261,18 @@ class PickEmbodiedGen(BaseEnv):
        cameras = self.scene.sensors.copy()
        cameras.update(self.scene.human_render_cameras)

-        bg_node = layouts[0].relation[Scene3DItemEnum.BACKGROUND.value]
-        gs_path = os.path.join(layouts[0].assets[bg_node], "gs_model.ply")
+        # Preload the background Gaussian Splatting model.
+        asset_root = os.path.dirname(layouts[0])
+        layout = LayoutInfo.from_dict(json.load(open(layouts[0], "r")))
+        bg_node = layout.relation[Scene3DItemEnum.BACKGROUND.value]
+        gs_path = os.path.join(
+            asset_root, layout.assets[bg_node], "gs_model.ply"
+        )
        raw_gs: GaussianOperator = GaussianOperator.load_from_ply(gs_path)
        bg_images = dict()
        for env_idx in tqdm(range(num_envs), desc="Pre-rendering Background"):
-            layout = layouts[env_idx]
+            layout = json.load(open(layouts[env_idx], "r"))
+            layout = LayoutInfo.from_dict(layout)
            x, y, z, qx, qy, qz, qw = layout.position[bg_node]
            qx, qy, qz, qw = quaternion_multiply([qx, qy, qz, qw], init_quat)
            init_pose = torch.tensor([x, y, z, qx, qy, qz, qw])
--- a/embodied_gen/scripts/compose_layout.py
+++ b/embodied_gen/scripts/compose_layout.py
@ -50,10 +50,7 @@ def entrypoint(**kwargs):
    out_scene_path = f"{output_dir}/Iscene.glb"
    out_layout_path = f"{output_dir}/layout.json"

-    with open(args.layout_path, "r") as f:
-        layout_info = LayoutInfo.from_dict(json.load(f))
-
-    layout_info = bfs_placement(layout_info, seed=args.seed)
+    layout_info = bfs_placement(args.layout_path, seed=args.seed)
    with open(out_layout_path, "w") as f:
        json.dump(layout_info.to_dict(), f, indent=4)

--- a/embodied_gen/scripts/gen_layout.py
+++ b/embodied_gen/scripts/gen_layout.py
@ -119,11 +119,15 @@ def entrypoint() -> None:
        match_scene_path = f"{os.path.dirname(args.bg_list)}/{match_key}"
        bg_save_dir = os.path.join(output_root, "background")
        copytree(match_scene_path, bg_save_dir, dirs_exist_ok=True)
-        layout_info.assets[bg_node] = bg_save_dir
+        layout_info.assets[bg_node] = "background"

        # BFS layout placement.
+        layout_path = f"{output_root}/layout.json"
+        with open(layout_path, "w") as f:
+            json.dump(layout_info.to_dict(), f, indent=4)
+
        layout_info = bfs_placement(
-            layout_info,
+            layout_path,
            limit_reach_range=True if args.insert_robot else False,
            seed=args.seed_layout,
        )
--- a/embodied_gen/scripts/parallel_sim.py
+++ b/embodied_gen/scripts/parallel_sim.py
@ -20,7 +20,7 @@ from embodied_gen.utils.monkey_patches import monkey_patch_maniskill
 monkey_patch_maniskill()
 import json
 from collections import defaultdict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Literal

 import gymnasium as gym
@ -69,6 +69,18 @@ class ParallelSimConfig:
    reach_target_only: bool = True
    """Whether to only reach target without full action"""

+    # Camera settings
+    camera_eye: list[float] = field(default_factory=lambda: [0.9, 0.0, 1.1])
+    """Camera eye position [x, y, z] in global coordinate system"""
+    camera_target_pt: list[float] = field(
+        default_factory=lambda: [0.0, 0.0, 0.9]
+    )
+    """Camera target (look-at) point [x, y, z] in global coordinate system"""
+    image_hw: list[int] = field(default_factory=lambda: [512, 512])
+    """Rendered image height and width [height, width]"""
+    fovy_deg: float = 75
+    """Camera vertical field of view in degrees"""
+

 def entrypoint(**kwargs):
    if kwargs is None or len(kwargs) == 0:
@ -83,6 +95,12 @@ def entrypoint(**kwargs):
        enable_shadow=cfg.enable_shadow,
        layout_file=cfg.layout_file,
        control_mode=cfg.control_mode,
+        camera_cfg=dict(
+            camera_eye=cfg.camera_eye,
+            camera_target_pt=cfg.camera_target_pt,
+            image_hw=cfg.image_hw,
+            fovy_deg=cfg.fovy_deg,
+        ),
    )
    env = RecordEpisode(
        env,
--- a/embodied_gen/scripts/simulate_sapien.py
+++ b/embodied_gen/scripts/simulate_sapien.py
@ -91,17 +91,16 @@ def entrypoint(**kwargs):
        fovy_deg=cfg.fovy_deg,
    )
    with open(cfg.layout_path, "r") as f:
-        layout_data = json.load(f)
-        layout_data: LayoutInfo = LayoutInfo.from_dict(layout_data)
+        layout_data: LayoutInfo = LayoutInfo.from_dict(json.load(f))

    actors = load_assets_from_layout_file(
        scene_manager.scene,
-        layout_data,
+        cfg.layout_path,
        cfg.z_offset,
        cfg.init_quat,
    )
    agent = load_mani_skill_robot(
-        scene_manager.scene, layout_data, cfg.control_freq
+        scene_manager.scene, cfg.layout_path, cfg.control_freq
    )

    frames = defaultdict(list)
@ -134,8 +133,9 @@ def entrypoint(**kwargs):
    if "Foreground" not in cfg.render_keys:
        return

+    asset_root = os.path.dirname(cfg.layout_path)
    bg_node = layout_data.relation[Scene3DItemEnum.BACKGROUND.value]
-    gs_path = f"{layout_data.assets[bg_node]}/gs_model.ply"
+    gs_path = f"{asset_root}/{layout_data.assets[bg_node]}/gs_model.ply"
    gs_model: GaussianOperator = GaussianOperator.load_from_ply(gs_path)
    x, y, z, qx, qy, qz, qw = layout_data.position[bg_node]
    qx, qy, qz, qw = quaternion_multiply([qx, qy, qz, qw], cfg.init_quat)
--- a/embodied_gen/scripts/textto3d.py
+++ b/embodied_gen/scripts/textto3d.py
@ -187,7 +187,7 @@ def text_to_3d(**kwargs) -> dict:
            logger.warning(
                f"Node {node}, {TXTGEN_CHECKER.__class__.__name__}: {qa_result}"
            )
-            results["assets"][node] = f"{node_save_dir}/result"
+            results["assets"][node] = f"asset3d/{save_node}/result"
            results["quality"][node] = qa_result

            if qa_flag is None or qa_flag is True:
--- a/embodied_gen/utils/geometry.py
+++ b/embodied_gen/utils/geometry.py
@ -14,6 +14,7 @@
 # implied. See the License for the specific language governing
 # permissions and limitations under the License.

+import json
 import os
 import random
 from collections import defaultdict, deque
@ -32,7 +33,6 @@ from embodied_gen.utils.enum import LayoutInfo, Scene3DItemEnum
 from embodied_gen.utils.log import logger

 __all__ = [
-    "bfs_placement",
    "with_seed",
    "matrix_to_pose",
    "pose_to_matrix",
@ -222,7 +222,7 @@ def check_reachable(

@with_seed("seed")
 def bfs_placement(
-    layout_info: LayoutInfo,
+    layout_file: str,
    floor_margin: float = 0,
    beside_margin: float = 0.1,
    max_attempts: int = 3000,
@ -232,6 +232,8 @@ def bfs_placement(
    robot_dim: float = 0.12,
    seed: int = None,
 ) -> LayoutInfo:
+    layout_info = LayoutInfo.from_dict(json.load(open(layout_file, "r")))
+    asset_dir = os.path.dirname(layout_file)
    object_mapping = layout_info.objs_mapping
    position = {}  # node: [x, y, z, qx, qy, qz, qw]
    parent_bbox_xy = {}
@ -254,6 +256,7 @@ def bfs_placement(
        mesh_path = (
            f"{layout_info.assets[node]}/mesh/{node.replace(' ', '_')}.obj"
        )
+        mesh_path = os.path.join(asset_dir, mesh_path)
        mesh_info[node]["path"] = mesh_path
        mesh = trimesh.load(mesh_path)
        vertices = mesh.vertices
--- a/embodied_gen/utils/monkey_patches.py
+++ b/embodied_gen/utils/monkey_patches.py
@ -175,7 +175,7 @@ def monkey_patch_maniskill():
                seg_labels = camera.get_obs(
                    rgb=False, depth=False, segmentation=True, position=False
                )["segmentation"]
-                masks = np.where((seg_labels.cpu() > 0), 255, 0).astype(
+                masks = np.where((seg_labels.cpu() > 1), 255, 0).astype(
                    np.uint8
                )
                masks = torch.tensor(masks).to(color.device)
--- a/embodied_gen/utils/simulation.py
+++ b/embodied_gen/utils/simulation.py
@ -124,7 +124,7 @@ def load_actor_from_urdf(

 def load_assets_from_layout_file(
    scene: ManiSkillScene | sapien.Scene,
-    layout: LayoutInfo | str,
+    layout: str,
    z_offset: float = 0.0,
    init_quat: list[float] = [0, 0, 0, 1],
    env_idx: int = None,
@ -133,19 +133,18 @@ def load_assets_from_layout_file(

    Args:
        scene (sapien.Scene | ManiSkillScene): The SAPIEN or ManiSkill scene to load assets into.
-        layout (LayoutInfo): The layout information data.
+        layout (str): The layout file path.
        z_offset (float): Offset to apply to the Z-coordinate of non-context objects.
        init_quat (List[float]): Initial quaternion (x, y, z, w) for orientation adjustment.
        env_idx (int): Environment index for multi-environment setup.
    """
-    if isinstance(layout, str) and layout.endswith(".json"):
-        layout = LayoutInfo.from_dict(json.load(open(layout, "r")))
-
+    asset_root = os.path.dirname(layout)
+    layout = LayoutInfo.from_dict(json.load(open(layout, "r")))
    actors = dict()
    for node in layout.assets:
        file_dir = layout.assets[node]
        file_name = f"{node.replace(' ', '_')}.urdf"
-        urdf_file = os.path.join(file_dir, file_name)
+        urdf_file = os.path.join(asset_root, file_dir, file_name)

        if layout.objs_mapping[node] == Scene3DItemEnum.BACKGROUND.value:
            continue