refactor(pipe): Adapt to huggingface space. (#3)

Adapt to huggingface space.
This commit is contained in:
Xinjie 2025-06-12 11:59:29 +08:00 committed by GitHub
parent 18075659de
commit e29807bd62
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 92 additions and 95 deletions

View File

@ -25,8 +25,8 @@
## 🚀 Quick Start ## 🚀 Quick Start
```sh ```sh
git clone https://github.com/HorizonRobotics/EmbodiedGen git clone https://github.com/HorizonRobotics/EmbodiedGen.git
cd EmbodiedGen cd EmbodiedGen && git submodule update --init --recursive
conda create -n embodiedgen python=3.10.13 -y conda create -n embodiedgen python=3.10.13 -y
conda activate embodiedgen conda activate embodiedgen
pip install -r requirements.txt --use-deprecated=legacy-resolver pip install -r requirements.txt --use-deprecated=legacy-resolver
@ -42,7 +42,7 @@ Update the API key in file: `embodied_gen/utils/gpt_config.yaml`.
You can choose between two backends for the GPT agent: You can choose between two backends for the GPT agent:
- **`gpt-4o`** (Recommended) Use this if you have access to **Azure OpenAI**. - **`gpt-4o`** (Recommended) Use this if you have access to **Azure OpenAI**.
- **`qwen2.5-vl`** An open alternative with free usage via [OpenRouter](https://openrouter.ai/settings/keys) (50 free requests per day) - **`qwen2.5-vl`** An alternative with free usage via OpenRouter, apply a free key [here](https://openrouter.ai/settings/keys) and update `api_key` in `embodied_gen/utils/gpt_config.yaml` (50 free requests per day)
--- ---

View File

@ -35,7 +35,7 @@ from gradio.themes.utils.colors import gray, neutral, slate, stone, teal, zinc
from PIL import Image from PIL import Image
from embodied_gen.data.backproject_v2 import entrypoint as backproject_api from embodied_gen.data.backproject_v2 import entrypoint as backproject_api
from embodied_gen.data.differentiable_render import entrypoint as render_api from embodied_gen.data.differentiable_render import entrypoint as render_api
from embodied_gen.data.utils import trellis_preprocess from embodied_gen.data.utils import trellis_preprocess, zip_files
from embodied_gen.models.delight_model import DelightingModel from embodied_gen.models.delight_model import DelightingModel
from embodied_gen.models.gs_model import GaussianOperator from embodied_gen.models.gs_model import GaussianOperator
from embodied_gen.models.segment_model import ( from embodied_gen.models.segment_model import (
@ -64,7 +64,7 @@ from embodied_gen.validators.quality_checkers import (
ImageSegChecker, ImageSegChecker,
MeshGeoChecker, MeshGeoChecker,
) )
from embodied_gen.validators.urdf_convertor import URDFGenerator, zip_files from embodied_gen.validators.urdf_convertor import URDFGenerator
current_file_path = os.path.abspath(__file__) current_file_path = os.path.abspath(__file__)
current_dir = os.path.dirname(current_file_path) current_dir = os.path.dirname(current_file_path)

View File

@ -24,7 +24,10 @@ from collections import defaultdict
from typing import List, Union from typing import List, Union
import cv2 import cv2
import imageio
import numpy as np
import nvdiffrast.torch as dr import nvdiffrast.torch as dr
import PIL.Image as Image
import torch import torch
from tqdm import tqdm from tqdm import tqdm
from embodied_gen.data.utils import ( from embodied_gen.data.utils import (
@ -39,10 +42,6 @@ from embodied_gen.data.utils import (
render_pbr, render_pbr,
save_images, save_images,
) )
from embodied_gen.utils.process_media import (
create_gif_from_images,
create_mp4_from_images,
)
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser( os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser(
@ -54,7 +53,66 @@ logging.basicConfig(
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
__all__ = ["ImageRender"] __all__ = [
"ImageRender",
"create_mp4_from_images",
"create_gif_from_images",
]
def create_mp4_from_images(
    images: list[np.ndarray],
    output_path: str,
    fps: int = 10,
    prompt: str | None = None,
) -> None:
    """Encode a sequence of float images into an MP4 video file.

    Each input array is clipped to [0, 1], scaled to uint8, and reduced
    to its first three channels (any alpha channel is dropped). If
    ``prompt`` is given, it is drawn onto every frame with OpenCV before
    the frame is appended to the video.

    Args:
        images: Frames as float arrays with values in [0, 1]; assumed to
            have at least 3 channels — TODO confirm against callers.
        output_path: Destination path of the MP4 file.
        fps: Frames per second of the output video.
        prompt: Optional text overlaid near the top-left of each frame.
    """
    # Fixed overlay style for the optional prompt text.
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1
    color = (255, 255, 255)  # white
    position = (20, 25)  # pixel coordinates of the text baseline
    with imageio.get_writer(output_path, fps=fps) as writer:
        for image in images:
            # Normalize float [0, 1] data to uint8 and drop alpha.
            image = image.clip(min=0, max=1)
            image = (255.0 * image).astype(np.uint8)
            image = image[..., :3]
            if prompt is not None:
                cv2.putText(
                    image,
                    prompt,
                    position,
                    font,
                    font_scale,
                    color,
                    font_thickness,
                )
            writer.append_data(image)
    logger.info(f"MP4 video saved to {output_path}")
def create_gif_from_images(
    images: list[np.ndarray], output_path: str, fps: int = 10
) -> None:
    """Save a sequence of float RGBA images as a looping animated GIF.

    Each array is clipped to [0, 1], scaled to uint8, interpreted as an
    RGBA image, and converted to RGB before being appended to the GIF.

    Args:
        images: Frames as float arrays with values in [0, 1].
        output_path: Destination path of the GIF file.
        fps: Playback rate; converted to a per-frame duration in ms.

    NOTE(review): ``Image.fromarray(..., mode="RGBA")`` requires every
    frame to have exactly 4 channels, and ``pil_images[0]`` raises
    IndexError on an empty ``images`` list — confirm callers guarantee
    both.
    """
    pil_images = []
    for image in images:
        image = image.clip(min=0, max=1)
        image = (255.0 * image).astype(np.uint8)
        image = Image.fromarray(image, mode="RGBA")
        pil_images.append(image.convert("RGB"))
    duration = 1000 // fps  # per-frame display time in milliseconds
    pil_images[0].save(
        output_path,
        save_all=True,
        append_images=pil_images[1:],
        duration=duration,
        loop=0,  # 0 means loop forever
    )
    logger.info(f"GIF saved to {output_path}")
class ImageRender(object): class ImageRender(object):

View File

@ -139,7 +139,9 @@ class DiffrastRender(object):
vertices: torch.Tensor, vertices: torch.Tensor,
matrix: torch.Tensor, matrix: torch.Tensor,
) -> torch.Tensor: ) -> torch.Tensor:
verts_ones = torch.ones((len(vertices), 1)).to(vertices) verts_ones = torch.ones(
(len(vertices), 1), device=vertices.device, dtype=vertices.dtype
)
verts_homo = torch.cat([vertices, verts_ones], dim=-1) verts_homo = torch.cat([vertices, verts_ones], dim=-1)
trans_vertices = torch.matmul(verts_homo, matrix.permute(0, 2, 1)) trans_vertices = torch.matmul(verts_homo, matrix.permute(0, 2, 1))

View File

@ -185,9 +185,8 @@ if __name__ == "__main__":
text_prompt="What is the content in each image?", text_prompt="What is the content in each image?",
image_base64=combine_images_to_base64( image_base64=combine_images_to_base64(
[ [
"outputs/text2image/demo_objects/bed/sample_0.jpg", "apps/assets/example_image/sample_02.jpg",
"outputs/imageto3d/v2/cups/sample_69/URDF_sample_69/qa_renders/image_color/003.png", # noqa "apps/assets/example_image/sample_03.jpg",
"outputs/text2image/demo_objects/cardboard/sample_1.jpg",
] ]
), # input raw image_path if only one image ), # input raw image_path if only one image
) )
@ -196,10 +195,8 @@ if __name__ == "__main__":
response = GPT_CLIENT.query( response = GPT_CLIENT.query(
text_prompt="What is the content in the images?", text_prompt="What is the content in the images?",
image_base64=[ image_base64=[
Image.open("outputs/text2image/demo_objects/bed/sample_0.jpg"), Image.open("apps/assets/example_image/sample_02.jpg"),
Image.open( Image.open("apps/assets/example_image/sample_03.jpg"),
"outputs/imageto3d/v2/cups/sample_69/URDF_sample_69/qa_renders/image_color/003.png" # noqa
),
], ],
) )
print(response) print(response)

View File

@ -9,6 +9,6 @@ gpt-4o:
qwen2.5-vl: qwen2.5-vl:
endpoint: https://openrouter.ai/api/v1 endpoint: https://openrouter.ai/api/v1
api_key: sk-or-v1-[REDACTED — leaked key removed from this record; it was already replaced with a placeholder by this commit and must be revoked] api_key: sk-or-v1-xxx
api_version: null api_version: null
model_name: qwen/qwen2.5-vl-72b-instruct:free model_name: qwen/qwen2.5-vl-72b-instruct:free

View File

@ -19,7 +19,6 @@ import base64
import logging import logging
import math import math
import os import os
import subprocess
import sys import sys
from glob import glob from glob import glob
from io import BytesIO from io import BytesIO
@ -33,6 +32,7 @@ import spaces
import torch import torch
from moviepy.editor import VideoFileClip, clips_array from moviepy.editor import VideoFileClip, clips_array
from tqdm import tqdm from tqdm import tqdm
from embodied_gen.data.differentiable_render import entrypoint as render_api
current_file_path = os.path.abspath(__file__) current_file_path = os.path.abspath(__file__)
current_dir = os.path.dirname(current_file_path) current_dir = os.path.dirname(current_file_path)
@ -56,8 +56,6 @@ __all__ = [
"combine_images_to_base64", "combine_images_to_base64",
"render_mesh", "render_mesh",
"render_video", "render_video",
"create_mp4_from_images",
"create_gif_from_images",
] ]
@ -75,34 +73,25 @@ def render_asset3d(
gen_viewnormal_mp4: bool = False, gen_viewnormal_mp4: bool = False,
gen_glonormal_mp4: bool = False, gen_glonormal_mp4: bool = False,
) -> list[str]: ) -> list[str]:
command = [ input_args = dict(
"python3", mesh_path=mesh_path,
"embodied_gen/data/differentiable_render.py", output_root=output_root,
"--mesh_path", uuid=output_subdir,
mesh_path, distance=distance,
"--output_root", num_images=num_images,
output_root, elevation=elevation,
"--uuid", pbr_light_factor=pbr_light_factor,
output_subdir, with_mtl=True,
"--distance", )
str(distance),
"--num_images",
str(num_images),
"--elevation",
*map(str, elevation),
"--pbr_light_factor",
str(pbr_light_factor),
"--with_mtl",
]
if gen_color_mp4: if gen_color_mp4:
command.append("--gen_color_mp4") input_args["gen_color_mp4"] = True
if gen_viewnormal_mp4: if gen_viewnormal_mp4:
command.append("--gen_viewnormal_mp4") input_args["gen_viewnormal_mp4"] = True
if gen_glonormal_mp4: if gen_glonormal_mp4:
command.append("--gen_glonormal_mp4") input_args["gen_glonormal_mp4"] = True
try: try:
subprocess.run(command, check=True) _ = render_api(**input_args)
except subprocess.CalledProcessError as e: except Exception as e:
logger.error(f"Error occurred during rendering: {e}.") logger.error(f"Error occurred during rendering: {e}.")
dst_paths = glob(os.path.join(output_root, output_subdir, return_key)) dst_paths = glob(os.path.join(output_root, output_subdir, return_key))
@ -263,54 +252,6 @@ def render_video(
return result return result
def create_mp4_from_images(
    images: list[np.ndarray],
    output_path: str,
    fps: int = 10,
    prompt: str | None = None,
) -> None:
    """Encode a sequence of float images into an MP4 video file.

    Each input array is clipped to [0, 1], scaled to uint8, and reduced
    to its first three channels (any alpha channel is dropped). If
    ``prompt`` is given, it is drawn onto every frame with OpenCV before
    the frame is appended to the video.

    Args:
        images: Frames as float arrays with values in [0, 1]; assumed to
            have at least 3 channels — TODO confirm against callers.
        output_path: Destination path of the MP4 file.
        fps: Frames per second of the output video.
        prompt: Optional text overlaid near the top-left of each frame.
    """
    # Fixed overlay style for the optional prompt text.
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1
    color = (255, 255, 255)  # white
    position = (20, 25)  # pixel coordinates of the text baseline
    with imageio.get_writer(output_path, fps=fps) as writer:
        for image in images:
            # Normalize float [0, 1] data to uint8 and drop alpha.
            image = image.clip(min=0, max=1)
            image = (255.0 * image).astype(np.uint8)
            image = image[..., :3]
            if prompt is not None:
                cv2.putText(
                    image,
                    prompt,
                    position,
                    font,
                    font_scale,
                    color,
                    font_thickness,
                )
            writer.append_data(image)
    logger.info(f"MP4 video saved to {output_path}")
def create_gif_from_images(
    images: list[np.ndarray], output_path: str, fps: int = 10
) -> None:
    """Save a sequence of float RGBA images as a looping animated GIF.

    Each array is clipped to [0, 1], scaled to uint8, interpreted as an
    RGBA image, and converted to RGB before being appended to the GIF.

    Args:
        images: Frames as float arrays with values in [0, 1].
        output_path: Destination path of the GIF file.
        fps: Playback rate; converted to a per-frame duration in ms.

    NOTE(review): ``Image.fromarray(..., mode="RGBA")`` requires every
    frame to have exactly 4 channels, and ``pil_images[0]`` raises
    IndexError on an empty ``images`` list — confirm callers guarantee
    both.
    """
    pil_images = []
    for image in images:
        image = image.clip(min=0, max=1)
        image = (255.0 * image).astype(np.uint8)
        image = Image.fromarray(image, mode="RGBA")
        pil_images.append(image.convert("RGB"))
    duration = 1000 // fps  # per-frame display time in milliseconds
    pil_images[0].save(
        output_path,
        save_all=True,
        append_images=pil_images[1:],
        duration=duration,
        loop=0,  # 0 means loop forever
    )
    logger.info(f"GIF saved to {output_path}")
if __name__ == "__main__": if __name__ == "__main__":
# Example usage: # Example usage:
merge_video_video( merge_video_video(

View File

@ -24,7 +24,6 @@ from xml.dom.minidom import parseString
import numpy as np import numpy as np
import trimesh import trimesh
from embodied_gen.data.utils import zip_files
from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient
from embodied_gen.utils.process_media import render_asset3d from embodied_gen.utils.process_media import render_asset3d
from embodied_gen.utils.tags import VERSION from embodied_gen.utils.tags import VERSION