From e29807bd623fdb94355275a1c278b4353c3c1428 Mon Sep 17 00:00:00 2001 From: Xinjie Date: Thu, 12 Jun 2025 11:59:29 +0800 Subject: [PATCH] refactor(pipe): Adapt to huggingface space. (#3) Adapt to huggingface space. --- README.md | 6 +- apps/common.py | 4 +- embodied_gen/data/differentiable_render.py | 68 ++++++++++++++-- embodied_gen/data/utils.py | 4 +- embodied_gen/utils/gpt_clients.py | 11 +-- embodied_gen/utils/gpt_config.yaml | 2 +- embodied_gen/utils/process_media.py | 91 ++++------------------ embodied_gen/validators/urdf_convertor.py | 1 - 8 files changed, 92 insertions(+), 95 deletions(-) diff --git a/README.md b/README.md index c86525d..f2a8d28 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,8 @@ ## 🚀 Quick Start ```sh -git clone https://github.com/HorizonRobotics/EmbodiedGen -cd EmbodiedGen +git clone https://github.com/HorizonRobotics/EmbodiedGen.git +cd EmbodiedGen && git submodule update --init --recursive conda create -n embodiedgen python=3.10.13 -y conda activate embodiedgen pip install -r requirements.txt --use-deprecated=legacy-resolver @@ -42,7 +42,7 @@ Update the API key in file: `embodied_gen/utils/gpt_config.yaml`. You can choose between two backends for the GPT agent: - **`gpt-4o`** (Recommended) – Use this if you have access to **Azure OpenAI**. -- **`qwen2.5-vl`** – An open alternative with free usage via [OpenRouter](https://openrouter.ai/settings/keys) (50 free requests per day) +- **`qwen2.5-vl`** – An alternative with free usage via OpenRouter, apply a free key [here](https://openrouter.ai/settings/keys) and update `api_key` in `embodied_gen/utils/gpt_config.yaml` (50 free requests per day) --- diff --git a/apps/common.py b/apps/common.py index 11b7d6a..bbef969 100644 --- a/apps/common.py +++ b/apps/common.py @@ -35,7 +35,7 @@ from gradio.themes.utils.colors import gray, neutral, slate, stone, teal, zinc from PIL import Image from embodied_gen.data.backproject_v2 import entrypoint as backproject_api from embodied_gen.data.differentiable_render import entrypoint as render_api -from embodied_gen.data.utils import trellis_preprocess +from embodied_gen.data.utils import trellis_preprocess, zip_files from embodied_gen.models.delight_model import DelightingModel from embodied_gen.models.gs_model import GaussianOperator from embodied_gen.models.segment_model import ( @@ -64,7 +64,7 @@ from embodied_gen.validators.quality_checkers import ( ImageSegChecker, MeshGeoChecker, ) -from embodied_gen.validators.urdf_convertor import URDFGenerator, zip_files +from embodied_gen.validators.urdf_convertor import URDFGenerator current_file_path = os.path.abspath(__file__) current_dir = os.path.dirname(current_file_path) diff --git a/embodied_gen/data/differentiable_render.py b/embodied_gen/data/differentiable_render.py index 18b0a86..4d749a4 100644 --- a/embodied_gen/data/differentiable_render.py +++ b/embodied_gen/data/differentiable_render.py @@ -24,7 +24,10 @@ from collections import defaultdict from typing import List, Union import cv2 +import imageio +import numpy as np import nvdiffrast.torch as dr +import PIL.Image as Image import torch from tqdm import tqdm from embodied_gen.data.utils import ( @@ -39,10 +42,6 @@ from embodied_gen.data.utils import ( render_pbr, save_images, ) -from embodied_gen.utils.process_media import ( - create_gif_from_images, - create_mp4_from_images, -) os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser( @@ -54,7 +53,66 @@ logging.basicConfig( logger = logging.getLogger(__name__) -__all__ = ["ImageRender"] +__all__ = [ + "ImageRender", + "create_mp4_from_images", + "create_gif_from_images", +] + + +def create_mp4_from_images( + images: list[np.ndarray], + output_path: str, + fps: int = 10, + prompt: str = None, +): + font = cv2.FONT_HERSHEY_SIMPLEX + font_scale = 0.5 + font_thickness = 1 + color = (255, 255, 255) + position = (20, 25) + + with imageio.get_writer(output_path, fps=fps) as writer: + for image in images: + image = image.clip(min=0, max=1) + image = (255.0 * image).astype(np.uint8) + image = image[..., :3] + if prompt is not None: + cv2.putText( + image, + prompt, + position, + font, + font_scale, + color, + font_thickness, + ) + + writer.append_data(image) + + logger.info(f"MP4 video saved to {output_path}") + + +def create_gif_from_images( + images: list[np.ndarray], output_path: str, fps: int = 10 +) -> None: + pil_images = [] + for image in images: + image = image.clip(min=0, max=1) + image = (255.0 * image).astype(np.uint8) + image = Image.fromarray(image, mode="RGBA") + pil_images.append(image.convert("RGB")) + + duration = 1000 // fps + pil_images[0].save( + output_path, + save_all=True, + append_images=pil_images[1:], + duration=duration, + loop=0, + ) + + logger.info(f"GIF saved to {output_path}") class ImageRender(object): diff --git a/embodied_gen/data/utils.py b/embodied_gen/data/utils.py index 31a65f8..b8d632c 100644 --- a/embodied_gen/data/utils.py +++ b/embodied_gen/data/utils.py @@ -139,7 +139,9 @@ class DiffrastRender(object): vertices: torch.Tensor, matrix: torch.Tensor, ) -> torch.Tensor: - verts_ones = torch.ones((len(vertices), 1)).to(vertices) + verts_ones = torch.ones( + (len(vertices), 1), device=vertices.device, dtype=vertices.dtype + ) verts_homo = torch.cat([vertices, verts_ones], dim=-1) trans_vertices = torch.matmul(verts_homo, matrix.permute(0, 2, 1)) diff --git a/embodied_gen/utils/gpt_clients.py b/embodied_gen/utils/gpt_clients.py index 7b3f72b..f7ce067 100644 --- a/embodied_gen/utils/gpt_clients.py +++ b/embodied_gen/utils/gpt_clients.py @@ -185,9 +185,8 @@ if __name__ == "__main__": text_prompt="What is the content in each image?", image_base64=combine_images_to_base64( [ - "outputs/text2image/demo_objects/bed/sample_0.jpg", - "outputs/imageto3d/v2/cups/sample_69/URDF_sample_69/qa_renders/image_color/003.png", # noqa - "outputs/text2image/demo_objects/cardboard/sample_1.jpg", + "apps/assets/example_image/sample_02.jpg", + "apps/assets/example_image/sample_03.jpg", ] ), # input raw image_path if only one image ) @@ -196,10 +195,8 @@ if __name__ == "__main__": response = GPT_CLIENT.query( text_prompt="What is the content in the images?", image_base64=[ - Image.open("outputs/text2image/demo_objects/bed/sample_0.jpg"), - Image.open( - "outputs/imageto3d/v2/cups/sample_69/URDF_sample_69/qa_renders/image_color/003.png" # noqa - ), + Image.open("apps/assets/example_image/sample_02.jpg"), + Image.open("apps/assets/example_image/sample_03.jpg"), ], ) print(response) diff --git a/embodied_gen/utils/gpt_config.yaml b/embodied_gen/utils/gpt_config.yaml index 2e966bf..67b78b3 100644 --- a/embodied_gen/utils/gpt_config.yaml +++ b/embodied_gen/utils/gpt_config.yaml @@ -9,6 +9,6 @@ gpt-4o: qwen2.5-vl: endpoint: https://openrouter.ai/api/v1 - api_key: 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 + api_key: sk-or-v1-xxx api_version: null model_name: qwen/qwen2.5-vl-72b-instruct:free diff --git a/embodied_gen/utils/process_media.py b/embodied_gen/utils/process_media.py index c5708e6..2d47c69 100644 --- a/embodied_gen/utils/process_media.py +++ b/embodied_gen/utils/process_media.py @@ -19,7 +19,6 @@ import base64 import logging import math import os -import subprocess import sys from glob import glob from io import BytesIO @@ -33,6 +32,7 @@ import spaces import torch from moviepy.editor import VideoFileClip, clips_array from tqdm import tqdm +from embodied_gen.data.differentiable_render import entrypoint as render_api current_file_path = os.path.abspath(__file__) current_dir = os.path.dirname(current_file_path) @@ -56,8 +56,6 @@ __all__ = [ "combine_images_to_base64", "render_mesh", "render_video", - "create_mp4_from_images", - "create_gif_from_images", ] @@ -75,34 +73,25 @@ def render_asset3d( gen_viewnormal_mp4: bool = False, gen_glonormal_mp4: bool = False, ) -> list[str]: - command = [ - "python3", - "embodied_gen/data/differentiable_render.py", - "--mesh_path", - mesh_path, - "--output_root", - output_root, - "--uuid", - output_subdir, - "--distance", - str(distance), - "--num_images", - str(num_images), - "--elevation", - *map(str, elevation), - "--pbr_light_factor", - str(pbr_light_factor), - "--with_mtl", - ] + input_args = dict( + mesh_path=mesh_path, + output_root=output_root, + uuid=output_subdir, + distance=distance, + num_images=num_images, + elevation=elevation, + pbr_light_factor=pbr_light_factor, + with_mtl=True, + ) if gen_color_mp4: - command.append("--gen_color_mp4") + input_args["gen_color_mp4"] = True if gen_viewnormal_mp4: - command.append("--gen_viewnormal_mp4") + input_args["gen_viewnormal_mp4"] = True if gen_glonormal_mp4: - command.append("--gen_glonormal_mp4") + input_args["gen_glonormal_mp4"] = True try: - subprocess.run(command, check=True) - except subprocess.CalledProcessError as e: + _ = render_api(**input_args) + except Exception as e: logger.error(f"Error occurred during rendering: {e}.") dst_paths = glob(os.path.join(output_root, output_subdir, return_key)) @@ -263,54 +252,6 @@ def render_video( return result -def create_mp4_from_images(images, output_path, fps=10, prompt=None): - font = cv2.FONT_HERSHEY_SIMPLEX - font_scale = 0.5 - font_thickness = 1 - color = (255, 255, 255) - position = (20, 25) - - with imageio.get_writer(output_path, fps=fps) as writer: - for image in images: - image = image.clip(min=0, max=1) - image = (255.0 * image).astype(np.uint8) - image = image[..., :3] - if prompt is not None: - cv2.putText( - image, - prompt, - position, - font, - font_scale, - color, - font_thickness, - ) - - writer.append_data(image) - - logger.info(f"MP4 video saved to {output_path}") - - -def create_gif_from_images(images, output_path, fps=10): - pil_images = [] - for image in images: - image = image.clip(min=0, max=1) - image = (255.0 * image).astype(np.uint8) - image = Image.fromarray(image, mode="RGBA") - pil_images.append(image.convert("RGB")) - - duration = 1000 // fps - pil_images[0].save( - output_path, - save_all=True, - append_images=pil_images[1:], - duration=duration, - loop=0, - ) - - logger.info(f"GIF saved to {output_path}") - - if __name__ == "__main__": # Example usage: merge_video_video( diff --git a/embodied_gen/validators/urdf_convertor.py b/embodied_gen/validators/urdf_convertor.py index eed9e07..a830519 100644 --- a/embodied_gen/validators/urdf_convertor.py +++ b/embodied_gen/validators/urdf_convertor.py @@ -24,7 +24,6 @@ from xml.dom.minidom import parseString import numpy as np import trimesh -from embodied_gen.data.utils import zip_files from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient from embodied_gen.utils.process_media import render_asset3d from embodied_gen.utils.tags import VERSION