refactor(pipe): Adapt to huggingface space. (#3)

Adapt to huggingface space.
This commit is contained in:
Xinjie 2025-06-12 11:59:29 +08:00 committed by GitHub
parent 18075659de
commit e29807bd62
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 92 additions and 95 deletions

View File

@ -25,8 +25,8 @@
## 🚀 Quick Start ## 🚀 Quick Start
```sh ```sh
git clone https://github.com/HorizonRobotics/EmbodiedGen git clone https://github.com/HorizonRobotics/EmbodiedGen.git
cd EmbodiedGen cd EmbodiedGen && git submodule update --init --recursive
conda create -n embodiedgen python=3.10.13 -y conda create -n embodiedgen python=3.10.13 -y
conda activate embodiedgen conda activate embodiedgen
pip install -r requirements.txt --use-deprecated=legacy-resolver pip install -r requirements.txt --use-deprecated=legacy-resolver
@ -42,7 +42,7 @@ Update the API key in file: `embodied_gen/utils/gpt_config.yaml`.
You can choose between two backends for the GPT agent: You can choose between two backends for the GPT agent:
- **`gpt-4o`** (Recommended) Use this if you have access to **Azure OpenAI**. - **`gpt-4o`** (Recommended) Use this if you have access to **Azure OpenAI**.
- **`qwen2.5-vl`** An open alternative with free usage via [OpenRouter](https://openrouter.ai/settings/keys) (50 free requests per day) - **`qwen2.5-vl`** An alternative with free usage via OpenRouter, apply a free key [here](https://openrouter.ai/settings/keys) and update `api_key` in `embodied_gen/utils/gpt_config.yaml` (50 free requests per day)
--- ---

View File

@ -35,7 +35,7 @@ from gradio.themes.utils.colors import gray, neutral, slate, stone, teal, zinc
from PIL import Image from PIL import Image
from embodied_gen.data.backproject_v2 import entrypoint as backproject_api from embodied_gen.data.backproject_v2 import entrypoint as backproject_api
from embodied_gen.data.differentiable_render import entrypoint as render_api from embodied_gen.data.differentiable_render import entrypoint as render_api
from embodied_gen.data.utils import trellis_preprocess from embodied_gen.data.utils import trellis_preprocess, zip_files
from embodied_gen.models.delight_model import DelightingModel from embodied_gen.models.delight_model import DelightingModel
from embodied_gen.models.gs_model import GaussianOperator from embodied_gen.models.gs_model import GaussianOperator
from embodied_gen.models.segment_model import ( from embodied_gen.models.segment_model import (
@ -64,7 +64,7 @@ from embodied_gen.validators.quality_checkers import (
ImageSegChecker, ImageSegChecker,
MeshGeoChecker, MeshGeoChecker,
) )
from embodied_gen.validators.urdf_convertor import URDFGenerator, zip_files from embodied_gen.validators.urdf_convertor import URDFGenerator
current_file_path = os.path.abspath(__file__) current_file_path = os.path.abspath(__file__)
current_dir = os.path.dirname(current_file_path) current_dir = os.path.dirname(current_file_path)

View File

@ -24,7 +24,10 @@ from collections import defaultdict
from typing import List, Union from typing import List, Union
import cv2 import cv2
import imageio
import numpy as np
import nvdiffrast.torch as dr import nvdiffrast.torch as dr
import PIL.Image as Image
import torch import torch
from tqdm import tqdm from tqdm import tqdm
from embodied_gen.data.utils import ( from embodied_gen.data.utils import (
@ -39,10 +42,6 @@ from embodied_gen.data.utils import (
render_pbr, render_pbr,
save_images, save_images,
) )
from embodied_gen.utils.process_media import (
create_gif_from_images,
create_mp4_from_images,
)
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser( os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser(
@ -54,7 +53,66 @@ logging.basicConfig(
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
__all__ = ["ImageRender"] __all__ = [
"ImageRender",
"create_mp4_from_images",
"create_gif_from_images",
]
def create_mp4_from_images(
    images: list[np.ndarray],
    output_path: str,
    fps: int = 10,
    prompt: str | None = None,
) -> None:
    """Encode a sequence of float images into an MP4 video file.

    Each input array is clipped to [0, 1], scaled to uint8, and reduced
    to its first three channels (any alpha channel is dropped). If
    ``prompt`` is given, it is drawn onto every frame with OpenCV before
    the frame is appended to the video.

    Args:
        images: Frames as float arrays with values in [0, 1]; assumed to
            have at least 3 channels — TODO confirm against callers.
        output_path: Destination path of the MP4 file.
        fps: Frames per second of the output video.
        prompt: Optional text overlaid near the top-left of each frame.
    """
    # Fixed overlay style for the optional prompt text.
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1
    color = (255, 255, 255)  # white
    position = (20, 25)  # pixel coordinates of the text baseline
    with imageio.get_writer(output_path, fps=fps) as writer:
        for image in images:
            # Normalize float [0, 1] data to uint8 and drop alpha.
            image = image.clip(min=0, max=1)
            image = (255.0 * image).astype(np.uint8)
            image = image[..., :3]
            if prompt is not None:
                cv2.putText(
                    image,
                    prompt,
                    position,
                    font,
                    font_scale,
                    color,
                    font_thickness,
                )
            writer.append_data(image)
    logger.info(f"MP4 video saved to {output_path}")
def create_gif_from_images(
    images: list[np.ndarray], output_path: str, fps: int = 10
) -> None:
    """Save a sequence of float RGBA images as a looping animated GIF.

    Each array is clipped to [0, 1], scaled to uint8, interpreted as an
    RGBA image, and converted to RGB before being appended to the GIF.

    Args:
        images: Frames as float arrays with values in [0, 1].
        output_path: Destination path of the GIF file.
        fps: Playback rate; converted to a per-frame duration in ms.

    NOTE(review): ``Image.fromarray(..., mode="RGBA")`` requires every
    frame to have exactly 4 channels, and ``pil_images[0]`` raises
    IndexError on an empty ``images`` list — confirm callers guarantee
    both.
    """
    pil_images = []
    for image in images:
        image = image.clip(min=0, max=1)
        image = (255.0 * image).astype(np.uint8)
        image = Image.fromarray(image, mode="RGBA")
        pil_images.append(image.convert("RGB"))
    duration = 1000 // fps  # per-frame display time in milliseconds
    pil_images[0].save(
        output_path,
        save_all=True,
        append_images=pil_images[1:],
        duration=duration,
        loop=0,  # 0 means loop forever
    )
    logger.info(f"GIF saved to {output_path}")
class ImageRender(object): class ImageRender(object):

View File

@ -139,7 +139,9 @@ class DiffrastRender(object):
vertices: torch.Tensor, vertices: torch.Tensor,
matrix: torch.Tensor, matrix: torch.Tensor,
) -> torch.Tensor: ) -> torch.Tensor:
verts_ones = torch.ones((len(vertices), 1)).to(vertices) verts_ones = torch.ones(
(len(vertices), 1), device=vertices.device, dtype=vertices.dtype
)
verts_homo = torch.cat([vertices, verts_ones], dim=-1) verts_homo = torch.cat([vertices, verts_ones], dim=-1)
trans_vertices = torch.matmul(verts_homo, matrix.permute(0, 2, 1)) trans_vertices = torch.matmul(verts_homo, matrix.permute(0, 2, 1))

View File

@ -185,9 +185,8 @@ if __name__ == "__main__":
text_prompt="What is the content in each image?", text_prompt="What is the content in each image?",
image_base64=combine_images_to_base64( image_base64=combine_images_to_base64(
[ [
"outputs/text2image/demo_objects/bed/sample_0.jpg", "apps/assets/example_image/sample_02.jpg",
"outputs/imageto3d/v2/cups/sample_69/URDF_sample_69/qa_renders/image_color/003.png", # noqa "apps/assets/example_image/sample_03.jpg",
"outputs/text2image/demo_objects/cardboard/sample_1.jpg",
] ]
), # input raw image_path if only one image ), # input raw image_path if only one image
) )
@ -196,10 +195,8 @@ if __name__ == "__main__":
response = GPT_CLIENT.query( response = GPT_CLIENT.query(
text_prompt="What is the content in the images?", text_prompt="What is the content in the images?",
image_base64=[ image_base64=[
Image.open("outputs/text2image/demo_objects/bed/sample_0.jpg"), Image.open("apps/assets/example_image/sample_02.jpg"),
Image.open( Image.open("apps/assets/example_image/sample_03.jpg"),
"outputs/imageto3d/v2/cups/sample_69/URDF_sample_69/qa_renders/image_color/003.png" # noqa
),
], ],
) )
print(response) print(response)

View File

@ -9,6 +9,6 @@ gpt-4o:
qwen2.5-vl: qwen2.5-vl:
endpoint: https://openrouter.ai/api/v1 endpoint: https://openrouter.ai/api/v1
api_key: sk-or-v1-[REDACTED — leaked key removed from this record; it was already replaced with a placeholder by this commit and must be revoked] api_key: sk-or-v1-xxx
api_version: null api_version: null
model_name: qwen/qwen2.5-vl-72b-instruct:free model_name: qwen/qwen2.5-vl-72b-instruct:free

View File

@ -19,7 +19,6 @@ import base64
import logging import logging
import math import math
import os import os
import subprocess
import sys import sys
from glob import glob from glob import glob
from io import BytesIO from io import BytesIO
@ -33,6 +32,7 @@ import spaces
import torch import torch
from moviepy.editor import VideoFileClip, clips_array from moviepy.editor import VideoFileClip, clips_array
from tqdm import tqdm from tqdm import tqdm
from embodied_gen.data.differentiable_render import entrypoint as render_api
current_file_path = os.path.abspath(__file__) current_file_path = os.path.abspath(__file__)
current_dir = os.path.dirname(current_file_path) current_dir = os.path.dirname(current_file_path)
@ -56,8 +56,6 @@ __all__ = [
"combine_images_to_base64", "combine_images_to_base64",
"render_mesh", "render_mesh",
"render_video", "render_video",
"create_mp4_from_images",
"create_gif_from_images",
] ]
@ -75,34 +73,25 @@ def render_asset3d(
gen_viewnormal_mp4: bool = False, gen_viewnormal_mp4: bool = False,
gen_glonormal_mp4: bool = False, gen_glonormal_mp4: bool = False,
) -> list[str]: ) -> list[str]:
command = [ input_args = dict(
"python3", mesh_path=mesh_path,
"embodied_gen/data/differentiable_render.py", output_root=output_root,
"--mesh_path", uuid=output_subdir,
mesh_path, distance=distance,
"--output_root", num_images=num_images,
output_root, elevation=elevation,
"--uuid", pbr_light_factor=pbr_light_factor,
output_subdir, with_mtl=True,
"--distance", )
str(distance),
"--num_images",
str(num_images),
"--elevation",
*map(str, elevation),
"--pbr_light_factor",
str(pbr_light_factor),
"--with_mtl",
]
if gen_color_mp4: if gen_color_mp4:
command.append("--gen_color_mp4") input_args["gen_color_mp4"] = True
if gen_viewnormal_mp4: if gen_viewnormal_mp4:
command.append("--gen_viewnormal_mp4") input_args["gen_viewnormal_mp4"] = True
if gen_glonormal_mp4: if gen_glonormal_mp4:
command.append("--gen_glonormal_mp4") input_args["gen_glonormal_mp4"] = True
try: try:
subprocess.run(command, check=True) _ = render_api(**input_args)
except subprocess.CalledProcessError as e: except Exception as e:
logger.error(f"Error occurred during rendering: {e}.") logger.error(f"Error occurred during rendering: {e}.")
dst_paths = glob(os.path.join(output_root, output_subdir, return_key)) dst_paths = glob(os.path.join(output_root, output_subdir, return_key))
@ -263,54 +252,6 @@ def render_video(
return result return result
def create_mp4_from_images(
    images: list[np.ndarray],
    output_path: str,
    fps: int = 10,
    prompt: str | None = None,
) -> None:
    """Encode a sequence of float images into an MP4 video file.

    Each input array is clipped to [0, 1], scaled to uint8, and reduced
    to its first three channels (any alpha channel is dropped). If
    ``prompt`` is given, it is drawn onto every frame with OpenCV before
    the frame is appended to the video.

    Args:
        images: Frames as float arrays with values in [0, 1]; assumed to
            have at least 3 channels — TODO confirm against callers.
        output_path: Destination path of the MP4 file.
        fps: Frames per second of the output video.
        prompt: Optional text overlaid near the top-left of each frame.
    """
    # Fixed overlay style for the optional prompt text.
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1
    color = (255, 255, 255)  # white
    position = (20, 25)  # pixel coordinates of the text baseline
    with imageio.get_writer(output_path, fps=fps) as writer:
        for image in images:
            # Normalize float [0, 1] data to uint8 and drop alpha.
            image = image.clip(min=0, max=1)
            image = (255.0 * image).astype(np.uint8)
            image = image[..., :3]
            if prompt is not None:
                cv2.putText(
                    image,
                    prompt,
                    position,
                    font,
                    font_scale,
                    color,
                    font_thickness,
                )
            writer.append_data(image)
    logger.info(f"MP4 video saved to {output_path}")
def create_gif_from_images(
    images: list[np.ndarray], output_path: str, fps: int = 10
) -> None:
    """Save a sequence of float RGBA images as a looping animated GIF.

    Each array is clipped to [0, 1], scaled to uint8, interpreted as an
    RGBA image, and converted to RGB before being appended to the GIF.

    Args:
        images: Frames as float arrays with values in [0, 1].
        output_path: Destination path of the GIF file.
        fps: Playback rate; converted to a per-frame duration in ms.

    NOTE(review): ``Image.fromarray(..., mode="RGBA")`` requires every
    frame to have exactly 4 channels, and ``pil_images[0]`` raises
    IndexError on an empty ``images`` list — confirm callers guarantee
    both.
    """
    pil_images = []
    for image in images:
        image = image.clip(min=0, max=1)
        image = (255.0 * image).astype(np.uint8)
        image = Image.fromarray(image, mode="RGBA")
        pil_images.append(image.convert("RGB"))
    duration = 1000 // fps  # per-frame display time in milliseconds
    pil_images[0].save(
        output_path,
        save_all=True,
        append_images=pil_images[1:],
        duration=duration,
        loop=0,  # 0 means loop forever
    )
    logger.info(f"GIF saved to {output_path}")
if __name__ == "__main__": if __name__ == "__main__":
# Example usage: # Example usage:
merge_video_video( merge_video_video(

View File

@ -24,7 +24,6 @@ from xml.dom.minidom import parseString
import numpy as np import numpy as np
import trimesh import trimesh
from embodied_gen.data.utils import zip_files
from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient
from embodied_gen.utils.process_media import render_asset3d from embodied_gen.utils.process_media import render_asset3d
from embodied_gen.utils.tags import VERSION from embodied_gen.utils.tags import VERSION