refactor(pipe): Adapt to huggingface space. (#3)

Adapt to huggingface space.
This commit is contained in:
Xinjie 2025-06-12 11:59:29 +08:00 committed by GitHub
parent 18075659de
commit e29807bd62
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 92 additions and 95 deletions

View File

@ -25,8 +25,8 @@
## 🚀 Quick Start
```sh
git clone https://github.com/HorizonRobotics/EmbodiedGen
cd EmbodiedGen
git clone https://github.com/HorizonRobotics/EmbodiedGen.git
cd EmbodiedGen && git submodule update --init --recursive
conda create -n embodiedgen python=3.10.13 -y
conda activate embodiedgen
pip install -r requirements.txt --use-deprecated=legacy-resolver
@ -42,7 +42,7 @@ Update the API key in file: `embodied_gen/utils/gpt_config.yaml`.
You can choose between two backends for the GPT agent:
- **`gpt-4o`** (Recommended) Use this if you have access to **Azure OpenAI**.
- **`qwen2.5-vl`** An open alternative with free usage via [OpenRouter](https://openrouter.ai/settings/keys) (50 free requests per day)
- **`qwen2.5-vl`** An open alternative with free usage via OpenRouter; apply for a free key [here](https://openrouter.ai/settings/keys) and update `api_key` in `embodied_gen/utils/gpt_config.yaml` (50 free requests per day)
---

View File

@ -35,7 +35,7 @@ from gradio.themes.utils.colors import gray, neutral, slate, stone, teal, zinc
from PIL import Image
from embodied_gen.data.backproject_v2 import entrypoint as backproject_api
from embodied_gen.data.differentiable_render import entrypoint as render_api
from embodied_gen.data.utils import trellis_preprocess
from embodied_gen.data.utils import trellis_preprocess, zip_files
from embodied_gen.models.delight_model import DelightingModel
from embodied_gen.models.gs_model import GaussianOperator
from embodied_gen.models.segment_model import (
@ -64,7 +64,7 @@ from embodied_gen.validators.quality_checkers import (
ImageSegChecker,
MeshGeoChecker,
)
from embodied_gen.validators.urdf_convertor import URDFGenerator, zip_files
from embodied_gen.validators.urdf_convertor import URDFGenerator
current_file_path = os.path.abspath(__file__)
current_dir = os.path.dirname(current_file_path)

View File

@ -24,7 +24,10 @@ from collections import defaultdict
from typing import List, Union
import cv2
import imageio
import numpy as np
import nvdiffrast.torch as dr
import PIL.Image as Image
import torch
from tqdm import tqdm
from embodied_gen.data.utils import (
@ -39,10 +42,6 @@ from embodied_gen.data.utils import (
render_pbr,
save_images,
)
from embodied_gen.utils.process_media import (
create_gif_from_images,
create_mp4_from_images,
)
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser(
@ -54,7 +53,66 @@ logging.basicConfig(
logger = logging.getLogger(__name__)
__all__ = ["ImageRender"]
__all__ = [
"ImageRender",
"create_mp4_from_images",
"create_gif_from_images",
]
def create_mp4_from_images(
    images: list[np.ndarray],
    output_path: str,
    fps: int = 10,
    prompt: str | None = None,
):
    """Write a sequence of float images to an MP4 file, with optional text overlay.

    Args:
        images: Frames as float arrays in [0, 1]. Each frame is clipped,
            scaled to uint8, and reduced to its first three channels
            (alpha, if present, is dropped).
        output_path: Destination path of the MP4 file.
        fps: Frames per second of the output video.
        prompt: Optional text drawn near the top-left corner of every frame.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1
    color = (255, 255, 255)  # white overlay text
    position = (20, 25)  # top-left anchor of the prompt text
    with imageio.get_writer(output_path, fps=fps) as writer:
        for image in images:
            image = image.clip(min=0, max=1)
            image = (255.0 * image).astype(np.uint8)
            image = image[..., :3]  # drop alpha; writer expects RGB
            if prompt is not None:
                cv2.putText(
                    image,
                    prompt,
                    position,
                    font,
                    font_scale,
                    color,
                    font_thickness,
                )
            writer.append_data(image)

    logger.info(f"MP4 video saved to {output_path}")
def create_gif_from_images(
    images: list[np.ndarray], output_path: str, fps: int = 10
) -> None:
    """Save a sequence of float RGBA images as a looping animated GIF.

    Args:
        images: Frames as float arrays in [0, 1]; each is clipped, scaled
            to uint8, interpreted as RGBA, and converted to RGB for GIF
            encoding. Frames are assumed to have 4 channels — TODO confirm
            against callers.
        output_path: Destination path of the GIF file.
        fps: Frames per second; converted to a per-frame duration in ms.

    Raises:
        ValueError: If ``images`` is empty (previously a bare IndexError).
    """
    if not images:
        raise ValueError("`images` must contain at least one frame.")

    pil_images = []
    for image in images:
        image = image.clip(min=0, max=1)
        image = (255.0 * image).astype(np.uint8)
        image = Image.fromarray(image, mode="RGBA")
        pil_images.append(image.convert("RGB"))

    duration = 1000 // fps  # milliseconds per frame
    pil_images[0].save(
        output_path,
        save_all=True,
        append_images=pil_images[1:],
        duration=duration,
        loop=0,  # 0 means loop forever
    )
    logger.info(f"GIF saved to {output_path}")
class ImageRender(object):

View File

@ -139,7 +139,9 @@ class DiffrastRender(object):
vertices: torch.Tensor,
matrix: torch.Tensor,
) -> torch.Tensor:
verts_ones = torch.ones((len(vertices), 1)).to(vertices)
verts_ones = torch.ones(
(len(vertices), 1), device=vertices.device, dtype=vertices.dtype
)
verts_homo = torch.cat([vertices, verts_ones], dim=-1)
trans_vertices = torch.matmul(verts_homo, matrix.permute(0, 2, 1))

View File

@ -185,9 +185,8 @@ if __name__ == "__main__":
text_prompt="What is the content in each image?",
image_base64=combine_images_to_base64(
[
"outputs/text2image/demo_objects/bed/sample_0.jpg",
"outputs/imageto3d/v2/cups/sample_69/URDF_sample_69/qa_renders/image_color/003.png", # noqa
"outputs/text2image/demo_objects/cardboard/sample_1.jpg",
"apps/assets/example_image/sample_02.jpg",
"apps/assets/example_image/sample_03.jpg",
]
), # input raw image_path if only one image
)
@ -196,10 +195,8 @@ if __name__ == "__main__":
response = GPT_CLIENT.query(
text_prompt="What is the content in the images?",
image_base64=[
Image.open("outputs/text2image/demo_objects/bed/sample_0.jpg"),
Image.open(
"outputs/imageto3d/v2/cups/sample_69/URDF_sample_69/qa_renders/image_color/003.png" # noqa
),
Image.open("apps/assets/example_image/sample_02.jpg"),
Image.open("apps/assets/example_image/sample_03.jpg"),
],
)
print(response)

View File

@ -9,6 +9,6 @@ gpt-4o:
qwen2.5-vl:
endpoint: https://openrouter.ai/api/v1
api_key: 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
api_key: sk-or-v1-xxx
api_version: null
model_name: qwen/qwen2.5-vl-72b-instruct:free

View File

@ -19,7 +19,6 @@ import base64
import logging
import math
import os
import subprocess
import sys
from glob import glob
from io import BytesIO
@ -33,6 +32,7 @@ import spaces
import torch
from moviepy.editor import VideoFileClip, clips_array
from tqdm import tqdm
from embodied_gen.data.differentiable_render import entrypoint as render_api
current_file_path = os.path.abspath(__file__)
current_dir = os.path.dirname(current_file_path)
@ -56,8 +56,6 @@ __all__ = [
"combine_images_to_base64",
"render_mesh",
"render_video",
"create_mp4_from_images",
"create_gif_from_images",
]
@ -75,34 +73,25 @@ def render_asset3d(
gen_viewnormal_mp4: bool = False,
gen_glonormal_mp4: bool = False,
) -> list[str]:
command = [
"python3",
"embodied_gen/data/differentiable_render.py",
"--mesh_path",
mesh_path,
"--output_root",
output_root,
"--uuid",
output_subdir,
"--distance",
str(distance),
"--num_images",
str(num_images),
"--elevation",
*map(str, elevation),
"--pbr_light_factor",
str(pbr_light_factor),
"--with_mtl",
]
input_args = dict(
mesh_path=mesh_path,
output_root=output_root,
uuid=output_subdir,
distance=distance,
num_images=num_images,
elevation=elevation,
pbr_light_factor=pbr_light_factor,
with_mtl=True,
)
if gen_color_mp4:
command.append("--gen_color_mp4")
input_args["gen_color_mp4"] = True
if gen_viewnormal_mp4:
command.append("--gen_viewnormal_mp4")
input_args["gen_viewnormal_mp4"] = True
if gen_glonormal_mp4:
command.append("--gen_glonormal_mp4")
input_args["gen_glonormal_mp4"] = True
try:
subprocess.run(command, check=True)
except subprocess.CalledProcessError as e:
_ = render_api(**input_args)
except Exception as e:
logger.error(f"Error occurred during rendering: {e}.")
dst_paths = glob(os.path.join(output_root, output_subdir, return_key))
@ -263,54 +252,6 @@ def render_video(
return result
def create_mp4_from_images(images, output_path, fps=10, prompt=None):
    """Write a sequence of float images in [0, 1] to an MP4 file.

    Each frame is clipped to [0, 1], scaled to uint8, reduced to its first
    three channels, and — if `prompt` is given — annotated with overlay
    text before being appended to the video.

    Args:
        images: Iterable of float image arrays in [0, 1].
        output_path: Destination path of the MP4 file.
        fps: Frames per second of the output video.
        prompt: Optional text drawn near the top-left corner of each frame.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1
    color = (255, 255, 255)  # white overlay text
    position = (20, 25)  # top-left anchor of the prompt text
    with imageio.get_writer(output_path, fps=fps) as writer:
        for image in images:
            image = image.clip(min=0, max=1)
            image = (255.0 * image).astype(np.uint8)
            image = image[..., :3]  # drop alpha; keep RGB only
            if prompt is not None:
                cv2.putText(
                    image,
                    prompt,
                    position,
                    font,
                    font_scale,
                    color,
                    font_thickness,
                )
            writer.append_data(image)

    logger.info(f"MP4 video saved to {output_path}")
def create_gif_from_images(images, output_path, fps=10):
    """Save a sequence of float RGBA images in [0, 1] as a looping GIF.

    Frames are clipped, scaled to uint8, interpreted as RGBA, and converted
    to RGB before encoding; `fps` is converted to a per-frame duration.

    NOTE(review): assumes every frame has 4 channels (mode="RGBA") and that
    `images` is non-empty — an empty list raises IndexError. Confirm with
    callers.
    """
    pil_images = []
    for image in images:
        image = image.clip(min=0, max=1)
        image = (255.0 * image).astype(np.uint8)
        image = Image.fromarray(image, mode="RGBA")
        pil_images.append(image.convert("RGB"))

    duration = 1000 // fps  # milliseconds per frame
    pil_images[0].save(
        output_path,
        save_all=True,
        append_images=pil_images[1:],
        duration=duration,
        loop=0,  # 0 means loop forever
    )
    logger.info(f"GIF saved to {output_path}")
if __name__ == "__main__":
# Example usage:
merge_video_video(

View File

@ -24,7 +24,6 @@ from xml.dom.minidom import parseString
import numpy as np
import trimesh
from embodied_gen.data.utils import zip_files
from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient
from embodied_gen.utils.process_media import render_asset3d
from embodied_gen.utils.tags import VERSION