feat(texture): Optimize backprojected texture quality and add texture-cli. (#38)

* feat(sim): Add auto scale in convex decomposition.
* feat(texture): Optimize back-projected texture quality.
* feat(texture): Add `texture-cli`.
This commit is contained in:
Xinjie 2025-09-08 11:15:04 +08:00 committed by GitHub
parent 768d1fbb1d
commit cf3b919b65
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 266 additions and 67 deletions

View File

@ -147,15 +147,12 @@ python apps/texture_edit.py
### ⚡ API ### ⚡ API
Support Chinese and English prompts. Support Chinese and English prompts.
```sh ```sh
bash embodied_gen/scripts/texture_gen.sh \ texture-cli --mesh_path "apps/assets/example_texture/meshes/robot_text.obj" \
--mesh_path "apps/assets/example_texture/meshes/robot_text.obj" \ "apps/assets/example_texture/meshes/horse.obj" \
--prompt "举着牌子的写实风格机器人大眼睛牌子上写着“Hello”的文字" \ --prompt "举着牌子的写实风格机器人大眼睛牌子上写着“Hello”的文字" \
--output_root "outputs/texture_gen/robot_text" "A gray horse head with flying mane and brown eyes" \
--output_root "outputs/texture_gen" \
bash embodied_gen/scripts/texture_gen.sh \ --seed 0
--mesh_path "apps/assets/example_texture/meshes/horse.obj" \
--prompt "A gray horse head with flying mane and brown eyes" \
--output_root "outputs/texture_gen/gray_horse"
``` ```
--- ---
@ -185,7 +182,7 @@ CUDA_VISIBLE_DEVICES=0 scene3d-cli \
🚧 *Coming Soon* 🚧 *Coming Soon*
<img src="apps/assets/articulate.gif" alt="articulate" style="width: 430px;"> <img src="apps/assets/articulate.gif" alt="articulate" style="width: 500px;">
--- ---

View File

@ -503,7 +503,12 @@ def extract_3d_representations_v2(
device="cpu", device="cpu",
) )
color_path = os.path.join(user_dir, "color.png") color_path = os.path.join(user_dir, "color.png")
render_gs_api(aligned_gs_path, color_path) render_gs_api(
input_gs=aligned_gs_path,
output_path=color_path,
elevation=[20, -10, 60, -50],
num_images=12,
)
mesh = trimesh.Trimesh( mesh = trimesh.Trimesh(
vertices=mesh_model.vertices.cpu().numpy(), vertices=mesh_model.vertices.cpu().numpy(),
@ -524,6 +529,8 @@ def extract_3d_representations_v2(
skip_fix_mesh=False, skip_fix_mesh=False,
delight=enable_delight, delight=enable_delight,
texture_wh=[texture_size, texture_size], texture_wh=[texture_size, texture_size],
elevation=[20, -10, 60, -50],
num_images=12,
) )
mesh_glb_path = os.path.join(user_dir, f"{filename}.glb") mesh_glb_path = os.path.join(user_dir, f"{filename}.glb")

View File

@ -33,6 +33,7 @@ from embodied_gen.data.mesh_operator import MeshFixer
from embodied_gen.data.utils import ( from embodied_gen.data.utils import (
CameraSetting, CameraSetting,
DiffrastRender, DiffrastRender,
as_list,
get_images_from_grid, get_images_from_grid,
init_kal_camera, init_kal_camera,
normalize_vertices_array, normalize_vertices_array,
@ -41,6 +42,7 @@ from embodied_gen.data.utils import (
) )
from embodied_gen.models.delight_model import DelightingModel from embodied_gen.models.delight_model import DelightingModel
from embodied_gen.models.sr_model import ImageRealESRGAN from embodied_gen.models.sr_model import ImageRealESRGAN
from embodied_gen.utils.process_media import vcat_pil_images
logging.basicConfig( logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
@ -541,8 +543,9 @@ def parse_args():
parser = argparse.ArgumentParser(description="Backproject texture") parser = argparse.ArgumentParser(description="Backproject texture")
parser.add_argument( parser.add_argument(
"--color_path", "--color_path",
nargs="+",
type=str, type=str,
help="Multiview color image in 6x512x512 file path", help="Multiview color image in grid file paths",
) )
parser.add_argument( parser.add_argument(
"--mesh_path", "--mesh_path",
@ -559,7 +562,7 @@ def parse_args():
) )
parser.add_argument( parser.add_argument(
"--elevation", "--elevation",
nargs=2, nargs="+",
type=float, type=float,
default=[20.0, -10.0], default=[20.0, -10.0],
help="Elevation angles for the camera (default: [20.0, -10.0])", help="Elevation angles for the camera (default: [20.0, -10.0])",
@ -647,19 +650,23 @@ def entrypoint(
fov=math.radians(args.fov), fov=math.radians(args.fov),
device=args.device, device=args.device,
) )
view_weights = [1, 0.1, 0.02, 0.1, 1, 0.02]
color_grid = Image.open(args.color_path) args.color_path = as_list(args.color_path)
if args.delight: if args.delight and delight_model is None:
if delight_model is None:
delight_model = DelightingModel() delight_model = DelightingModel()
save_dir = os.path.dirname(args.output_path)
os.makedirs(save_dir, exist_ok=True) color_grid = [Image.open(color_path) for color_path in args.color_path]
color_grid = vcat_pil_images(color_grid, image_mode="RGBA")
if args.delight:
color_grid = delight_model(color_grid) color_grid = delight_model(color_grid)
if not args.no_save_delight_img: if not args.no_save_delight_img:
color_grid.save(f"{save_dir}/color_grid_delight.png") save_dir = os.path.dirname(args.output_path)
os.makedirs(save_dir, exist_ok=True)
color_grid.save(f"{save_dir}/color_delight.png")
multiviews = get_images_from_grid(color_grid, img_size=512) multiviews = get_images_from_grid(color_grid, img_size=512)
view_weights = [1, 0.1, 0.02, 0.1, 1, 0.02]
view_weights += [0.01] * (len(multiviews) - len(view_weights))
# Use RealESRGAN_x4plus for x4 (512->2048) image super resolution. # Use RealESRGAN_x4plus for x4 (512->2048) image super resolution.
if imagesr_model is None: if imagesr_model is None:
@ -688,7 +695,7 @@ def entrypoint(
texture_backer = TextureBacker( texture_backer = TextureBacker(
camera_params=camera_params, camera_params=camera_params,
view_weights=view_weights, view_weights=view_weights,
render_wh=camera_params.resolution_hw, render_wh=args.resolution_hw,
texture_wh=args.texture_wh, texture_wh=args.texture_wh,
smooth_texture=not args.no_smooth_texture, smooth_texture=not args.no_smooth_texture,
) )

View File

@ -503,7 +503,7 @@ def parse_args():
help="Whether to generate global normal .mp4 rendering file.", help="Whether to generate global normal .mp4 rendering file.",
) )
parser.add_argument( parser.add_argument(
"--prompts", "--video_prompts",
type=str, type=str,
nargs="+", nargs="+",
default=None, default=None,
@ -579,7 +579,7 @@ def entrypoint(**kwargs) -> None:
mesh_path=args.mesh_path, mesh_path=args.mesh_path,
output_root=args.output_root, output_root=args.output_root,
uuid=args.uuid, uuid=args.uuid,
prompts=args.prompts, prompts=args.video_prompts,
) )
return return

View File

@ -28,7 +28,7 @@ import numpy as np
import nvdiffrast.torch as dr import nvdiffrast.torch as dr
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from PIL import Image from PIL import Image, ImageEnhance
try: try:
from kolors.models.modeling_chatglm import ChatGLMModel from kolors.models.modeling_chatglm import ChatGLMModel
@ -698,6 +698,8 @@ def as_list(obj):
return obj return obj
elif isinstance(obj, set): elif isinstance(obj, set):
return list(obj) return list(obj)
elif obj is None:
return obj
else: else:
return [obj] return [obj]
@ -742,6 +744,8 @@ def _compute_az_el_by_camera_params(
): ):
num_view = camera_params.num_images // len(camera_params.elevation) num_view = camera_params.num_images // len(camera_params.elevation)
view_interval = 2 * np.pi / num_view / 2 view_interval = 2 * np.pi / num_view / 2
if num_view == 1:
view_interval = np.pi / 2
azimuths = [] azimuths = []
elevations = [] elevations = []
for idx, el in enumerate(camera_params.elevation): for idx, el in enumerate(camera_params.elevation):
@ -758,8 +762,13 @@ def _compute_az_el_by_camera_params(
return azimuths, elevations return azimuths, elevations
def init_kal_camera(camera_params: CameraSetting) -> Camera: def init_kal_camera(
azimuths, elevations = _compute_az_el_by_camera_params(camera_params) camera_params: CameraSetting,
flip_az: bool = False,
) -> Camera:
azimuths, elevations = _compute_az_el_by_camera_params(
camera_params, flip_az
)
cam_pts = _compute_cam_pts_by_az_el( cam_pts = _compute_cam_pts_by_az_el(
azimuths, elevations, camera_params.distance azimuths, elevations, camera_params.distance
) )
@ -856,13 +865,38 @@ def get_images_from_grid(
image = Image.open(image) image = Image.open(image)
view_images = np.array(image) view_images = np.array(image)
view_images = np.concatenate( height, width, _ = view_images.shape
[view_images[:img_size, ...], view_images[img_size:, ...]], axis=1 rows = height // img_size
) cols = width // img_size
images = np.split(view_images, view_images.shape[1] // img_size, axis=1) blocks = []
images = [Image.fromarray(img) for img in images] for i in range(rows):
for j in range(cols):
block = view_images[
i * img_size : (i + 1) * img_size,
j * img_size : (j + 1) * img_size,
:,
]
blocks.append(Image.fromarray(block))
return images return blocks
def enhance_image(
    image: Image.Image,
    contrast_factor: float = 1.3,
    color_factor: float = 1.2,
    brightness_factor: float = 0.95,
) -> Image.Image:
    """Apply contrast, color and brightness enhancement to an image.

    The three `ImageEnhance` adjustments are applied one after another, each
    on the result of the previous one, in the fixed order contrast -> color
    -> brightness.

    Args:
        image: Input PIL image.
        contrast_factor: Factor passed to `ImageEnhance.Contrast.enhance`.
        color_factor: Factor passed to `ImageEnhance.Color.enhance`.
        brightness_factor: Factor passed to
            `ImageEnhance.Brightness.enhance`.

    Returns:
        The enhanced PIL image.
    """
    # Order matters: each enhancer is built from the previous stage's output.
    stages = (
        (ImageEnhance.Contrast, contrast_factor),
        (ImageEnhance.Color, color_factor),
        (ImageEnhance.Brightness, brightness_factor),
    )
    result = image
    for enhancer_cls, factor in stages:
        result = enhancer_cls(result).enhance(factor)

    return result
def post_process_texture(texture: np.ndarray, iter: int = 1) -> np.ndarray: def post_process_texture(texture: np.ndarray, iter: int = 1) -> np.ndarray:
@ -872,7 +906,14 @@ def post_process_texture(texture: np.ndarray, iter: int = 1) -> np.ndarray:
texture, d=5, sigmaColor=20, sigmaSpace=20 texture, d=5, sigmaColor=20, sigmaSpace=20
) )
return texture texture = enhance_image(
image=Image.fromarray(texture),
contrast_factor=1.3,
color_factor=1.2,
brightness_factor=0.95,
)
return np.array(texture)
def quat_mult(q1, q2): def quat_mult(q1, q2):

View File

@ -29,6 +29,7 @@ from diffusers import (
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from PIL import Image from PIL import Image
from embodied_gen.models.segment_model import RembgRemover from embodied_gen.models.segment_model import RembgRemover
from embodied_gen.utils.log import logger
__all__ = [ __all__ = [
"DelightingModel", "DelightingModel",
@ -84,6 +85,7 @@ class DelightingModel(object):
def _lazy_init_pipeline(self): def _lazy_init_pipeline(self):
if self.pipeline is None: if self.pipeline is None:
logger.info("Loading Delighting Model...")
pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained( pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
self.model_path, self.model_path,
torch_dtype=torch.float16, torch_dtype=torch.float16,

View File

@ -43,7 +43,7 @@ __all__ = [
] ]
DISTRACTOR_NUM = 3 # Maximum number of distractor objects allowed DISTRACTOR_NUM = 2 # Maximum number of distractor objects allowed
LAYOUT_DISASSEMBLE_PROMPT = f""" LAYOUT_DISASSEMBLE_PROMPT = f"""
You are an intelligent 3D scene planner. Given a natural language You are an intelligent 3D scene planner. Given a natural language
description of a robotic task, output a structured description of description of a robotic task, output a structured description of

View File

@ -29,6 +29,7 @@ from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import (
) )
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from embodied_gen.models.text_model import download_kolors_weights from embodied_gen.models.text_model import download_kolors_weights
from embodied_gen.utils.log import logger
__all__ = [ __all__ = [
"build_texture_gen_pipe", "build_texture_gen_pipe",
@ -42,7 +43,7 @@ def build_texture_gen_pipe(
device: str = "cuda", device: str = "cuda",
) -> DiffusionPipeline: ) -> DiffusionPipeline:
download_kolors_weights(f"{base_ckpt_dir}/Kolors") download_kolors_weights(f"{base_ckpt_dir}/Kolors")
logger.info(f"Load Kolors weights...")
tokenizer = ChatGLMTokenizer.from_pretrained( tokenizer = ChatGLMTokenizer.from_pretrained(
f"{base_ckpt_dir}/Kolors/text_encoder" f"{base_ckpt_dir}/Kolors/text_encoder"
) )

View File

@ -0,0 +1,123 @@
import os
import shutil
from dataclasses import dataclass
import tyro
from embodied_gen.data.backproject_v2 import entrypoint as backproject_api
from embodied_gen.data.differentiable_render import entrypoint as drender_api
from embodied_gen.data.utils import as_list
from embodied_gen.models.delight_model import DelightingModel
from embodied_gen.models.sr_model import ImageRealESRGAN
from embodied_gen.scripts.render_mv import (
build_texture_gen_pipe,
)
from embodied_gen.scripts.render_mv import infer_pipe as render_mv_api
from embodied_gen.utils.log import logger
@dataclass
class TextureGenConfig:
    """Command-line options of `texture-cli`, parsed by `tyro.cli`."""

    # Mesh file(s) to texture. Each mesh is processed independently and its
    # results go to a sub-folder named after the mesh file stem.
    mesh_path: str | list[str]
    # Text prompt(s); `entrypoint` requires exactly one prompt per mesh.
    prompt: str | list[str]
    # Root output folder; per-mesh results land in `<output_root>/<stem>`.
    output_root: str
    # The next four values are forwarded unchanged to the multi-view
    # generation call (`render_mv.infer_pipe`).
    controlnet_cond_scale: float = 0.7
    guidance_scale: float = 9
    strength: float = 0.9
    num_inference_steps: int = 40
    # When true, a `DelightingModel` is created and back-projection is run
    # with `delight=True`.
    delight: bool = True
    # Seed forwarded to the multi-view generation call.
    seed: int = 0
    # Intended root directory of the model checkpoints.
    base_ckpt_dir: str = "./weights"
    # Edge length of the square texture (`texture_wh=[size, size]`).
    texture_size: int = 2048
    # IP-adapter scale handed to the pipeline builder and to multi-view
    # generation.
    ip_adapt_scale: float = 0.0
    # Optional image path(s) indexed per mesh and forwarded to multi-view
    # generation as `ip_img_path`; None disables it.
    ip_img_path: str | list[str] | None = None
def entrypoint() -> None:
    """Run the `texture-cli` pipeline for every (mesh, prompt) pair.

    For each mesh the following stages are executed:

    1. Render condition images of the untextured mesh (`drender_api`).
    2. Generate multi-view color images from the prompt (`render_mv_api`).
    3. Back-project the generated views onto the mesh (`backproject_api`).
    4. Render a turntable `color.mp4` of the textured mesh (`drender_api`).

    Results are written to `<output_root>/<mesh file stem>`. Meshes sharing
    the same file stem therefore write into the same folder.

    Raises:
        ValueError: If the number of prompts differs from the number of
            meshes, or if `ip_img_path` is given with fewer entries than
            there are meshes.
    """
    cfg = tyro.cli(TextureGenConfig)
    cfg.mesh_path = as_list(cfg.mesh_path)
    cfg.prompt = as_list(cfg.prompt)
    cfg.ip_img_path = as_list(cfg.ip_img_path)

    # Validate up front with real exceptions: `assert` is stripped under
    # `python -O`, and a short `ip_img_path` would otherwise only fail with
    # an IndexError after earlier meshes were already processed.
    if len(cfg.mesh_path) != len(cfg.prompt):
        raise ValueError(
            f"Got {len(cfg.mesh_path)} mesh path(s) but "
            f"{len(cfg.prompt)} prompt(s); exactly one prompt per mesh "
            "is required."
        )
    if cfg.ip_img_path is not None and len(cfg.ip_img_path) < len(
        cfg.mesh_path
    ):
        raise ValueError(
            f"Got {len(cfg.ip_img_path)} ip image path(s) for "
            f"{len(cfg.mesh_path)} mesh path(s); one per mesh is required."
        )

    # Pre-load models once and reuse them for every mesh. The configured
    # checkpoint directory is honored instead of a hard-coded "./weights".
    pipeline = build_texture_gen_pipe(
        base_ckpt_dir=cfg.base_ckpt_dir,
        ip_adapt_scale=cfg.ip_adapt_scale if cfg.ip_adapt_scale > 0 else 0,
        device="cuda",
    )
    delight_model = DelightingModel() if cfg.delight else None
    imagesr_model = ImageRealESRGAN(outscale=4)

    for idx, (mesh_path, prompt) in enumerate(
        zip(cfg.mesh_path, cfg.prompt)
    ):
        uuid = os.path.splitext(os.path.basename(mesh_path))[0]
        output_root = os.path.join(cfg.output_root, uuid)
        ip_img_path = (
            None if cfg.ip_img_path is None else cfg.ip_img_path[idx]
        )

        # Stage 1: condition rendering of the input mesh.
        drender_api(
            mesh_path=mesh_path,
            output_root=f"{output_root}/condition",
            uuid=uuid,
        )
        # Stage 2: multi-view color generation guided by the conditions.
        render_mv_api(
            index_file=f"{output_root}/condition/index.json",
            controlnet_cond_scale=cfg.controlnet_cond_scale,
            guidance_scale=cfg.guidance_scale,
            strength=cfg.strength,
            num_inference_steps=cfg.num_inference_steps,
            ip_adapt_scale=cfg.ip_adapt_scale,
            ip_img_path=ip_img_path,
            prompt=prompt,
            save_dir=f"{output_root}/multi_view",
            sub_idxs=[[0, 1, 2], [3, 4, 5]],
            pipeline=pipeline,
            seed=cfg.seed,
        )
        # Stage 3: back-project the generated views into a texture.
        backproject_api(
            delight_model=delight_model,
            imagesr_model=imagesr_model,
            mesh_path=mesh_path,
            color_path=f"{output_root}/multi_view/color_sample0.png",
            output_path=f"{output_root}/texture_mesh/{uuid}.obj",
            save_glb_path=f"{output_root}/texture_mesh/{uuid}.glb",
            skip_fix_mesh=True,
            delight=cfg.delight,
            no_save_delight_img=True,
            texture_wh=[cfg.texture_size, cfg.texture_size],
        )
        # Stage 4: turntable video of the textured result.
        drender_api(
            mesh_path=f"{output_root}/texture_mesh/{uuid}.obj",
            output_root=f"{output_root}/texture_mesh",
            uuid=uuid,
            num_images=90,
            elevation=[20],
            with_mtl=True,
            gen_color_mp4=True,
            pbr_light_factor=1.2,
        )

        # Re-organize folders: keep only the video from the render output
        # and drop the intermediate condition renders.
        shutil.rmtree(f"{output_root}/condition")
        shutil.copy(
            f"{output_root}/texture_mesh/{uuid}/color.mp4",
            f"{output_root}/color.mp4",
        )
        shutil.rmtree(f"{output_root}/texture_mesh/{uuid}")

        logger.info(
            f"Successfully generate textured mesh in {output_root}/texture_mesh"
        )
# Allow running this module directly as a script, in addition to the
# installed `texture-cli` console entry point.
if __name__ == "__main__":
    entrypoint()

View File

@ -108,6 +108,9 @@ def parse_args():
default=2, default=2,
) )
parser.add_argument("--disable_decompose_convex", action="store_true") parser.add_argument("--disable_decompose_convex", action="store_true")
parser.add_argument(
"--texture_wh", type=int, nargs=2, default=[2048, 2048]
)
args, unknown = parser.parse_known_args() args, unknown = parser.parse_known_args()
return args return args
@ -209,11 +212,17 @@ def entrypoint(**kwargs):
device="cpu", device="cpu",
) )
color_path = os.path.join(output_root, "color.png") color_path = os.path.join(output_root, "color.png")
render_gs_api(aligned_gs_path, color_path) render_gs_api(
input_gs=aligned_gs_path,
geo_flag, geo_result = GEO_CHECKER( output_path=color_path,
[color_path], text=asset_node elevation=[20, -10, 60, -50],
num_images=12,
) )
color_img = Image.open(color_path)
keep_height = int(color_img.height * 2 / 3)
crop_img = color_img.crop((0, 0, color_img.width, keep_height))
geo_flag, geo_result = GEO_CHECKER([crop_img], text=asset_node)
logger.warning( logger.warning(
f"{GEO_CHECKER.__class__.__name__}: {geo_result} for {seg_path}" f"{GEO_CHECKER.__class__.__name__}: {geo_result} for {seg_path}"
) )
@ -246,7 +255,9 @@ def entrypoint(**kwargs):
output_path=mesh_obj_path, output_path=mesh_obj_path,
skip_fix_mesh=False, skip_fix_mesh=False,
delight=True, delight=True,
texture_wh=[2048, 2048], texture_wh=args.texture_wh,
elevation=[20, -10, 60, -50],
num_images=12,
) )
mesh_glb_path = os.path.join(output_root, f"{filename}.glb") mesh_glb_path = os.path.join(output_root, f"{filename}.glb")

View File

@ -18,12 +18,11 @@
import argparse import argparse
import logging import logging
import math import math
import os
import cv2 import cv2
import numpy as np
import spaces import spaces
import torch import torch
from PIL import Image
from tqdm import tqdm from tqdm import tqdm
from embodied_gen.data.utils import ( from embodied_gen.data.utils import (
CameraSetting, CameraSetting,
@ -31,6 +30,7 @@ from embodied_gen.data.utils import (
normalize_vertices_array, normalize_vertices_array,
) )
from embodied_gen.models.gs_model import GaussianOperator from embodied_gen.models.gs_model import GaussianOperator
from embodied_gen.utils.process_media import combine_images_to_grid
logging.basicConfig( logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
@ -104,7 +104,7 @@ def load_gs_model(
# Normalize vertices to [-1, 1], center to (0, 0, 0). # Normalize vertices to [-1, 1], center to (0, 0, 0).
_, scale, center = normalize_vertices_array(gs_model._means) _, scale, center = normalize_vertices_array(gs_model._means)
scale, center = float(scale), center.tolist() scale, center = float(scale), center.tolist()
transpose = [*[-v for v in center], *pre_quat] transpose = [*[v for v in center], *pre_quat]
instance_pose = torch.tensor(transpose).to(gs_model.device) instance_pose = torch.tensor(transpose).to(gs_model.device)
gs_model = gs_model.get_gaussians(instance_pose=instance_pose) gs_model = gs_model.get_gaussians(instance_pose=instance_pose)
gs_model.rescale(scale) gs_model.rescale(scale)
@ -113,12 +113,11 @@ def load_gs_model(
@spaces.GPU @spaces.GPU
def entrypoint(input_gs: str = None, output_path: str = None) -> None: def entrypoint(**kwargs) -> None:
args = parse_args() args = parse_args()
if isinstance(input_gs, str): for k, v in kwargs.items():
args.input_gs = input_gs if hasattr(args, k) and v is not None:
if isinstance(output_path, str): setattr(args, k, v)
args.output_path = output_path
# Setup camera parameters # Setup camera parameters
camera_params = CameraSetting( camera_params = CameraSetting(
@ -129,7 +128,7 @@ def entrypoint(input_gs: str = None, output_path: str = None) -> None:
fov=math.radians(args.fov), fov=math.radians(args.fov),
device=args.device, device=args.device,
) )
camera = init_kal_camera(camera_params) camera = init_kal_camera(camera_params, flip_az=True)
matrix_mv = camera.view_matrix() # (n_cam 4 4) world2cam matrix_mv = camera.view_matrix() # (n_cam 4 4) world2cam
matrix_mv[:, :3, 3] = -matrix_mv[:, :3, 3] matrix_mv[:, :3, 3] = -matrix_mv[:, :3, 3]
w2cs = matrix_mv.to(camera_params.device) w2cs = matrix_mv.to(camera_params.device)
@ -153,21 +152,11 @@ def entrypoint(input_gs: str = None, output_path: str = None) -> None:
(args.image_size, args.image_size), (args.image_size, args.image_size),
interpolation=cv2.INTER_AREA, interpolation=cv2.INTER_AREA,
) )
images.append(color) color = cv2.cvtColor(color, cv2.COLOR_BGRA2RGBA)
images.append(Image.fromarray(color))
# Cat color images into grid image and save. combine_images_to_grid(images, image_mode="RGBA")[0].save(args.output_path)
select_idxs = [[0, 2, 1], [5, 4, 3]] # fix order for 6 views
grid_image = []
for row_idxs in select_idxs:
row_image = []
for row_idx in row_idxs:
row_image.append(images[row_idx])
row_image = np.concatenate(row_image, axis=1)
grid_image.append(row_image)
grid_image = np.concatenate(grid_image, axis=0)
os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
cv2.imwrite(args.output_path, grid_image)
logger.info(f"Saved grid image to {args.output_path}") logger.info(f"Saved grid image to {args.output_path}")

View File

@ -170,7 +170,8 @@ def entrypoint(**kwargs):
for node in actions: for node in actions:
if actions[node] is None: if actions[node] is None:
continue continue
for action in tqdm(actions[node]): logger.info(f"Render SIM grasping in camera {idx} for {node}...")
for action in actions[node]:
grasp_frames = scene_manager.step_action( grasp_frames = scene_manager.step_action(
agent, agent,
torch.Tensor(action[None, ...]), torch.Tensor(action[None, ...]),

View File

@ -28,6 +28,7 @@ if [[ -z "$mesh_path" || -z "$prompt" || -z "$output_root" ]]; then
exit 1 exit 1
fi fi
echo "Will be deprecated, recommended to use 'texture-cli' instead."
uuid=$(basename "$output_root") uuid=$(basename "$output_root")
# Step 1: drender-cli for condition rendering # Step 1: drender-cli for condition rendering
drender-cli --mesh_path ${mesh_path} \ drender-cli --mesh_path ${mesh_path} \

View File

@ -49,6 +49,7 @@ __all__ = [
"is_image_file", "is_image_file",
"parse_text_prompts", "parse_text_prompts",
"check_object_edge_truncated", "check_object_edge_truncated",
"vcat_pil_images",
] ]
@ -166,6 +167,7 @@ def combine_images_to_grid(
images: list[str | Image.Image], images: list[str | Image.Image],
cat_row_col: tuple[int, int] = None, cat_row_col: tuple[int, int] = None,
target_wh: tuple[int, int] = (512, 512), target_wh: tuple[int, int] = (512, 512),
image_mode: str = "RGB",
) -> list[Image.Image]: ) -> list[Image.Image]:
n_images = len(images) n_images = len(images)
if n_images == 1: if n_images == 1:
@ -178,13 +180,13 @@ def combine_images_to_grid(
n_row, n_col = cat_row_col n_row, n_col = cat_row_col
images = [ images = [
Image.open(p).convert("RGB") if isinstance(p, str) else p Image.open(p).convert(image_mode) if isinstance(p, str) else p
for p in images for p in images
] ]
images = [img.resize(target_wh) for img in images] images = [img.resize(target_wh) for img in images]
grid_w, grid_h = n_col * target_wh[0], n_row * target_wh[1] grid_w, grid_h = n_col * target_wh[0], n_row * target_wh[1]
grid = Image.new("RGB", (grid_w, grid_h), (0, 0, 0)) grid = Image.new(image_mode, (grid_w, grid_h), (0, 0, 0))
for idx, img in enumerate(images): for idx, img in enumerate(images):
row, col = divmod(idx, n_col) row, col = divmod(idx, n_col)
@ -435,6 +437,21 @@ def check_object_edge_truncated(
return not (top or bottom or left or right) return not (top or bottom or left or right)
def vcat_pil_images(
    images: list[Image.Image], image_mode: str = "RGB"
) -> Image.Image:
    """Stack PIL images vertically into a single image.

    Images are pasted top to bottom in the given order, left-aligned at
    x=0. The canvas is as wide as the widest input and as tall as the sum
    of all input heights; regions not covered by a narrower image keep the
    default fill of `Image.new`.

    Args:
        images: Non-empty list of PIL images to concatenate.
        image_mode: Mode of the output canvas, e.g. "RGB" or "RGBA".

    Returns:
        The vertically concatenated image in `image_mode`.

    Raises:
        ValueError: If `images` is empty.
    """
    # Fail with a clear message instead of an opaque tuple-unpacking error.
    if not images:
        raise ValueError("vcat_pil_images requires at least one image.")

    canvas_width = max(img.width for img in images)
    canvas_height = sum(img.height for img in images)
    canvas = Image.new(image_mode, (canvas_width, canvas_height))

    y_offset = 0
    for img in images:
        canvas.paste(img, (0, y_offset))
        y_offset += img.height

    return canvas
if __name__ == "__main__": if __name__ == "__main__":
image_paths = [ image_paths = [
"outputs/layouts_sim/task_0000/images/pen.png", "outputs/layouts_sim/task_0000/images/pen.png",

View File

@ -249,7 +249,7 @@ class SemanticConsistChecker(BaseChecker):
fewer than four legs or if the legs are unevenly distributed, are not allowed. Do not assume fewer than four legs or if the legs are unevenly distributed, are not allowed. Do not assume
hidden legs unless they are clearly visible.) hidden legs unless they are clearly visible.)
- Geometric completeness is required: the object must not have missing, truncated, or cropped parts. - Geometric completeness is required: the object must not have missing, truncated, or cropped parts.
- The image must contain exactly one object. Multiple distinct objects are not allowed. - The image must contain exactly one object. Multiple distinct objects (e.g. multiple pens) are not allowed.
A single composite object (e.g., a chair with legs) is acceptable. A single composite object (e.g., a chair with legs) is acceptable.
- The object should be shown from a slightly angled (three-quarter) perspective, - The object should be shown from a slightly angled (three-quarter) perspective,
not a flat, front-facing view showing only one surface. not a flat, front-facing view showing only one surface.

View File

@ -266,7 +266,7 @@ class URDFGenerator(object):
if self.decompose_convex: if self.decompose_convex:
try: try:
d_params = dict( d_params = dict(
threshold=0.05, max_convex_hull=64, verbose=False threshold=0.05, max_convex_hull=100, verbose=False
) )
filename = f"{os.path.splitext(obj_name)[0]}_collision.ply" filename = f"{os.path.splitext(obj_name)[0]}_collision.ply"
output_path = os.path.join(mesh_folder, filename) output_path = os.path.join(mesh_folder, filename)

View File

@ -31,6 +31,7 @@ drender-cli = "embodied_gen.data.differentiable_render:entrypoint"
backproject-cli = "embodied_gen.data.backproject_v2:entrypoint" backproject-cli = "embodied_gen.data.backproject_v2:entrypoint"
img3d-cli = "embodied_gen.scripts.imageto3d:entrypoint" img3d-cli = "embodied_gen.scripts.imageto3d:entrypoint"
text3d-cli = "embodied_gen.scripts.textto3d:text_to_3d" text3d-cli = "embodied_gen.scripts.textto3d:text_to_3d"
texture-cli = "embodied_gen.scripts.gen_texture:entrypoint"
scene3d-cli = "embodied_gen.scripts.gen_scene3d:entrypoint" scene3d-cli = "embodied_gen.scripts.gen_scene3d:entrypoint"
layout-cli = "embodied_gen.scripts.gen_layout:entrypoint" layout-cli = "embodied_gen.scripts.gen_layout:entrypoint"
sim-cli = "embodied_gen.scripts.simulate_sapien:entrypoint" sim-cli = "embodied_gen.scripts.simulate_sapien:entrypoint"

View File

@ -142,6 +142,7 @@ def test_semantic_checker(semantic_checker):
("desk", "outputs/utest_cases/semantic_checker/task_0016_desk.png"), ("desk", "outputs/utest_cases/semantic_checker/task_0016_desk.png"),
("shelf", "outputs/utest_cases/semantic_checker/task_0018_shelf.png"), ("shelf", "outputs/utest_cases/semantic_checker/task_0018_shelf.png"),
("table", "outputs/utest_cases/semantic_checker/task_0000_table.png"), ("table", "outputs/utest_cases/semantic_checker/task_0000_table.png"),
("pen", "outputs/layouts_gens2/task_0000/images/pen_raw.png"),
] ]
for test_case in test_cases: for test_case in test_cases:
flag, result = semantic_checker( flag, result = semantic_checker(