diff --git a/README.md b/README.md
index 3099ddd..681dfbb 100644
--- a/README.md
+++ b/README.md
@@ -147,15 +147,12 @@ python apps/texture_edit.py
### ⚡ API
Support Chinese and English prompts.
```sh
-bash embodied_gen/scripts/texture_gen.sh \
- --mesh_path "apps/assets/example_texture/meshes/robot_text.obj" \
- --prompt "举着牌子的写实风格机器人,大眼睛,牌子上写着“Hello”的文字" \
- --output_root "outputs/texture_gen/robot_text"
-
-bash embodied_gen/scripts/texture_gen.sh \
- --mesh_path "apps/assets/example_texture/meshes/horse.obj" \
- --prompt "A gray horse head with flying mane and brown eyes" \
- --output_root "outputs/texture_gen/gray_horse"
+texture-cli --mesh_path "apps/assets/example_texture/meshes/robot_text.obj" \
+"apps/assets/example_texture/meshes/horse.obj" \
+--prompt "举着牌子的写实风格机器人,大眼睛,牌子上写着“Hello”的文字" \
+"A gray horse head with flying mane and brown eyes" \
+--output_root "outputs/texture_gen" \
+--seed 0
```
---
@@ -185,7 +182,7 @@ CUDA_VISIBLE_DEVICES=0 scene3d-cli \
🚧 *Coming Soon*
-
+
---
diff --git a/apps/common.py b/apps/common.py
index 643151b..9399ab3 100644
--- a/apps/common.py
+++ b/apps/common.py
@@ -503,7 +503,12 @@ def extract_3d_representations_v2(
device="cpu",
)
color_path = os.path.join(user_dir, "color.png")
- render_gs_api(aligned_gs_path, color_path)
+ render_gs_api(
+ input_gs=aligned_gs_path,
+ output_path=color_path,
+ elevation=[20, -10, 60, -50],
+ num_images=12,
+ )
mesh = trimesh.Trimesh(
vertices=mesh_model.vertices.cpu().numpy(),
@@ -524,6 +529,8 @@ def extract_3d_representations_v2(
skip_fix_mesh=False,
delight=enable_delight,
texture_wh=[texture_size, texture_size],
+ elevation=[20, -10, 60, -50],
+ num_images=12,
)
mesh_glb_path = os.path.join(user_dir, f"{filename}.glb")
diff --git a/embodied_gen/data/backproject_v2.py b/embodied_gen/data/backproject_v2.py
index 651b731..85fb000 100644
--- a/embodied_gen/data/backproject_v2.py
+++ b/embodied_gen/data/backproject_v2.py
@@ -33,6 +33,7 @@ from embodied_gen.data.mesh_operator import MeshFixer
from embodied_gen.data.utils import (
CameraSetting,
DiffrastRender,
+ as_list,
get_images_from_grid,
init_kal_camera,
normalize_vertices_array,
@@ -41,6 +42,7 @@ from embodied_gen.data.utils import (
)
from embodied_gen.models.delight_model import DelightingModel
from embodied_gen.models.sr_model import ImageRealESRGAN
+from embodied_gen.utils.process_media import vcat_pil_images
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
@@ -541,8 +543,9 @@ def parse_args():
parser = argparse.ArgumentParser(description="Backproject texture")
parser.add_argument(
"--color_path",
+ nargs="+",
type=str,
- help="Multiview color image in 6x512x512 file path",
+ help="Multiview color image in grid file paths",
)
parser.add_argument(
"--mesh_path",
@@ -559,7 +562,7 @@ def parse_args():
)
parser.add_argument(
"--elevation",
- nargs=2,
+ nargs="+",
type=float,
default=[20.0, -10.0],
help="Elevation angles for the camera (default: [20.0, -10.0])",
@@ -647,19 +650,23 @@ def entrypoint(
fov=math.radians(args.fov),
device=args.device,
)
- view_weights = [1, 0.1, 0.02, 0.1, 1, 0.02]
- color_grid = Image.open(args.color_path)
+ args.color_path = as_list(args.color_path)
+ if args.delight and delight_model is None:
+ delight_model = DelightingModel()
+
+ color_grid = [Image.open(color_path) for color_path in args.color_path]
+ color_grid = vcat_pil_images(color_grid, image_mode="RGBA")
if args.delight:
- if delight_model is None:
- delight_model = DelightingModel()
- save_dir = os.path.dirname(args.output_path)
- os.makedirs(save_dir, exist_ok=True)
color_grid = delight_model(color_grid)
if not args.no_save_delight_img:
- color_grid.save(f"{save_dir}/color_grid_delight.png")
+ save_dir = os.path.dirname(args.output_path)
+ os.makedirs(save_dir, exist_ok=True)
+ color_grid.save(f"{save_dir}/color_delight.png")
multiviews = get_images_from_grid(color_grid, img_size=512)
+ view_weights = [1, 0.1, 0.02, 0.1, 1, 0.02]
+ view_weights += [0.01] * (len(multiviews) - len(view_weights))
# Use RealESRGAN_x4plus for x4 (512->2048) image super resolution.
if imagesr_model is None:
@@ -688,7 +695,7 @@ def entrypoint(
texture_backer = TextureBacker(
camera_params=camera_params,
view_weights=view_weights,
- render_wh=camera_params.resolution_hw,
+ render_wh=args.resolution_hw,
texture_wh=args.texture_wh,
smooth_texture=not args.no_smooth_texture,
)
diff --git a/embodied_gen/data/differentiable_render.py b/embodied_gen/data/differentiable_render.py
index 2ad386c..fdd5a26 100644
--- a/embodied_gen/data/differentiable_render.py
+++ b/embodied_gen/data/differentiable_render.py
@@ -503,7 +503,7 @@ def parse_args():
help="Whether to generate global normal .mp4 rendering file.",
)
parser.add_argument(
- "--prompts",
+ "--video_prompts",
type=str,
nargs="+",
default=None,
@@ -579,7 +579,7 @@ def entrypoint(**kwargs) -> None:
mesh_path=args.mesh_path,
output_root=args.output_root,
uuid=args.uuid,
- prompts=args.prompts,
+ prompts=args.video_prompts,
)
return
diff --git a/embodied_gen/data/utils.py b/embodied_gen/data/utils.py
index 0d39f71..98900d9 100644
--- a/embodied_gen/data/utils.py
+++ b/embodied_gen/data/utils.py
@@ -28,7 +28,7 @@ import numpy as np
import nvdiffrast.torch as dr
import torch
import torch.nn.functional as F
-from PIL import Image
+from PIL import Image, ImageEnhance
try:
from kolors.models.modeling_chatglm import ChatGLMModel
@@ -698,6 +698,8 @@ def as_list(obj):
return obj
elif isinstance(obj, set):
return list(obj)
+ elif obj is None:
+ return obj
else:
return [obj]
@@ -742,6 +744,8 @@ def _compute_az_el_by_camera_params(
):
num_view = camera_params.num_images // len(camera_params.elevation)
view_interval = 2 * np.pi / num_view / 2
+ if num_view == 1:
+ view_interval = np.pi / 2
azimuths = []
elevations = []
for idx, el in enumerate(camera_params.elevation):
@@ -758,8 +762,13 @@ def _compute_az_el_by_camera_params(
return azimuths, elevations
-def init_kal_camera(camera_params: CameraSetting) -> Camera:
- azimuths, elevations = _compute_az_el_by_camera_params(camera_params)
+def init_kal_camera(
+ camera_params: CameraSetting,
+ flip_az: bool = False,
+) -> Camera:
+ azimuths, elevations = _compute_az_el_by_camera_params(
+ camera_params, flip_az
+ )
cam_pts = _compute_cam_pts_by_az_el(
azimuths, elevations, camera_params.distance
)
@@ -856,13 +865,38 @@ def get_images_from_grid(
image = Image.open(image)
view_images = np.array(image)
- view_images = np.concatenate(
- [view_images[:img_size, ...], view_images[img_size:, ...]], axis=1
- )
- images = np.split(view_images, view_images.shape[1] // img_size, axis=1)
- images = [Image.fromarray(img) for img in images]
+ height, width, _ = view_images.shape
+ rows = height // img_size
+ cols = width // img_size
+ blocks = []
+ for i in range(rows):
+ for j in range(cols):
+ block = view_images[
+ i * img_size : (i + 1) * img_size,
+ j * img_size : (j + 1) * img_size,
+ :,
+ ]
+ blocks.append(Image.fromarray(block))
- return images
+ return blocks
+
+
+def enhance_image(
+    image: Image.Image,
+    contrast_factor: float = 1.3,
+    color_factor: float = 1.2,
+    brightness_factor: float = 0.95,
+) -> Image.Image:
+    """Adjust contrast, then color, then brightness of `image`.
+
+    Each factor is handed to the matching `ImageEnhance` enhancer; the
+    output of one stage is the input of the next. Returns the final image.
+    """
+    # Stages run in the order contrast -> color -> brightness.
+    result = ImageEnhance.Contrast(image).enhance(contrast_factor)
+    result = ImageEnhance.Color(result).enhance(color_factor)
+
+    return ImageEnhance.Brightness(result).enhance(brightness_factor)
def post_process_texture(texture: np.ndarray, iter: int = 1) -> np.ndarray:
@@ -872,7 +906,14 @@ def post_process_texture(texture: np.ndarray, iter: int = 1) -> np.ndarray:
texture, d=5, sigmaColor=20, sigmaSpace=20
)
- return texture
+ texture = enhance_image(
+ image=Image.fromarray(texture),
+ contrast_factor=1.3,
+ color_factor=1.2,
+ brightness_factor=0.95,
+ )
+
+ return np.array(texture)
def quat_mult(q1, q2):
diff --git a/embodied_gen/models/delight_model.py b/embodied_gen/models/delight_model.py
index 645b4c5..14abb4c 100644
--- a/embodied_gen/models/delight_model.py
+++ b/embodied_gen/models/delight_model.py
@@ -29,6 +29,7 @@ from diffusers import (
from huggingface_hub import snapshot_download
from PIL import Image
from embodied_gen.models.segment_model import RembgRemover
+from embodied_gen.utils.log import logger
__all__ = [
"DelightingModel",
@@ -84,6 +85,7 @@ class DelightingModel(object):
def _lazy_init_pipeline(self):
if self.pipeline is None:
+ logger.info("Loading Delighting Model...")
pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
self.model_path,
torch_dtype=torch.float16,
diff --git a/embodied_gen/models/layout.py b/embodied_gen/models/layout.py
index 568597f..97ea38b 100644
--- a/embodied_gen/models/layout.py
+++ b/embodied_gen/models/layout.py
@@ -43,7 +43,7 @@ __all__ = [
]
-DISTRACTOR_NUM = 3 # Maximum number of distractor objects allowed
+DISTRACTOR_NUM = 2 # Maximum number of distractor objects allowed
LAYOUT_DISASSEMBLE_PROMPT = f"""
You are an intelligent 3D scene planner. Given a natural language
description of a robotic task, output a structured description of
diff --git a/embodied_gen/models/texture_model.py b/embodied_gen/models/texture_model.py
index 4d609c2..d29b06a 100644
--- a/embodied_gen/models/texture_model.py
+++ b/embodied_gen/models/texture_model.py
@@ -29,6 +29,7 @@ from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import (
)
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from embodied_gen.models.text_model import download_kolors_weights
+from embodied_gen.utils.log import logger
__all__ = [
"build_texture_gen_pipe",
@@ -42,7 +43,7 @@ def build_texture_gen_pipe(
device: str = "cuda",
) -> DiffusionPipeline:
download_kolors_weights(f"{base_ckpt_dir}/Kolors")
-
+ logger.info(f"Load Kolors weights...")
tokenizer = ChatGLMTokenizer.from_pretrained(
f"{base_ckpt_dir}/Kolors/text_encoder"
)
diff --git a/embodied_gen/scripts/gen_texture.py b/embodied_gen/scripts/gen_texture.py
new file mode 100644
index 0000000..a0023a8
--- /dev/null
+++ b/embodied_gen/scripts/gen_texture.py
@@ -0,0 +1,123 @@
+import os
+import shutil
+from dataclasses import dataclass
+
+import tyro
+from embodied_gen.data.backproject_v2 import entrypoint as backproject_api
+from embodied_gen.data.differentiable_render import entrypoint as drender_api
+from embodied_gen.data.utils import as_list
+from embodied_gen.models.delight_model import DelightingModel
+from embodied_gen.models.sr_model import ImageRealESRGAN
+from embodied_gen.scripts.render_mv import (
+ build_texture_gen_pipe,
+)
+from embodied_gen.scripts.render_mv import infer_pipe as render_mv_api
+from embodied_gen.utils.log import logger
+
+
+@dataclass
+class TextureGenConfig:  # CLI options of the texture entrypoint (tyro).
+ mesh_path: str | list[str]  # one mesh file per asset to texture
+ prompt: str | list[str]  # one prompt per mesh, matched by position
+ output_root: str  # results go to <output_root>/<mesh file stem>/
+ controlnet_cond_scale: float = 0.7  # forwarded to the multi-view step
+ guidance_scale: float = 9  # forwarded to the multi-view step
+ strength: float = 0.9  # forwarded to the multi-view step
+ num_inference_steps: int = 40  # forwarded to the multi-view step
+ delight: bool = True  # run DelightingModel during back-projection
+ seed: int = 0  # forwarded to the multi-view step
+ base_ckpt_dir: str = "./weights"  # checkpoint root directory
+ texture_size: int = 2048  # texture is texture_size x texture_size
+ ip_adapt_scale: float = 0.0  # forwarded to the multi-view step
+ ip_img_path: str | list[str] | None = None  # optional, one per mesh
+
+
+def entrypoint() -> None:
+    """Texture one or more meshes from text prompts (`texture-cli`).
+
+    For each (mesh, prompt) pair: render condition maps, generate multi-view
+    color images, back-project them onto the mesh and render a preview video.
+
+    Raises:
+        ValueError: If prompts or IP images do not match the mesh count.
+    """
+    cfg = tyro.cli(TextureGenConfig)
+    mesh_paths = as_list(cfg.mesh_path)
+    prompts = as_list(cfg.prompt)
+    ip_img_paths = as_list(cfg.ip_img_path) or [None] * len(mesh_paths)
+    # Fail fast: validate before the expensive model loading below.
+    if not len(mesh_paths) == len(prompts) == len(ip_img_paths):
+        raise ValueError("Need one prompt (and IP image) per mesh path.")
+
+    # Pre-load models once; they are reused for every mesh.
+    pipeline = build_texture_gen_pipe(
+        base_ckpt_dir=cfg.base_ckpt_dir,
+        ip_adapt_scale=max(cfg.ip_adapt_scale, 0),
+        device="cuda",
+    )
+    delight_model = DelightingModel() if cfg.delight else None
+    imagesr_model = ImageRealESRGAN(outscale=4)
+
+    for mesh_path, prompt, ip_img_path in zip(
+        mesh_paths, prompts, ip_img_paths
+    ):
+        uuid = os.path.splitext(os.path.basename(mesh_path))[0]
+        output_root = os.path.join(cfg.output_root, uuid)
+        # Render the condition maps consumed by the multi-view step.
+        drender_api(
+            mesh_path=mesh_path,
+            output_root=f"{output_root}/condition",
+            uuid=uuid,
+        )
+        render_mv_api(
+            index_file=f"{output_root}/condition/index.json",
+            controlnet_cond_scale=cfg.controlnet_cond_scale,
+            guidance_scale=cfg.guidance_scale,
+            strength=cfg.strength,
+            num_inference_steps=cfg.num_inference_steps,
+            ip_adapt_scale=cfg.ip_adapt_scale,
+            ip_img_path=ip_img_path,
+            prompt=prompt,
+            save_dir=f"{output_root}/multi_view",
+            sub_idxs=[[0, 1, 2], [3, 4, 5]],
+            pipeline=pipeline,
+            seed=cfg.seed,
+        )
+        backproject_api(
+            delight_model=delight_model,
+            imagesr_model=imagesr_model,
+            mesh_path=mesh_path,
+            color_path=f"{output_root}/multi_view/color_sample0.png",
+            output_path=f"{output_root}/texture_mesh/{uuid}.obj",
+            save_glb_path=f"{output_root}/texture_mesh/{uuid}.glb",
+            skip_fix_mesh=True,
+            delight=cfg.delight,
+            no_save_delight_img=True,
+            texture_wh=[cfg.texture_size, cfg.texture_size],
+        )
+        drender_api(
+            mesh_path=f"{output_root}/texture_mesh/{uuid}.obj",
+            output_root=f"{output_root}/texture_mesh",
+            uuid=uuid,
+            num_images=90,
+            elevation=[20],
+            with_mtl=True,
+            gen_color_mp4=True,
+            pbr_light_factor=1.2,
+        )
+
+        # Re-organize folders
+        shutil.rmtree(f"{output_root}/condition")
+        shutil.copy(
+            f"{output_root}/texture_mesh/{uuid}/color.mp4",
+            f"{output_root}/color.mp4",
+        )
+        shutil.rmtree(f"{output_root}/texture_mesh/{uuid}")
+
+        logger.info(
+            f"Successfully generate textured mesh in {output_root}/texture_mesh"
+        )
+
+
+if __name__ == "__main__":
+ entrypoint()
diff --git a/embodied_gen/scripts/imageto3d.py b/embodied_gen/scripts/imageto3d.py
index 14d3202..19719e1 100644
--- a/embodied_gen/scripts/imageto3d.py
+++ b/embodied_gen/scripts/imageto3d.py
@@ -108,6 +108,9 @@ def parse_args():
default=2,
)
parser.add_argument("--disable_decompose_convex", action="store_true")
+ parser.add_argument(
+ "--texture_wh", type=int, nargs=2, default=[2048, 2048]
+ )
args, unknown = parser.parse_known_args()
return args
@@ -209,11 +212,17 @@ def entrypoint(**kwargs):
device="cpu",
)
color_path = os.path.join(output_root, "color.png")
- render_gs_api(aligned_gs_path, color_path)
-
- geo_flag, geo_result = GEO_CHECKER(
- [color_path], text=asset_node
+ render_gs_api(
+ input_gs=aligned_gs_path,
+ output_path=color_path,
+ elevation=[20, -10, 60, -50],
+ num_images=12,
)
+
+ color_img = Image.open(color_path)
+ keep_height = int(color_img.height * 2 / 3)
+ crop_img = color_img.crop((0, 0, color_img.width, keep_height))
+ geo_flag, geo_result = GEO_CHECKER([crop_img], text=asset_node)
logger.warning(
f"{GEO_CHECKER.__class__.__name__}: {geo_result} for {seg_path}"
)
@@ -246,7 +255,9 @@ def entrypoint(**kwargs):
output_path=mesh_obj_path,
skip_fix_mesh=False,
delight=True,
- texture_wh=[2048, 2048],
+ texture_wh=args.texture_wh,
+ elevation=[20, -10, 60, -50],
+ num_images=12,
)
mesh_glb_path = os.path.join(output_root, f"{filename}.glb")
diff --git a/embodied_gen/scripts/render_gs.py b/embodied_gen/scripts/render_gs.py
index 16e7f37..2c8459d 100644
--- a/embodied_gen/scripts/render_gs.py
+++ b/embodied_gen/scripts/render_gs.py
@@ -18,12 +18,11 @@
import argparse
import logging
import math
-import os
import cv2
-import numpy as np
import spaces
import torch
+from PIL import Image
from tqdm import tqdm
from embodied_gen.data.utils import (
CameraSetting,
@@ -31,6 +30,7 @@ from embodied_gen.data.utils import (
normalize_vertices_array,
)
from embodied_gen.models.gs_model import GaussianOperator
+from embodied_gen.utils.process_media import combine_images_to_grid
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
@@ -104,7 +104,7 @@ def load_gs_model(
# Normalize vertices to [-1, 1], center to (0, 0, 0).
_, scale, center = normalize_vertices_array(gs_model._means)
scale, center = float(scale), center.tolist()
- transpose = [*[-v for v in center], *pre_quat]
+ transpose = [*[v for v in center], *pre_quat]
instance_pose = torch.tensor(transpose).to(gs_model.device)
gs_model = gs_model.get_gaussians(instance_pose=instance_pose)
gs_model.rescale(scale)
@@ -113,12 +113,11 @@ def load_gs_model(
@spaces.GPU
-def entrypoint(input_gs: str = None, output_path: str = None) -> None:
+def entrypoint(**kwargs) -> None:
args = parse_args()
- if isinstance(input_gs, str):
- args.input_gs = input_gs
- if isinstance(output_path, str):
- args.output_path = output_path
+ for k, v in kwargs.items():
+ if hasattr(args, k) and v is not None:
+ setattr(args, k, v)
# Setup camera parameters
camera_params = CameraSetting(
@@ -129,7 +128,7 @@ def entrypoint(input_gs: str = None, output_path: str = None) -> None:
fov=math.radians(args.fov),
device=args.device,
)
- camera = init_kal_camera(camera_params)
+ camera = init_kal_camera(camera_params, flip_az=True)
matrix_mv = camera.view_matrix() # (n_cam 4 4) world2cam
matrix_mv[:, :3, 3] = -matrix_mv[:, :3, 3]
w2cs = matrix_mv.to(camera_params.device)
@@ -153,21 +152,11 @@ def entrypoint(input_gs: str = None, output_path: str = None) -> None:
(args.image_size, args.image_size),
interpolation=cv2.INTER_AREA,
)
- images.append(color)
+ color = cv2.cvtColor(color, cv2.COLOR_BGRA2RGBA)
+ images.append(Image.fromarray(color))
- # Cat color images into grid image and save.
- select_idxs = [[0, 2, 1], [5, 4, 3]] # fix order for 6 views
- grid_image = []
- for row_idxs in select_idxs:
- row_image = []
- for row_idx in row_idxs:
- row_image.append(images[row_idx])
- row_image = np.concatenate(row_image, axis=1)
- grid_image.append(row_image)
+ combine_images_to_grid(images, image_mode="RGBA")[0].save(args.output_path)
- grid_image = np.concatenate(grid_image, axis=0)
- os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
- cv2.imwrite(args.output_path, grid_image)
logger.info(f"Saved grid image to {args.output_path}")
diff --git a/embodied_gen/scripts/simulate_sapien.py b/embodied_gen/scripts/simulate_sapien.py
index faa6fbf..01dcd7c 100644
--- a/embodied_gen/scripts/simulate_sapien.py
+++ b/embodied_gen/scripts/simulate_sapien.py
@@ -170,7 +170,8 @@ def entrypoint(**kwargs):
for node in actions:
if actions[node] is None:
continue
- for action in tqdm(actions[node]):
+ logger.info(f"Render SIM grasping in camera {idx} for {node}...")
+ for action in actions[node]:
grasp_frames = scene_manager.step_action(
agent,
torch.Tensor(action[None, ...]),
diff --git a/embodied_gen/scripts/texture_gen.sh b/embodied_gen/scripts/texture_gen.sh
index 7374e84..e703200 100644
--- a/embodied_gen/scripts/texture_gen.sh
+++ b/embodied_gen/scripts/texture_gen.sh
@@ -28,6 +28,7 @@ if [[ -z "$mesh_path" || -z "$prompt" || -z "$output_root" ]]; then
exit 1
fi
+echo "Will be deprecated, recommended to use 'texture-cli' instead."
uuid=$(basename "$output_root")
# Step 1: drender-cli for condition rendering
drender-cli --mesh_path ${mesh_path} \
diff --git a/embodied_gen/utils/process_media.py b/embodied_gen/utils/process_media.py
index 8b3518c..88eb8e5 100644
--- a/embodied_gen/utils/process_media.py
+++ b/embodied_gen/utils/process_media.py
@@ -49,6 +49,7 @@ __all__ = [
"is_image_file",
"parse_text_prompts",
"check_object_edge_truncated",
+ "vcat_pil_images",
]
@@ -166,6 +167,7 @@ def combine_images_to_grid(
images: list[str | Image.Image],
cat_row_col: tuple[int, int] = None,
target_wh: tuple[int, int] = (512, 512),
+ image_mode: str = "RGB",
) -> list[Image.Image]:
n_images = len(images)
if n_images == 1:
@@ -178,13 +180,13 @@ def combine_images_to_grid(
n_row, n_col = cat_row_col
images = [
- Image.open(p).convert("RGB") if isinstance(p, str) else p
+ Image.open(p).convert(image_mode) if isinstance(p, str) else p
for p in images
]
images = [img.resize(target_wh) for img in images]
grid_w, grid_h = n_col * target_wh[0], n_row * target_wh[1]
- grid = Image.new("RGB", (grid_w, grid_h), (0, 0, 0))
+ grid = Image.new(image_mode, (grid_w, grid_h), (0, 0, 0))
for idx, img in enumerate(images):
row, col = divmod(idx, n_col)
@@ -435,6 +437,21 @@ def check_object_edge_truncated(
return not (top or bottom or left or right)
+def vcat_pil_images(
+    images: list[Image.Image], image_mode: str = "RGB"
+) -> Image.Image:
+    """Stack `images` top to bottom, left-aligned, on a new canvas."""
+    # Canvas is as wide as the widest input and as tall as all inputs summed.
+    widths, heights = zip(*(img.size for img in images))
+    canvas = Image.new(image_mode, (max(widths), sum(heights)))
+    top = 0
+    for img, img_height in zip(images, heights):
+        canvas.paste(img, (0, top))
+        top += img_height
+
+    return canvas
+
+
if __name__ == "__main__":
image_paths = [
"outputs/layouts_sim/task_0000/images/pen.png",
diff --git a/embodied_gen/validators/quality_checkers.py b/embodied_gen/validators/quality_checkers.py
index b8f6e1d..d7642c4 100644
--- a/embodied_gen/validators/quality_checkers.py
+++ b/embodied_gen/validators/quality_checkers.py
@@ -249,7 +249,7 @@ class SemanticConsistChecker(BaseChecker):
fewer than four legs or if the legs are unevenly distributed, are not allowed. Do not assume
hidden legs unless they are clearly visible.)
- Geometric completeness is required: the object must not have missing, truncated, or cropped parts.
- - The image must contain exactly one object. Multiple distinct objects are not allowed.
+ - The image must contain exactly one object. Multiple distinct objects (e.g. multiple pens) are not allowed.
A single composite object (e.g., a chair with legs) is acceptable.
- The object should be shown from a slightly angled (three-quarter) perspective,
not a flat, front-facing view showing only one surface.
diff --git a/embodied_gen/validators/urdf_convertor.py b/embodied_gen/validators/urdf_convertor.py
index 0fa9461..3e119b8 100644
--- a/embodied_gen/validators/urdf_convertor.py
+++ b/embodied_gen/validators/urdf_convertor.py
@@ -266,7 +266,7 @@ class URDFGenerator(object):
if self.decompose_convex:
try:
d_params = dict(
- threshold=0.05, max_convex_hull=64, verbose=False
+ threshold=0.05, max_convex_hull=100, verbose=False
)
filename = f"{os.path.splitext(obj_name)[0]}_collision.ply"
output_path = os.path.join(mesh_folder, filename)
diff --git a/pyproject.toml b/pyproject.toml
index 12b7da5..5e4e613 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ drender-cli = "embodied_gen.data.differentiable_render:entrypoint"
backproject-cli = "embodied_gen.data.backproject_v2:entrypoint"
img3d-cli = "embodied_gen.scripts.imageto3d:entrypoint"
text3d-cli = "embodied_gen.scripts.textto3d:text_to_3d"
+texture-cli = "embodied_gen.scripts.gen_texture:entrypoint"
scene3d-cli = "embodied_gen.scripts.gen_scene3d:entrypoint"
layout-cli = "embodied_gen.scripts.gen_layout:entrypoint"
sim-cli = "embodied_gen.scripts.simulate_sapien:entrypoint"
diff --git a/tests/test_examples/test_quality_checkers.py b/tests/test_examples/test_quality_checkers.py
index 991f425..8b604d6 100644
--- a/tests/test_examples/test_quality_checkers.py
+++ b/tests/test_examples/test_quality_checkers.py
@@ -142,6 +142,7 @@ def test_semantic_checker(semantic_checker):
("desk", "outputs/utest_cases/semantic_checker/task_0016_desk.png"),
("shelf", "outputs/utest_cases/semantic_checker/task_0018_shelf.png"),
("table", "outputs/utest_cases/semantic_checker/task_0000_table.png"),
+ ("pen", "outputs/layouts_gens2/task_0000/images/pen_raw.png"),
]
for test_case in test_cases:
flag, result = semantic_checker(