From c258ff866602f4af7fc6b805db5d061075ad9290 Mon Sep 17 00:00:00 2001 From: Xinjie Date: Thu, 31 Jul 2025 19:53:56 +0800 Subject: [PATCH] feat(pipe): Faster texture back projection and refine quality checkers. (#29) --- embodied_gen/data/backproject_v2.py | 12 ++-- embodied_gen/models/text_model.py | 2 +- embodied_gen/scripts/textto3d.py | 39 +++++++------ embodied_gen/utils/process_media.py | 23 ++++++++ embodied_gen/validators/quality_checkers.py | 11 ++-- embodied_gen/validators/urdf_convertor.py | 61 ++++++++++++--------- 6 files changed, 94 insertions(+), 54 deletions(-) diff --git a/embodied_gen/data/backproject_v2.py b/embodied_gen/data/backproject_v2.py index efe1b34..651b731 100644 --- a/embodied_gen/data/backproject_v2.py +++ b/embodied_gen/data/backproject_v2.py @@ -251,6 +251,7 @@ class TextureBacker: during rendering. Defaults to 0.5. smooth_texture (bool, optional): If True, apply post-processing (e.g., blurring) to the final texture. Defaults to True. + inpaint_smooth (bool, optional): If True, apply an additional vertex-based inpainting pass to smooth the texture before the final inpaint. Defaults to False. 
""" def __init__( @@ -262,6 +263,7 @@ class TextureBacker: bake_angle_thresh: int = 75, mask_thresh: float = 0.5, smooth_texture: bool = True, + inpaint_smooth: bool = False, ) -> None: self.camera_params = camera_params self.renderer = None @@ -271,6 +273,7 @@ class TextureBacker: self.texture_wh = texture_wh self.mask_thresh = mask_thresh self.smooth_texture = smooth_texture + self.inpaint_smooth = inpaint_smooth self.bake_angle_thresh = bake_angle_thresh self.bake_unreliable_kernel_size = int( @@ -446,11 +449,12 @@ class TextureBacker: def uv_inpaint( self, mesh: trimesh.Trimesh, texture: np.ndarray, mask: np.ndarray ) -> np.ndarray: - vertices, faces, uv_map = self.get_mesh_np_attrs(mesh) + if self.inpaint_smooth: + vertices, faces, uv_map = self.get_mesh_np_attrs(mesh) + texture, mask = _texture_inpaint_smooth( + texture, mask, vertices, faces, uv_map + ) - texture, mask = _texture_inpaint_smooth( - texture, mask, vertices, faces, uv_map - ) texture = texture.clip(0, 1) texture = cv2.inpaint( (texture * 255).astype(np.uint8), diff --git a/embodied_gen/models/text_model.py b/embodied_gen/models/text_model.py index 3ad44f4..0807814 100644 --- a/embodied_gen/models/text_model.py +++ b/embodied_gen/models/text_model.py @@ -54,7 +54,7 @@ __all__ = [ PROMPT_APPEND = ( "Angled 3D view of one {object}, centered, no cropping, no occlusion, isolated product photo, " - "no surroundings, matte, on a plain clean surface, 3D style revealing multiple surfaces" + "no surroundings, high-quality appearance, vivid colors, on a plain clean surface, 3D style revealing multiple surfaces" ) PROMPT_KAPPEND = "Single {object}, in the center of the image, white background, 3D style, best quality" diff --git a/embodied_gen/scripts/textto3d.py b/embodied_gen/scripts/textto3d.py index 4d28093..03a9d2f 100644 --- a/embodied_gen/scripts/textto3d.py +++ b/embodied_gen/scripts/textto3d.py @@ -19,6 +19,7 @@ import os import random from collections import defaultdict +import numpy as np import 
torch from PIL import Image from embodied_gen.models.image_comm_model import build_hf_image_pipeline @@ -27,7 +28,10 @@ from embodied_gen.models.text_model import PROMPT_APPEND from embodied_gen.scripts.imageto3d import entrypoint as imageto3d_api from embodied_gen.utils.gpt_clients import GPT_CLIENT from embodied_gen.utils.log import logger -from embodied_gen.utils.process_media import render_asset3d +from embodied_gen.utils.process_media import ( + check_object_edge_truncated, + render_asset3d, +) from embodied_gen.validators.quality_checkers import ( ImageSegChecker, SemanticConsistChecker, @@ -38,6 +42,13 @@ from embodied_gen.validators.quality_checkers import ( os.environ["TOKENIZERS_PARALLELISM"] = "false" random.seed(0) +logger.info("Loading Models...") +SEMANTIC_CHECKER = SemanticConsistChecker(GPT_CLIENT) +SEG_CHECKER = ImageSegChecker(GPT_CLIENT) +TXTGEN_CHECKER = TextGenAlignChecker(GPT_CLIENT) +PIPE_IMG = build_hf_image_pipeline(os.environ.get("TEXT_MODEL", "sd35")) +BG_REMOVER = RembgRemover() + __all__ = [ "text_to_image", @@ -69,6 +80,7 @@ def text_to_image( f"Image GEN for {os.path.basename(save_path)}\n" f"Try: {try_idx + 1}/{n_retry}, Seed: {seed}, Prompt: {f_prompt}" ) + torch.cuda.empty_cache() images = PIPE_IMG.run( f_prompt, num_inference_steps=img_denoise_step, @@ -93,16 +105,20 @@ def text_to_image( seg_flag, seg_result = SEG_CHECKER( [raw_image, image.convert("RGB")] ) + image_mask = np.array(image)[..., -1] + edge_flag = check_object_edge_truncated(image_mask) + logger.warning( + f"SEMANTIC: {semantic_result}. SEG: {seg_result}. 
EDGE: {edge_flag}" + ) if ( - (semantic_flag and seg_flag) - or semantic_flag is None - or seg_flag is None + (edge_flag and semantic_flag and seg_flag) + or (edge_flag and semantic_flag is None) + or (edge_flag and seg_flag is None) ): select_image = [raw_image, image] success_flag = True break - torch.cuda.empty_cache() seed = random.randint(0, 100000) if seed is not None else None return success_flag @@ -114,14 +130,6 @@ def text_to_3d(**kwargs) -> dict: if hasattr(args, k) and v is not None: setattr(args, k, v) - logger.info("Loading Models...") - global SEMANTIC_CHECKER, SEG_CHECKER, TXTGEN_CHECKER, PIPE_IMG, BG_REMOVER - SEMANTIC_CHECKER = SemanticConsistChecker(GPT_CLIENT) - SEG_CHECKER = ImageSegChecker(GPT_CLIENT) - TXTGEN_CHECKER = TextGenAlignChecker(GPT_CLIENT) - PIPE_IMG = build_hf_image_pipeline(args.text_model) - BG_REMOVER = RembgRemover() - if args.asset_names is None or len(args.asset_names) == 0: args.asset_names = [f"sample3d_{i}" for i in range(len(args.prompts))] img_save_dir = os.path.join(args.output_root, "images") @@ -261,11 +269,6 @@ def parse_args(): default=0, help="Random seed for 3D generation", ) - parser.add_argument( - "--text_model", - type=str, - default="sd35", - ) parser.add_argument("--keep_intermediate", action="store_true") args, unknown = parser.parse_known_args() diff --git a/embodied_gen/utils/process_media.py b/embodied_gen/utils/process_media.py index 3087d4f..aa343d4 100644 --- a/embodied_gen/utils/process_media.py +++ b/embodied_gen/utils/process_media.py @@ -48,6 +48,7 @@ __all__ = [ "SceneTreeVisualizer", "is_image_file", "parse_text_prompts", + "check_object_edge_truncated", ] @@ -376,6 +377,28 @@ def parse_text_prompts(prompts: list[str]) -> list[str]: return prompts +def check_object_edge_truncated( + mask: np.ndarray, edge_threshold: int = 5 +) -> bool: + """Checks if a binary object mask is truncated at the image edges. + + Args: + mask: A 2D binary NumPy array where nonzero values indicate the object region. 
+ edge_threshold: Number of pixels from each image edge to consider for truncation. + Defaults to 5. + + Returns: + True if the object is fully enclosed (not truncated). + False if the object touches or crosses any image boundary. + """ + top = mask[:edge_threshold, :].any() + bottom = mask[-edge_threshold:, :].any() + left = mask[:, :edge_threshold].any() + right = mask[:, -edge_threshold:].any() + + return not (top or bottom or left or right) + + if __name__ == "__main__": merge_video_video( "outputs/imageto3d/room_bottle7/room_bottle_007/URDF_room_bottle_007/mesh_glo_normal.mp4", # noqa diff --git a/embodied_gen/validators/quality_checkers.py b/embodied_gen/validators/quality_checkers.py index 186346f..4f33cb8 100644 --- a/embodied_gen/validators/quality_checkers.py +++ b/embodied_gen/validators/quality_checkers.py @@ -113,8 +113,8 @@ class MeshGeoChecker(BaseChecker): Your task is to evaluate the quality of the 3D asset generation, including geometry, structure, and appearance, based on the rendered views. Criteria: - - Is the geometry complete and well-formed, without missing parts or redundant structures? - - Is the geometric structure of the object complete? + - Is the object in the image a single, complete, and well-formed instance, + without truncation, missing parts, overlapping duplicates, or redundant geometry? - Minor flaws, asymmetries, or simplifications (e.g., less detail on sides or back, soft edges) are acceptable if the object is structurally sound and recognizable. - Only evaluate geometry. Do not assess texture quality. @@ -241,10 +241,11 @@ class SemanticConsistChecker(BaseChecker): Criteria: - The image must visually match the text description in terms of object type, structure, geometry, and color. - - The object must appear realistic, with reasonable geometry (e.g., a table must have a stable number of legs). 
+ - The object must appear realistic, with reasonable geometry (e.g., a table must have a stable number + of legs with a reasonable distribution. Count the number of legs visible in the image. (strict) For tables, + fewer than four legs or if the legs are unevenly distributed, are not allowed. Do not assume + hidden legs unless they are clearly visible.) - Geometric completeness is required: the object must not have missing, truncated, or cropped parts. - - The object must be centered in the image frame with clear margins on all sides, - it should not touch or nearly touch any image edge. - The image must contain exactly one object. Multiple distinct objects are not allowed. A single composite object (e.g., a chair with legs) is acceptable. - The object should be shown from a slightly angled (three-quarter) perspective, diff --git a/embodied_gen/validators/urdf_convertor.py b/embodied_gen/validators/urdf_convertor.py index b18be1e..fac2d77 100644 --- a/embodied_gen/validators/urdf_convertor.py +++ b/embodied_gen/validators/urdf_convertor.py @@ -101,34 +101,42 @@ class URDFGenerator(object): prompt_template = ( view_desc + """of the 3D object asset, - category: {category}. - You are an expert in 3D object analysis and physical property estimation. - Give the category of this object asset (within 3 words), - (if category is already provided, use it directly), - accurately describe this 3D object asset (within 15 words), - and give the recommended geometric height range (unit: meter), - weight range (unit: kilogram), the average static friction - coefficient of the object relative to rubber and the average - dynamic friction coefficient of the object relative to rubber. - Return response format as shown in Output Example. + category: {category}. + You are an expert in 3D object analysis and physical property estimation. 
+ Give the category of this object asset (within 3 words), (if category is + already provided, use it directly), accurately describe this 3D object asset (within 15 words), + Determine the pose of the object in the first image and estimate the true vertical height + (vertical projection) range of the object (in meters), i.e., how tall the object appears from top + to bottom in the front view (first) image. also weight range (unit: kilogram), the average + static friction coefficient of the object relative to rubber and the average dynamic friction + coefficient of the object relative to rubber. Return response format as shown in Output Example. - IMPORTANT: - Inputed images are orthographic projection showing the front, left, right and back views, - the first image is always the front view. Use the object's pose and orientation in the - rendered images to estimate its **true vertical height as it appears in the image**, - not the real-world length or width of the object. - For example: - - A pen standing upright in the front view → vertical height: 0.15-0.2 m - - A pen lying horizontally in the front view → vertical height: 0.01-0.02 m - (based on its thickness in the image) + Output Example: + Category: cup + Description: shiny golden cup with floral design + Height: 0.1-0.15 m + Weight: 0.3-0.6 kg + Static friction coefficient: 0.6 + Dynamic friction coefficient: 0.5 - Output Example: - Category: cup - Description: shiny golden cup with floral design - Height: 0.1-0.15 m - Weight: 0.3-0.6 kg - Static friction coefficient: 1.1 - Dynamic friction coefficient: 0.9 + IMPORTANT: Estimating Vertical Height from the First (Front View) Image. + - The "vertical height" refers to the real-world vertical size of the object + as projected in the first image, aligned with the image's vertical axis. + - For flat objects like plates or disks or book, if their face is visible in the front view, + use the diameter as the vertical height. 
If the edge is visible, use the thickness instead. + - This is not necessarily the full length of the object, but how tall it appears + in the first image vertically, based on its pose and orientation. + - For objects(e.g., spoons, forks, writing instruments etc.) at an angle showing in + the first image, tilted at 45° will appear shorter vertically than when upright. + Estimate the vertical projection of their real length based on its pose. + For example: + - A pen standing upright in the first view (aligned with the image's vertical axis) + full body visible in the first image: → vertical height ≈ 0.14-0.20 m + - A pen lying flat in the front view (showing thickness) → vertical height ≈ 0.018-0.025 m + - Tilted pen in the first image (e.g., ~45° angle): vertical height ≈ 0.07-0.12 m + - Use the rest views(except the first image) to help determine the object's 3D pose and orientation. + Assume the object is in real-world scale and estimate the approximate vertical height + (in meters) based on how large it appears vertically in the first image. """ ) @@ -374,6 +382,7 @@ class URDFGenerator(object): ) response = self.gpt_client.query(text_prompt, image_path) + # logger.info(response) if response is None: asset_attrs = { "category": category.lower(),