feat(pipe): Speed up texture back projection and refine quality checkers. (#29)

Xinjie 2025-07-31 19:53:56 +08:00 committed by GitHub
parent 87ff24dbd4
commit c258ff8666
6 changed files with 94 additions and 54 deletions


@@ -251,6 +251,7 @@ class TextureBacker:
during rendering. Defaults to 0.5.
smooth_texture (bool, optional): If True, apply post-processing (e.g.,
blurring) to the final texture. Defaults to True.
inpaint_smooth (bool, optional): If True, apply mesh-based inpainting to smooth the texture. Defaults to False.
"""
def __init__(
@@ -262,6 +263,7 @@ class TextureBacker:
bake_angle_thresh: int = 75,
mask_thresh: float = 0.5,
smooth_texture: bool = True,
inpaint_smooth: bool = False,
) -> None:
self.camera_params = camera_params
self.renderer = None
@@ -271,6 +273,7 @@ class TextureBacker:
self.texture_wh = texture_wh
self.mask_thresh = mask_thresh
self.smooth_texture = smooth_texture
self.inpaint_smooth = inpaint_smooth
self.bake_angle_thresh = bake_angle_thresh
self.bake_unreliable_kernel_size = int(
@@ -446,11 +449,12 @@ class TextureBacker:
def uv_inpaint(
self, mesh: trimesh.Trimesh, texture: np.ndarray, mask: np.ndarray
) -> np.ndarray:
vertices, faces, uv_map = self.get_mesh_np_attrs(mesh)
if self.inpaint_smooth:
vertices, faces, uv_map = self.get_mesh_np_attrs(mesh)
texture, mask = _texture_inpaint_smooth(
texture, mask, vertices, faces, uv_map
)
texture, mask = _texture_inpaint_smooth(
texture, mask, vertices, faces, uv_map
)
texture = texture.clip(0, 1)
texture = cv2.inpaint(
(texture * 255).astype(np.uint8),
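As context for the new flag: a minimal sketch, assuming only the names visible in this hunk, of the faster path taken when inpaint_smooth stays at its False default. The mesh-based _texture_inpaint_smooth pass is skipped entirely and OpenCV's inpainting alone fills the unbaked texels; the standalone helper name, the radius of 3, and the INPAINT_NS flag below are our assumptions, not the repo's values.

import cv2
import numpy as np

def fast_uv_inpaint(texture: np.ndarray, mask: np.ndarray) -> np.ndarray:
    # texture: float UV texture in [0, 1]; mask: nonzero where texels were baked.
    texture = texture.clip(0, 1)
    hole_mask = ((mask == 0) * 255).astype(np.uint8)  # nonzero = texels to fill
    filled = cv2.inpaint(
        (texture * 255).astype(np.uint8),  # cv2.inpaint needs 8-bit input
        hole_mask,
        inpaintRadius=3,       # assumed radius
        flags=cv2.INPAINT_NS,  # assumed method; cv2.INPAINT_TELEA also works
    )
    return filled.astype(np.float32) / 255.0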


@@ -54,7 +54,7 @@ __all__ = [
PROMPT_APPEND = (
"Angled 3D view of one {object}, centered, no cropping, no occlusion, isolated product photo, "
"no surroundings, matte, on a plain clean surface, 3D style revealing multiple surfaces"
"no surroundings, high-quality appearance, vivid colors, on a plain clean surface, 3D style revealing multiple surfaces"
)
PROMPT_KAPPEND = "Single {object}, in the center of the image, white background, 3D style, best quality"
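Both templates are plain str.format strings; a trivial usage sketch (the object name is ours):

prompt = PROMPT_APPEND.format(object="ceramic mug")
k_prompt = PROMPT_KAPPEND.format(object="ceramic mug")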


@@ -19,6 +19,7 @@ import os
import random
from collections import defaultdict
import numpy as np
import torch
from PIL import Image
from embodied_gen.models.image_comm_model import build_hf_image_pipeline
@@ -27,7 +28,10 @@ from embodied_gen.models.text_model import PROMPT_APPEND
from embodied_gen.scripts.imageto3d import entrypoint as imageto3d_api
from embodied_gen.utils.gpt_clients import GPT_CLIENT
from embodied_gen.utils.log import logger
from embodied_gen.utils.process_media import render_asset3d
from embodied_gen.utils.process_media import (
check_object_edge_truncated,
render_asset3d,
)
from embodied_gen.validators.quality_checkers import (
ImageSegChecker,
SemanticConsistChecker,
@@ -38,6 +42,13 @@ from embodied_gen.validators.quality_checkers import (
os.environ["TOKENIZERS_PARALLELISM"] = "false"
random.seed(0)
logger.info("Loading Models...")
SEMANTIC_CHECKER = SemanticConsistChecker(GPT_CLIENT)
SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
TXTGEN_CHECKER = TextGenAlignChecker(GPT_CLIENT)
PIPE_IMG = build_hf_image_pipeline(os.environ.get("TEXT_MODEL", "sd35"))
BG_REMOVER = RembgRemover()
__all__ = [
"text_to_image",
@@ -69,6 +80,7 @@ def text_to_image(
f"Image GEN for {os.path.basename(save_path)}\n"
f"Try: {try_idx + 1}/{n_retry}, Seed: {seed}, Prompt: {f_prompt}"
)
torch.cuda.empty_cache()
images = PIPE_IMG.run(
f_prompt,
num_inference_steps=img_denoise_step,
@@ -93,16 +105,20 @@ def text_to_image(
seg_flag, seg_result = SEG_CHECKER(
[raw_image, image.convert("RGB")]
)
image_mask = np.array(image)[..., -1]
edge_flag = check_object_edge_truncated(image_mask)
logger.warning(
f"SEMANTIC: {semantic_result}. SEG: {seg_result}. EDGE: {edge_flag}"
)
if (
(semantic_flag and seg_flag)
or semantic_flag is None
or seg_flag is None
(edge_flag and semantic_flag and seg_flag)
or (edge_flag and semantic_flag is None)
or (edge_flag and seg_flag is None)
):
select_image = [raw_image, image]
success_flag = True
break
torch.cuda.empty_cache()
seed = random.randint(0, 100000) if seed is not None else None
return success_flag
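Factored out for readability, the acceptance rule the retry loop now applies (the helper name is ours; the logic is identical to the three-clause condition above): a candidate passes only when its alpha mask clears the edge-truncation check, and the two GPT checkers either both approve or at least one returned None, i.e., a failed query is treated as unable to judge rather than as a rejection.

def candidate_accepted(
    edge_flag: bool, semantic_flag: bool | None, seg_flag: bool | None
) -> bool:
    # The edge check is mandatory; a None verdict from either GPT checker passes.
    return edge_flag and (
        semantic_flag is None
        or seg_flag is None
        or (semantic_flag and seg_flag)
    )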
@@ -114,14 +130,6 @@ def text_to_3d(**kwargs) -> dict:
if hasattr(args, k) and v is not None:
setattr(args, k, v)
logger.info("Loading Models...")
global SEMANTIC_CHECKER, SEG_CHECKER, TXTGEN_CHECKER, PIPE_IMG, BG_REMOVER
SEMANTIC_CHECKER = SemanticConsistChecker(GPT_CLIENT)
SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
TXTGEN_CHECKER = TextGenAlignChecker(GPT_CLIENT)
PIPE_IMG = build_hf_image_pipeline(args.text_model)
BG_REMOVER = RembgRemover()
if args.asset_names is None or len(args.asset_names) == 0:
args.asset_names = [f"sample3d_{i}" for i in range(len(args.prompts))]
img_save_dir = os.path.join(args.output_root, "images")
@@ -261,11 +269,6 @@ def parse_args():
default=0,
help="Random seed for 3D generation",
)
parser.add_argument(
"--text_model",
type=str,
default="sd35",
)
parser.add_argument("--keep_intermediate", action="store_true")
args, unknown = parser.parse_known_args()
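With the --text_model flag removed, the image backbone is now chosen once at import time through the TEXT_MODEL environment variable (see the module-level build_hf_image_pipeline call earlier in this file). A usage sketch, with an assumed value:

import os

os.environ["TEXT_MODEL"] = "sd35"  # must be set before this module is imported
# the module-level line then resolves to:
# PIPE_IMG = build_hf_image_pipeline("sd35")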


@@ -48,6 +48,7 @@ __all__ = [
"SceneTreeVisualizer",
"is_image_file",
"parse_text_prompts",
"check_object_edge_truncated",
]
@@ -376,6 +377,28 @@ def parse_text_prompts(prompts: list[str]) -> list[str]:
return prompts
def check_object_edge_truncated(
mask: np.ndarray, edge_threshold: int = 5
) -> bool:
"""Checks if a binary object mask is truncated at the image edges.
Args:
mask: A 2D binary NumPy array where nonzero values indicate the object region.
edge_threshold: Number of pixels from each image edge to consider for truncation.
Defaults to 5.
Returns:
True if the object is fully enclosed (not truncated).
False if the object touches or crosses any image boundary.
"""
top = mask[:edge_threshold, :].any()
bottom = mask[-edge_threshold:, :].any()
left = mask[:, :edge_threshold].any()
right = mask[:, -edge_threshold:].any()
return not (top or bottom or left or right)
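A quick sanity check of the helper on toy masks (the masks are ours): a blob touching the top border is reported as truncated, a centered blob is not.

import numpy as np

mask = np.zeros((64, 64), dtype=np.uint8)
mask[0:10, 20:40] = 1                         # object touches the top edge
assert not check_object_edge_truncated(mask)  # truncated -> False

mask = np.zeros((64, 64), dtype=np.uint8)
mask[20:40, 20:40] = 1                        # object fully inside the 5 px margin
assert check_object_edge_truncated(mask)      # enclosed -> True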
if __name__ == "__main__":
merge_video_video(
"outputs/imageto3d/room_bottle7/room_bottle_007/URDF_room_bottle_007/mesh_glo_normal.mp4", # noqa


@@ -113,8 +113,8 @@ class MeshGeoChecker(BaseChecker):
Your task is to evaluate the quality of the 3D asset generation,
including geometry, structure, and appearance, based on the rendered views.
Criteria:
- Is the geometry complete and well-formed, without missing parts or redundant structures?
- Is the geometric structure of the object complete?
- Is the object in the image a single, complete, and well-formed instance,
without truncation, missing parts, overlapping duplicates, or redundant geometry?
- Minor flaws, asymmetries, or simplifications (e.g., less detail on sides or back,
soft edges) are acceptable if the object is structurally sound and recognizable.
- Only evaluate geometry. Do not assess texture quality.
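For reference, the checkers are invoked as callables returning a (flag, explanation) pair, as seen with ImageSegChecker in the text-to-3D script above. A sketch of the assumed call pattern for this checker (the image list and paths are placeholders; the exact signature is not shown in this diff):

from PIL import Image

views = [Image.open("renders/view0.png"), Image.open("renders/view1.png")]
geo_flag, geo_result = MeshGeoChecker(GPT_CLIENT)(views)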
@@ -241,10 +241,11 @@ class SemanticConsistChecker(BaseChecker):
Criteria:
- The image must visually match the text description in terms of object type, structure, geometry, and color.
- The object must appear realistic, with reasonable geometry (e.g., a table must have a stable number of legs).
- The object must appear realistic, with reasonable geometry (e.g., a table must have a stable number
of legs with a reasonable distribution. Count the number of legs visible in the image; strictly, a table
with fewer than four legs, or with unevenly distributed legs, is not allowed. Do not assume
hidden legs unless they are clearly visible.)
- Geometric completeness is required: the object must not have missing, truncated, or cropped parts.
- The object must be centered in the image frame with clear margins on all sides;
it must not touch or nearly touch any image edge.
- The image must contain exactly one object. Multiple distinct objects are not allowed.
A single composite object (e.g., a chair with legs) is acceptable.
- The object should be shown from a slightly angled (three-quarter) perspective,


@@ -101,34 +101,42 @@ class URDFGenerator(object):
prompt_template = (
view_desc
+ """of the 3D object asset,
category: {category}.
You are an expert in 3D object analysis and physical property estimation.
Give the category of this object asset (within 3 words),
(if category is already provided, use it directly),
accurately describe this 3D object asset (within 15 words),
and give the recommended geometric height range (unit: meter),
weight range (unit: kilogram), the average static friction
coefficient of the object relative to rubber and the average
dynamic friction coefficient of the object relative to rubber.
Return response format as shown in Output Example.
category: {category}.
You are an expert in 3D object analysis and physical property estimation.
Give the category of this object asset (within 3 words; if a category is
already provided, use it directly) and accurately describe this 3D object asset (within 15 words).
Determine the pose of the object in the first image and estimate the true vertical height
(vertical projection) range of the object (in meters), i.e., how tall the object appears from top
to bottom in the front (first) view. Also give the weight range (unit: kilogram), the average
static friction coefficient of the object relative to rubber, and the average dynamic friction
coefficient of the object relative to rubber. Return the response in the format shown in the Output Example.
IMPORTANT:
Inputed images are orthographic projection showing the front, left, right and back views,
the first image is always the front view. Use the object's pose and orientation in the
rendered images to estimate its **true vertical height as it appears in the image**,
not the real-world length or width of the object.
For example:
- A pen standing upright in the front view vertical height: 0.15-0.2 m
- A pen lying horizontally in the front view vertical height: 0.01-0.02 m
(based on its thickness in the image)
Output Example:
Category: cup
Description: shiny golden cup with floral design
Height: 0.1-0.15 m
Weight: 0.3-0.6 kg
Static friction coefficient: 0.6
Dynamic friction coefficient: 0.5
Output Example:
Category: cup
Description: shiny golden cup with floral design
Height: 0.1-0.15 m
Weight: 0.3-0.6 kg
Static friction coefficient: 1.1
Dynamic friction coefficient: 0.9
IMPORTANT: Estimating Vertical Height from the First (Front View) Image.
- The "vertical height" refers to the real-world vertical size of the object
as projected in the first image, aligned with the image's vertical axis.
- For flat objects like plates, disks, or books: if the face is visible in the front view,
use the diameter as the vertical height; if the edge is visible, use the thickness instead.
- This is not necessarily the full length of the object, but how tall it appears
in the first image vertically, based on its pose and orientation.
- For objects shown at an angle in the first image (e.g., spoons, forks, writing instruments),
an object tilted at 45° will appear shorter vertically than when upright.
Estimate the vertical projection of its real length based on its pose.
For example:
- A pen standing upright in the first view (aligned with the image's vertical axis),
full body visible in the first image: vertical height 0.14-0.20 m
- A pen lying flat in the front view (showing its thickness): vertical height 0.018-0.025 m
- A pen tilted in the first image (e.g., at ~45°): vertical height 0.07-0.12 m
- Use the remaining views (all except the first image) to help determine the object's 3D pose and orientation.
Assume the object is in real-world scale and estimate the approximate vertical height
(in meters) based on how large it appears vertically in the first image.
"""
)
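A quick check of the tilt guideline in the new prompt (the arithmetic is ours): the vertical projection of a length-L object tilted t degrees from the image's vertical axis is L * cos(t), which reproduces the pen ranges above for a typical 0.15 m pen.

import math

pen_length = 0.15  # meters, assumed real pen length
for tilt_deg in (0, 45, 90):
    proj = pen_length * math.cos(math.radians(tilt_deg))
    print(f"tilt {tilt_deg:>2} deg -> vertical height ~ {proj:.3f} m")
# 0 deg  -> 0.150 m: upright, inside the 0.14-0.20 m range
# 45 deg -> 0.106 m: inside the 0.07-0.12 m range
# 90 deg -> 0.000 m projection; a lying pen's height is then its
#           thickness, ~0.018-0.025 m per the prompt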
@@ -374,6 +382,7 @@ class URDFGenerator(object):
)
response = self.gpt_client.query(text_prompt, image_path)
# logger.info(response)
if response is None:
asset_attrs = {
"category": category.lower(),