feat(urdf): Improve the scale restoration logic to make it more robust. (#17)

Improve the scale restoration logic to make it more robust.
2025-06-27 00:39:42 +08:00 · 2025-06-27 00:39:42 +08:00 · e8de0e44df
commit e8de0e44df
parent 52983c8de2
6 changed files with 55 additions and 37 deletions
--- a/apps/image_to_3d.py
+++ b/apps/image_to_3d.py
@ -40,6 +40,8 @@ from common import (
 )

 with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
+    gr.HTML(image_css, visible=False)
+    gr.HTML(lighting_css, visible=False)
    gr.Markdown(
        """
        ## ***EmbodiedGen***: Image-to-3D Asset
@ -54,21 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
            <a href="https://github.com/HorizonRobotics/EmbodiedGen">
                <img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
            </a>
-            <a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
+            <a href="https://www.youtube.com/watch?v=rG4odybuJRk">
                <img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
            </a>
        </p>

        🖼️ Generate physically plausible 3D asset from single input image.
-
        """.format(
            VERSION=VERSION
        ),
        elem_classes=["header"],
    )

-    gr.HTML(image_css)
-    gr.HTML(lighting_css)
    with gr.Row():
        with gr.Column(scale=2):
            with gr.Tabs() as input_tabs:
@ -239,9 +238,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                )

            gr.Markdown(
-                """ NOTE: If `Asset Attributes` are provided, the provided
-                properties will be used; otherwise, the GPT-preset properties
-                will be applied. \n
+                """ NOTE: If `Asset Attributes` are provided, it will guide
+                GPT to perform physical attributes restoration. \n
                The `Download URDF` file is restored to the real scale and
                has quality inspection, open with an editor to view details.
            """
@ -279,6 +277,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                    examples_per_page=10,
                )
        with gr.Column(scale=1):
+            gr.Markdown("<br>")
            video_output = gr.Video(
                label="Generated 3D Asset",
                autoplay=True,
--- a/apps/text_to_3d.py
+++ b/apps/text_to_3d.py
@ -40,6 +40,8 @@ from common import (
 )

 with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
+    gr.HTML(image_css, visible=False)
+    gr.HTML(lighting_css, visible=False)
    gr.Markdown(
        """
        ## ***EmbodiedGen***: Text-to-3D Asset
@ -54,20 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
            <a href="https://github.com/HorizonRobotics/EmbodiedGen">
                <img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
            </a>
-            <a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
+            <a href="https://www.youtube.com/watch?v=rG4odybuJRk">
                <img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
            </a>
        </p>

        📝 Create 3D assets from text descriptions for a wide range of geometry and styles.
-
        """.format(
            VERSION=VERSION
        ),
        elem_classes=["header"],
    )
-    gr.HTML(image_css)
-    gr.HTML(lighting_css)
+
    with gr.Row():
        with gr.Column(scale=1):
            raw_image_cache = gr.Image(
@ -267,8 +267,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                    visible=False,
                )
            gr.Markdown(
-                "The generated image may be of poor quality due to auto "
-                "segmentation. Try adjusting the text prompt or seed."
+                "Generated image may be poor quality due to auto seg. "
+                "Retry by adjusting text prompt, seed or switch seg model in `Image Gen Settings`."
            )
            with gr.Row():
                video_output = gr.Video(
--- a/apps/texture_edit.py
+++ b/apps/texture_edit.py
@ -50,6 +50,8 @@ def active_btn_by_content(mesh_content: gr.Model3D, text_content: gr.Textbox):


 with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
+    gr.HTML(image_css, visible=False)
+    gr.HTML(lighting_css, visible=False)
    gr.Markdown(
        """
        ## ***EmbodiedGen***: Texture Generation
@ -64,30 +66,33 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
            <a href="https://github.com/HorizonRobotics/EmbodiedGen">
                <img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
            </a>
-            <a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
+            <a href="https://www.youtube.com/watch?v=rG4odybuJRk">
                <img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
            </a>
        </p>

        🎨 Generate visually rich textures for 3D mesh.
-
        """.format(
            VERSION=VERSION
        ),
        elem_classes=["header"],
    )
-    gr.HTML(image_css)
-    gr.HTML(lighting_css)
+
    with gr.Row():
        with gr.Column(scale=1):
+            gr.Markdown(
+                "You can select input in `Mesh Gallery` at page bottom."
+            )
            mesh_input = gr.Model3D(
-                label="Upload Mesh File(.obj or .glb)", height=300
+                label="Upload Mesh File(.obj or .glb)", height=270
            )
            local_mesh = gr.Textbox(visible=False)
            text_prompt = gr.Textbox(
                label="Text Prompt (Chinese or English)",
                placeholder="Input text prompt here",
            )
+            gr.Markdown("<br>")
+
            ip_image = gr.Image(
                label="Reference Image(optional)",
                format="png",
@ -97,8 +102,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                elem_classes=["image_fit"],
            )
            gr.Markdown(
-                "Note: The `reference image` is optional. If provided, please "
-                "increase the `Condition Scale` in Generation Settings."
+                "Note: The `reference image` is optional. If provided, "
+                "increase `Condition Scale` in Generation Settings."
            )

            with gr.Accordion(label="Generation Settings", open=False):
@ -139,12 +144,6 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                    512, 2048, label="Video Resolution", value=512, step=256
                )

-            generate_mv_btn = gr.Button(
-                "🎨 1. Generate MV Images(~1min)",
-                variant="primary",
-                interactive=False,
-            )
-
        with gr.Column(scale=3):
            with gr.Row():
                image_sample1 = gr.Image(
@ -194,10 +193,10 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                    visible=False,
                )

-            gr.Markdown(
-                "Note: Select samples with consistent textures from various "
-                "perspectives and no obvious reflections."
-            )
+            # gr.Markdown(
+            #     "Note: Select samples with consistent textures from various "
+            #     "perspectives and no obvious reflections."
+            # )
            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Row():
@ -222,6 +221,11 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                        )

                with gr.Column(scale=1):
+                    generate_mv_btn = gr.Button(
+                        "🎨 1. Generate MV Images(~1min)",
+                        variant="primary",
+                        interactive=False,
+                    )
                    texture_bake_btn = gr.Button(
                        "🛠️ 2. Texture Baking(~2min)",
                        variant="primary",
@ -237,7 +241,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                mesh_output = gr.Model3D(
                    label="Mesh Edit Result",
                    clear_color=[0.8, 0.8, 0.8, 1],
-                    height=380,
+                    height=340,
                    interactive=False,
                    elem_id="lighter_mesh",
                )
@ -246,7 +250,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                    label="Mesh Edit Video",
                    autoplay=True,
                    loop=True,
-                    height=380,
+                    height=340,
                )

    with gr.Row():
--- a/embodied_gen/models/text_model.py
+++ b/embodied_gen/models/text_model.py
@ -53,6 +53,9 @@ __all__ = [
 ]


+PROMPT_APPEND = "Full view of one {}, no cropping, centered, no occlusion, isolated product photo, matte, 3D style, on a plain clean surface"
+
+
 def download_kolors_weights(local_dir: str = "weights/Kolors") -> None:
    logger.info(f"Download kolors weights from huggingface...")
    os.makedirs(local_dir, exist_ok=True)
@ -179,8 +182,9 @@ def text2img_gen(
    ip_image_size: int = 512,
    seed: int = None,
 ) -> list[Image.Image]:
-    prompt = "Single " + prompt + ", in the center of the image"
-    prompt += ", high quality, high resolution, best quality, white background, 3D style"  # noqa
+    # prompt = "Single " + prompt + ", in the center of the image"
+    # prompt += ", high quality, high resolution, best quality, white background, 3D style"  # noqa
+    prompt = PROMPT_APPEND.format(prompt.strip())
    logger.info(f"Processing prompt: {prompt}")

    generator = None
--- a/embodied_gen/validators/urdf_convertor.py
+++ b/embodied_gen/validators/urdf_convertor.py
@ -102,6 +102,7 @@ class URDFGenerator(object):
                view_desc
                + """of the 3D object asset,
                category: {category}.
+                You are an expert in 3D object analysis and physical property estimation.
                Give the category of this object asset (within 3 words),
                (if category is already provided, use it directly),
                accurately describe this 3D object asset (within 15 words),
@ -109,9 +110,19 @@ class URDFGenerator(object):
                weight range (unit: kilogram), the average static friction
                coefficient of the object relative to rubber and the average
                dynamic friction coefficient of the object relative to rubber.
-                Return response format as shown in Example.
+                Return response format as shown in Output Example.

-                Example:
+                IMPORTANT:
+                Input images are orthographic projections showing the front, left, right and back views,
+                the first image is always the front view. Use the object's pose and orientation in the
+                rendered images to estimate its **true vertical height as it appears in the image**,
+                not the real-world length or width of the object.
+                For example:
+                - A pen standing upright in the front view → vertical height: 0.15-0.2 m
+                - A pen lying horizontally in the front view → vertical height: 0.01-0.02 m
+                    (based on its thickness in the image)
+
+                Output Example:
                Category: cup
                Description: shiny golden cup with floral design
                Height: 0.1-0.15 m
--- a/requirements.txt
+++ b/requirements.txt
@ -30,7 +30,7 @@ realesrgan==0.3.0
 pydantic==2.9.2
 vtk==9.3.1
 spaces
-utils3d@git+https://github.com/EasternJournalist/utils3d.git@9a4eb15e4021b67b12c460c7057d642626897ec8
+utils3d@git+https://github.com/EasternJournalist/utils3d.git#egg=9a4eb15
 clip@git+https://github.com/openai/CLIP.git
 kolors@git+https://github.com/Kwai-Kolors/Kolors.git#egg=038818d
 segment-anything@git+https://github.com/facebookresearch/segment-anything.git#egg=dca509f