feat(urdf): Improve the scale restoration logic to make it more robust. (#17)

Improve the scale restoration logic to make it more robust.
2025-06-27 00:39:42 +08:00 · 2025-06-27 00:39:42 +08:00 · e8de0e44df
commit e8de0e44df
parent 52983c8de2
6 changed files with 55 additions and 37 deletions
--- a/apps/image_to_3d.py
+++ b/apps/image_to_3d.py
@ -40,6 +40,8 @@ from common import (
 )
 with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
    gr.HTML(image_css, visible=False)
    gr.HTML(lighting_css, visible=False)
    gr.Markdown(
        """
        ## ***EmbodiedGen***: Image-to-3D Asset
@ -54,21 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
            <a href="https://github.com/HorizonRobotics/EmbodiedGen">
                <img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
            </a>
-            <a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
+            <a href="https://www.youtube.com/watch?v=rG4odybuJRk">
                <img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
            </a>
        </p>
        🖼️ Generate physically plausible 3D asset from single input image.
        """.format(
            VERSION=VERSION
        ),
        elem_classes=["header"],
    )
    gr.HTML(image_css)
    gr.HTML(lighting_css)
    with gr.Row():
        with gr.Column(scale=2):
            with gr.Tabs() as input_tabs:
@ -239,9 +238,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                )
            gr.Markdown(
-                """ NOTE: If `Asset Attributes` are provided, the provided
+                """ NOTE: If `Asset Attributes` are provided, it will guide
-                properties will be used; otherwise, the GPT-preset properties
+                GPT to perform physical attributes restoration. \n
                will be applied. \n
                The `Download URDF` file is restored to the real scale and
                has quality inspection, open with an editor to view details.
            """
@ -279,6 +277,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                    examples_per_page=10,
                )
        with gr.Column(scale=1):
            gr.Markdown("<br>")
            video_output = gr.Video(
                label="Generated 3D Asset",
                autoplay=True,
--- a/apps/text_to_3d.py
+++ b/apps/text_to_3d.py
@ -40,6 +40,8 @@ from common import (
 )
 with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
    gr.HTML(image_css, visible=False)
    gr.HTML(lighting_css, visible=False)
    gr.Markdown(
        """
        ## ***EmbodiedGen***: Text-to-3D Asset
@ -54,20 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
            <a href="https://github.com/HorizonRobotics/EmbodiedGen">
                <img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
            </a>
-            <a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
+            <a href="https://www.youtube.com/watch?v=rG4odybuJRk">
                <img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
            </a>
        </p>
        📝 Create 3D assets from text descriptions for a wide range of geometry and styles.
        """.format(
            VERSION=VERSION
        ),
        elem_classes=["header"],
    )
-    gr.HTML(image_css)
+
    gr.HTML(lighting_css)
    with gr.Row():
        with gr.Column(scale=1):
            raw_image_cache = gr.Image(
@ -267,8 +267,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                    visible=False,
                )
            gr.Markdown(
-                "The generated image may be of poor quality due to auto "
+                "Generated image may be poor quality due to auto seg. "
-                "segmentation. Try adjusting the text prompt or seed."
+                "Retry by adjusting text prompt, seed or switch seg model in `Image Gen Settings`."
            )
            with gr.Row():
                video_output = gr.Video(
--- a/apps/texture_edit.py
+++ b/apps/texture_edit.py
@ -50,6 +50,8 @@ def active_btn_by_content(mesh_content: gr.Model3D, text_content: gr.Textbox):
 with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
    gr.HTML(image_css, visible=False)
    gr.HTML(lighting_css, visible=False)
    gr.Markdown(
        """
        ## ***EmbodiedGen***: Texture Generation
@ -64,30 +66,33 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
            <a href="https://github.com/HorizonRobotics/EmbodiedGen">
                <img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
            </a>
-            <a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
+            <a href="https://www.youtube.com/watch?v=rG4odybuJRk">
                <img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
            </a>
        </p>
        🎨 Generate visually rich textures for 3D mesh.
        """.format(
            VERSION=VERSION
        ),
        elem_classes=["header"],
    )
-    gr.HTML(image_css)
+
    gr.HTML(lighting_css)
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                "You can select input in `Mesh Gallery` at page bottom."
            )
            mesh_input = gr.Model3D(
-                label="Upload Mesh File(.obj or .glb)", height=300
+                label="Upload Mesh File(.obj or .glb)", height=270
            )
            local_mesh = gr.Textbox(visible=False)
            text_prompt = gr.Textbox(
                label="Text Prompt (Chinese or English)",
                placeholder="Input text prompt here",
            )
            gr.Markdown("<br>")
            ip_image = gr.Image(
                label="Reference Image(optional)",
                format="png",
@ -97,8 +102,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                elem_classes=["image_fit"],
            )
            gr.Markdown(
-                "Note: The `reference image` is optional. If provided, please "
+                "Note: The `reference image` is optional. If provided, "
-                "increase the `Condition Scale` in Generation Settings."
+                "increase `Condition Scale` in Generation Settings."
            )
            with gr.Accordion(label="Generation Settings", open=False):
@ -139,12 +144,6 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                    512, 2048, label="Video Resolution", value=512, step=256
                )
            generate_mv_btn = gr.Button(
                "🎨 1. Generate MV Images(~1min)",
                variant="primary",
                interactive=False,
            )
        with gr.Column(scale=3):
            with gr.Row():
                image_sample1 = gr.Image(
@ -194,10 +193,10 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                    visible=False,
                )
-            gr.Markdown(
+            # gr.Markdown(
-                "Note: Select samples with consistent textures from various "
+            #     "Note: Select samples with consistent textures from various "
-                "perspectives and no obvious reflections."
+            #     "perspectives and no obvious reflections."
-            )
+            # )
            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Row():
@ -222,6 +221,11 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                        )
                with gr.Column(scale=1):
                    generate_mv_btn = gr.Button(
                        "🎨 1. Generate MV Images(~1min)",
                        variant="primary",
                        interactive=False,
                    )
                    texture_bake_btn = gr.Button(
                        "🛠️ 2. Texture Baking(~2min)",
                        variant="primary",
@ -237,7 +241,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                mesh_output = gr.Model3D(
                    label="Mesh Edit Result",
                    clear_color=[0.8, 0.8, 0.8, 1],
-                    height=380,
+                    height=340,
                    interactive=False,
                    elem_id="lighter_mesh",
                )
@ -246,7 +250,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                    label="Mesh Edit Video",
                    autoplay=True,
                    loop=True,
-                    height=380,
+                    height=340,
                )
    with gr.Row():
--- a/embodied_gen/models/text_model.py
+++ b/embodied_gen/models/text_model.py
@ -53,6 +53,9 @@ __all__ = [
 ]
 PROMPT_APPEND = "Full view of one {}, no cropping, centered, no occlusion, isolated product photo, matte, 3D style, on a plain clean surface"
 def download_kolors_weights(local_dir: str = "weights/Kolors") -> None:
    logger.info(f"Download kolors weights from huggingface...")
    os.makedirs(local_dir, exist_ok=True)
@ -179,8 +182,9 @@ def text2img_gen(
    ip_image_size: int = 512,
    seed: int = None,
 ) -> list[Image.Image]:
-    prompt = "Single " + prompt + ", in the center of the image"
+    # prompt = "Single " + prompt + ", in the center of the image"
-    prompt += ", high quality, high resolution, best quality, white background, 3D style"  # noqa
+    # prompt += ", high quality, high resolution, best quality, white background, 3D style"  # noqa
    prompt = PROMPT_APPEND.format(prompt.strip())
    logger.info(f"Processing prompt: {prompt}")
    generator = None
--- a/embodied_gen/validators/urdf_convertor.py
+++ b/embodied_gen/validators/urdf_convertor.py
@ -102,6 +102,7 @@ class URDFGenerator(object):
                view_desc
                + """of the 3D object asset,
                category: {category}.
                You are an expert in 3D object analysis and physical property estimation.
                Give the category of this object asset (within 3 words),
                (if category is already provided, use it directly),
                accurately describe this 3D object asset (within 15 words),
@ -109,9 +110,19 @@ class URDFGenerator(object):
                weight range (unit: kilogram), the average static friction
                coefficient of the object relative to rubber and the average
                dynamic friction coefficient of the object relative to rubber.
-                Return response format as shown in Example.
+                Return response format as shown in Output Example.
-                Example:
+                IMPORTANT:
                Inputed images are orthographic projection showing the front, left, right and back views,
                the first image is always the front view. Use the object's pose and orientation in the
                rendered images to estimate its **true vertical height as it appears in the image**,
                not the real-world length or width of the object.
                For example:
                - A pen standing upright in the front view → vertical height: 0.15-0.2 m
                - A pen lying horizontally in the front view → vertical height: 0.01-0.02 m
                    (based on its thickness in the image)
                Output Example:
                Category: cup
                Description: shiny golden cup with floral design
                Height: 0.1-0.15 m
--- a/requirements.txt
+++ b/requirements.txt
@ -30,7 +30,7 @@ realesrgan==0.3.0
 pydantic==2.9.2
 vtk==9.3.1
 spaces
-utils3d@git+https://github.com/EasternJournalist/utils3d.git@9a4eb15e4021b67b12c460c7057d642626897ec8
+utils3d@git+https://github.com/EasternJournalist/utils3d.git#egg=9a4eb15
 clip@git+https://github.com/openai/CLIP.git
 kolors@git+https://github.com/Kwai-Kolors/Kolors.git#egg=038818d
 segment-anything@git+https://github.com/facebookresearch/segment-anything.git#egg=dca509f