From e8de0e44df344a94fd96f6a016d63f843bf23fe3 Mon Sep 17 00:00:00 2001
From: Xinjie <mrwangxinjie@163.com>
Date: Fri, 27 Jun 2025 00:39:42 +0800
Subject: [PATCH] feat(urdf): Improve the scale restoration logic to make it
 more robust.(#17)

Improve the scale restoration logic to make it more robust.
---
 apps/image_to_3d.py                       | 13 ++++---
 apps/text_to_3d.py                        | 12 +++----
 apps/texture_edit.py                      | 42 +++++++++++++----------
 embodied_gen/models/text_model.py         |  8 +++--
 embodied_gen/validators/urdf_convertor.py | 15 ++++++--
 requirements.txt                          |  2 +-
 6 files changed, 55 insertions(+), 37 deletions(-)
diff --git a/apps/image_to_3d.py b/apps/image_to_3d.py
index 039b953..752d031 100644
--- a/apps/image_to_3d.py
+++ b/apps/image_to_3d.py
@@ -40,6 +40,8 @@ from common import (
 )
 
 with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
+    gr.HTML(image_css, visible=False)
+    gr.HTML(lighting_css, visible=False)
     gr.Markdown(
         """
         ## ***EmbodiedGen***: Image-to-3D Asset
@@ -54,21 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
             <a href="https://github.com/HorizonRobotics/EmbodiedGen">
                 <img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
             </a>
-            <a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
+            <a href="https://www.youtube.com/watch?v=rG4odybuJRk">
                 <img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
             </a>
         </p>
 
         🖼️ Generate physically plausible 3D asset from single input image.
-
         """.format(
             VERSION=VERSION
         ),
         elem_classes=["header"],
     )
 
-    gr.HTML(image_css)
-    gr.HTML(lighting_css)
     with gr.Row():
         with gr.Column(scale=2):
             with gr.Tabs() as input_tabs:
@@ -239,9 +238,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                 )
 
             gr.Markdown(
-                """ NOTE: If `Asset Attributes` are provided, the provided
-                properties will be used; otherwise, the GPT-preset properties
-                will be applied. \n
+                """ NOTE: If `Asset Attributes` are provided, they will guide
+                GPT in restoring the physical attributes. \n
                 The `Download URDF` file is restored to the real scale and
                 has quality inspection, open with an editor to view details.
             """
@@ -279,6 +277,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                     examples_per_page=10,
                 )
         with gr.Column(scale=1):
+            gr.Markdown("<br>")
             video_output = gr.Video(
                 label="Generated 3D Asset",
                 autoplay=True,
diff --git a/apps/text_to_3d.py b/apps/text_to_3d.py
index 21388f5..7bf8380 100644
--- a/apps/text_to_3d.py
+++ b/apps/text_to_3d.py
@@ -40,6 +40,8 @@ from common import (
 )
 
 with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
+    gr.HTML(image_css, visible=False)
+    gr.HTML(lighting_css, visible=False)
     gr.Markdown(
         """
         ## ***EmbodiedGen***: Text-to-3D Asset
@@ -54,20 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
             <a href="https://github.com/HorizonRobotics/EmbodiedGen">
                 <img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
             </a>
-            <a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
+            <a href="https://www.youtube.com/watch?v=rG4odybuJRk">
                 <img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
             </a>
         </p>
 
         📝 Create 3D assets from text descriptions for a wide range of geometry and styles.
-
         """.format(
             VERSION=VERSION
         ),
         elem_classes=["header"],
     )
-    gr.HTML(image_css)
-    gr.HTML(lighting_css)
+
     with gr.Row():
         with gr.Column(scale=1):
             raw_image_cache = gr.Image(
@@ -267,8 +267,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                     visible=False,
                 )
             gr.Markdown(
-                "The generated image may be of poor quality due to auto "
-                "segmentation. Try adjusting the text prompt or seed."
+                "Generated image may be poor quality due to auto seg. "
+                "Retry by adjusting text prompt, seed or switch seg model in `Image Gen Settings`."
             )
             with gr.Row():
                 video_output = gr.Video(
diff --git a/apps/texture_edit.py b/apps/texture_edit.py
index ca7d1d4..e505082 100644
--- a/apps/texture_edit.py
+++ b/apps/texture_edit.py
@@ -50,6 +50,8 @@ def active_btn_by_content(mesh_content: gr.Model3D, text_content: gr.Textbox):
 
 
 with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
+    gr.HTML(image_css, visible=False)
+    gr.HTML(lighting_css, visible=False)
     gr.Markdown(
         """
         ## ***EmbodiedGen***: Texture Generation
@@ -64,30 +66,33 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
             <a href="https://github.com/HorizonRobotics/EmbodiedGen">
                 <img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
             </a>
-            <a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
+            <a href="https://www.youtube.com/watch?v=rG4odybuJRk">
                 <img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
             </a>
         </p>
 
         🎨 Generate visually rich textures for 3D mesh.
-
         """.format(
             VERSION=VERSION
         ),
         elem_classes=["header"],
     )
-    gr.HTML(image_css)
-    gr.HTML(lighting_css)
+
     with gr.Row():
         with gr.Column(scale=1):
+            gr.Markdown(
+                "You can select input in `Mesh Gallery` at page bottom."
+            )
             mesh_input = gr.Model3D(
-                label="Upload Mesh File(.obj or .glb)", height=300
+                label="Upload Mesh File(.obj or .glb)", height=270
             )
             local_mesh = gr.Textbox(visible=False)
             text_prompt = gr.Textbox(
                 label="Text Prompt (Chinese or English)",
                 placeholder="Input text prompt here",
             )
+            gr.Markdown("<br>")
+
             ip_image = gr.Image(
                 label="Reference Image(optional)",
                 format="png",
@@ -97,8 +102,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                 elem_classes=["image_fit"],
             )
             gr.Markdown(
-                "Note: The `reference image` is optional. If provided, please "
-                "increase the `Condition Scale` in Generation Settings."
+                "Note: The `reference image` is optional. If provided, "
+                "increase `Condition Scale` in Generation Settings."
             )
 
             with gr.Accordion(label="Generation Settings", open=False):
@@ -139,12 +144,6 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                     512, 2048, label="Video Resolution", value=512, step=256
                 )
 
-            generate_mv_btn = gr.Button(
-                "🎨 1. Generate MV Images(~1min)",
-                variant="primary",
-                interactive=False,
-            )
-
         with gr.Column(scale=3):
             with gr.Row():
                 image_sample1 = gr.Image(
@@ -194,10 +193,10 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                     visible=False,
                 )
 
-            gr.Markdown(
-                "Note: Select samples with consistent textures from various "
-                "perspectives and no obvious reflections."
-            )
+            # gr.Markdown(
+            #     "Note: Select samples with consistent textures from various "
+            #     "perspectives and no obvious reflections."
+            # )
             with gr.Row():
                 with gr.Column(scale=1):
                     with gr.Row():
@@ -222,6 +221,11 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                         )
 
                 with gr.Column(scale=1):
+                    generate_mv_btn = gr.Button(
+                        "🎨 1. Generate MV Images(~1min)",
+                        variant="primary",
+                        interactive=False,
+                    )
                     texture_bake_btn = gr.Button(
                         "🛠️ 2. Texture Baking(~2min)",
                         variant="primary",
@@ -237,7 +241,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                 mesh_output = gr.Model3D(
                     label="Mesh Edit Result",
                     clear_color=[0.8, 0.8, 0.8, 1],
-                    height=380,
+                    height=340,
                     interactive=False,
                     elem_id="lighter_mesh",
                 )
@@ -246,7 +250,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                     label="Mesh Edit Video",
                     autoplay=True,
                     loop=True,
-                    height=380,
+                    height=340,
                 )
 
     with gr.Row():
diff --git a/embodied_gen/models/text_model.py b/embodied_gen/models/text_model.py
index 6379b16..7ea8c4c 100644
--- a/embodied_gen/models/text_model.py
+++ b/embodied_gen/models/text_model.py
@@ -53,6 +53,9 @@ __all__ = [
 ]
 
 
+PROMPT_APPEND = "Full view of one {}, no cropping, centered, no occlusion, isolated product photo, matte, 3D style, on a plain clean surface"
+
+
 def download_kolors_weights(local_dir: str = "weights/Kolors") -> None:
     logger.info(f"Download kolors weights from huggingface...")
     os.makedirs(local_dir, exist_ok=True)
@@ -179,8 +182,9 @@ def text2img_gen(
     ip_image_size: int = 512,
     seed: int = None,
 ) -> list[Image.Image]:
-    prompt = "Single " + prompt + ", in the center of the image"
-    prompt += ", high quality, high resolution, best quality, white background, 3D style"  # noqa
+    # prompt = "Single " + prompt + ", in the center of the image"
+    # prompt += ", high quality, high resolution, best quality, white background, 3D style"  # noqa
+    prompt = PROMPT_APPEND.format(prompt.strip())
     logger.info(f"Processing prompt: {prompt}")
 
     generator = None
diff --git a/embodied_gen/validators/urdf_convertor.py b/embodied_gen/validators/urdf_convertor.py
index a830519..076c01a 100644
--- a/embodied_gen/validators/urdf_convertor.py
+++ b/embodied_gen/validators/urdf_convertor.py
@@ -102,6 +102,7 @@ class URDFGenerator(object):
                 view_desc
                 + """of the 3D object asset,
                 category: {category}.
+                You are an expert in 3D object analysis and physical property estimation.
                 Give the category of this object asset (within 3 words),
                 (if category is already provided, use it directly),
                 accurately describe this 3D object asset (within 15 words),
@@ -109,9 +110,19 @@ class URDFGenerator(object):
                 weight range (unit: kilogram), the average static friction
                 coefficient of the object relative to rubber and the average
                 dynamic friction coefficient of the object relative to rubber.
-                Return response format as shown in Example.
+                Return response format as shown in Output Example.
 
-                Example:
+                IMPORTANT:
+                The input images are orthographic projections showing the front, left, right and back views,
+                the first image is always the front view. Use the object's pose and orientation in the
+                rendered images to estimate its **true vertical height as it appears in the image**,
+                not the real-world length or width of the object.
+                For example:
+                - A pen standing upright in the front view → vertical height: 0.15-0.2 m
+                - A pen lying horizontally in the front view → vertical height: 0.01-0.02 m
+                    (based on its thickness in the image)
+
+                Output Example:
                 Category: cup
                 Description: shiny golden cup with floral design
                 Height: 0.1-0.15 m
diff --git a/requirements.txt b/requirements.txt
index 8310d08..79e459a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,7 +30,7 @@ realesrgan==0.3.0
 pydantic==2.9.2
 vtk==9.3.1
 spaces
-utils3d@git+https://github.com/EasternJournalist/utils3d.git@9a4eb15e4021b67b12c460c7057d642626897ec8
+utils3d@git+https://github.com/EasternJournalist/utils3d.git#egg=9a4eb15
 clip@git+https://github.com/openai/CLIP.git
 kolors@git+https://github.com/Kwai-Kolors/Kolors.git#egg=038818d
 segment-anything@git+https://github.com/facebookresearch/segment-anything.git#egg=dca509f