From e8de0e44df344a94fd96f6a016d63f843bf23fe3 Mon Sep 17 00:00:00 2001
From: Xinjie
Date: Fri, 27 Jun 2025 00:39:42 +0800
Subject: [PATCH] feat(urdf): Improve the scale restoration logic to make it
more robust. (#17)
Improve the scale restoration logic to make it more robust.
---
apps/image_to_3d.py | 13 ++++---
apps/text_to_3d.py | 12 +++----
apps/texture_edit.py | 42 +++++++++++++----------
embodied_gen/models/text_model.py | 8 +++--
embodied_gen/validators/urdf_convertor.py | 15 ++++++--
requirements.txt | 2 +-
6 files changed, 55 insertions(+), 37 deletions(-)
diff --git a/apps/image_to_3d.py b/apps/image_to_3d.py
index 039b953..752d031 100644
--- a/apps/image_to_3d.py
+++ b/apps/image_to_3d.py
@@ -40,6 +40,8 @@ from common import (
)
with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
+ gr.HTML(image_css, visible=False)
+ gr.HTML(lighting_css, visible=False)
gr.Markdown(
"""
## ***EmbodiedGen***: Image-to-3D Asset
@@ -54,21 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
-
+
🖼️ Generate physically plausible 3D asset from single input image.
-
""".format(
VERSION=VERSION
),
elem_classes=["header"],
)
- gr.HTML(image_css)
- gr.HTML(lighting_css)
with gr.Row():
with gr.Column(scale=2):
with gr.Tabs() as input_tabs:
@@ -239,9 +238,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
)
gr.Markdown(
- """ NOTE: If `Asset Attributes` are provided, the provided
- properties will be used; otherwise, the GPT-preset properties
- will be applied. \n
+ """ NOTE: If `Asset Attributes` are provided, it will guide
+ GPT to perform physical attributes restoration. \n
The `Download URDF` file is restored to the real scale and
has quality inspection, open with an editor to view details.
"""
@@ -279,6 +277,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
examples_per_page=10,
)
with gr.Column(scale=1):
+ gr.Markdown("<br>")
video_output = gr.Video(
label="Generated 3D Asset",
autoplay=True,
diff --git a/apps/text_to_3d.py b/apps/text_to_3d.py
index 21388f5..7bf8380 100644
--- a/apps/text_to_3d.py
+++ b/apps/text_to_3d.py
@@ -40,6 +40,8 @@ from common import (
)
with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
+ gr.HTML(image_css, visible=False)
+ gr.HTML(lighting_css, visible=False)
gr.Markdown(
"""
## ***EmbodiedGen***: Text-to-3D Asset
@@ -54,20 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
-
+
📝 Create 3D assets from text descriptions for a wide range of geometry and styles.
-
""".format(
VERSION=VERSION
),
elem_classes=["header"],
)
- gr.HTML(image_css)
- gr.HTML(lighting_css)
+
with gr.Row():
with gr.Column(scale=1):
raw_image_cache = gr.Image(
@@ -267,8 +267,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
visible=False,
)
gr.Markdown(
- "The generated image may be of poor quality due to auto "
- "segmentation. Try adjusting the text prompt or seed."
+ "Generated image may be poor quality due to auto seg. "
+ "Retry by adjusting text prompt, seed or switch seg model in `Image Gen Settings`."
)
with gr.Row():
video_output = gr.Video(
diff --git a/apps/texture_edit.py b/apps/texture_edit.py
index ca7d1d4..e505082 100644
--- a/apps/texture_edit.py
+++ b/apps/texture_edit.py
@@ -50,6 +50,8 @@ def active_btn_by_content(mesh_content: gr.Model3D, text_content: gr.Textbox):
with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
+ gr.HTML(image_css, visible=False)
+ gr.HTML(lighting_css, visible=False)
gr.Markdown(
"""
## ***EmbodiedGen***: Texture Generation
@@ -64,30 +66,33 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
-
+
🎨 Generate visually rich textures for 3D mesh.
-
""".format(
VERSION=VERSION
),
elem_classes=["header"],
)
- gr.HTML(image_css)
- gr.HTML(lighting_css)
+
with gr.Row():
with gr.Column(scale=1):
+ gr.Markdown(
+ "You can select input in `Mesh Gallery` at page bottom."
+ )
mesh_input = gr.Model3D(
- label="Upload Mesh File(.obj or .glb)", height=300
+ label="Upload Mesh File(.obj or .glb)", height=270
)
local_mesh = gr.Textbox(visible=False)
text_prompt = gr.Textbox(
label="Text Prompt (Chinese or English)",
placeholder="Input text prompt here",
)
+ gr.Markdown("<br>")
+
ip_image = gr.Image(
label="Reference Image(optional)",
format="png",
@@ -97,8 +102,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
elem_classes=["image_fit"],
)
gr.Markdown(
- "Note: The `reference image` is optional. If provided, please "
- "increase the `Condition Scale` in Generation Settings."
+ "Note: The `reference image` is optional. If provided, "
+ "increase `Condition Scale` in Generation Settings."
)
with gr.Accordion(label="Generation Settings", open=False):
@@ -139,12 +144,6 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
512, 2048, label="Video Resolution", value=512, step=256
)
- generate_mv_btn = gr.Button(
- "🎨 1. Generate MV Images(~1min)",
- variant="primary",
- interactive=False,
- )
-
with gr.Column(scale=3):
with gr.Row():
image_sample1 = gr.Image(
@@ -194,10 +193,10 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
visible=False,
)
- gr.Markdown(
- "Note: Select samples with consistent textures from various "
- "perspectives and no obvious reflections."
- )
+ # gr.Markdown(
+ # "Note: Select samples with consistent textures from various "
+ # "perspectives and no obvious reflections."
+ # )
with gr.Row():
with gr.Column(scale=1):
with gr.Row():
@@ -222,6 +221,11 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
)
with gr.Column(scale=1):
+ generate_mv_btn = gr.Button(
+ "🎨 1. Generate MV Images(~1min)",
+ variant="primary",
+ interactive=False,
+ )
texture_bake_btn = gr.Button(
"🛠️ 2. Texture Baking(~2min)",
variant="primary",
@@ -237,7 +241,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
mesh_output = gr.Model3D(
label="Mesh Edit Result",
clear_color=[0.8, 0.8, 0.8, 1],
- height=380,
+ height=340,
interactive=False,
elem_id="lighter_mesh",
)
@@ -246,7 +250,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
label="Mesh Edit Video",
autoplay=True,
loop=True,
- height=380,
+ height=340,
)
with gr.Row():
diff --git a/embodied_gen/models/text_model.py b/embodied_gen/models/text_model.py
index 6379b16..7ea8c4c 100644
--- a/embodied_gen/models/text_model.py
+++ b/embodied_gen/models/text_model.py
@@ -53,6 +53,9 @@ __all__ = [
]
+PROMPT_APPEND = "Full view of one {}, no cropping, centered, no occlusion, isolated product photo, matte, 3D style, on a plain clean surface"
+
+
def download_kolors_weights(local_dir: str = "weights/Kolors") -> None:
logger.info(f"Download kolors weights from huggingface...")
os.makedirs(local_dir, exist_ok=True)
@@ -179,8 +182,9 @@ def text2img_gen(
ip_image_size: int = 512,
seed: int = None,
) -> list[Image.Image]:
- prompt = "Single " + prompt + ", in the center of the image"
- prompt += ", high quality, high resolution, best quality, white background, 3D style" # noqa
+ # prompt = "Single " + prompt + ", in the center of the image"
+ # prompt += ", high quality, high resolution, best quality, white background, 3D style" # noqa
+ prompt = PROMPT_APPEND.format(prompt.strip())
logger.info(f"Processing prompt: {prompt}")
generator = None
diff --git a/embodied_gen/validators/urdf_convertor.py b/embodied_gen/validators/urdf_convertor.py
index a830519..076c01a 100644
--- a/embodied_gen/validators/urdf_convertor.py
+++ b/embodied_gen/validators/urdf_convertor.py
@@ -102,6 +102,7 @@ class URDFGenerator(object):
view_desc
+ """of the 3D object asset,
category: {category}.
+ You are an expert in 3D object analysis and physical property estimation.
Give the category of this object asset (within 3 words),
(if category is already provided, use it directly),
accurately describe this 3D object asset (within 15 words),
@@ -109,9 +110,19 @@ class URDFGenerator(object):
weight range (unit: kilogram), the average static friction
coefficient of the object relative to rubber and the average
dynamic friction coefficient of the object relative to rubber.
- Return response format as shown in Example.
+ Return response format as shown in Output Example.
- Example:
+ IMPORTANT:
+ Input images are orthographic projections showing the front, left, right and back views,
+ the first image is always the front view. Use the object's pose and orientation in the
+ rendered images to estimate its **true vertical height as it appears in the image**,
+ not the real-world length or width of the object.
+ For example:
+ - A pen standing upright in the front view → vertical height: 0.15-0.2 m
+ - A pen lying horizontally in the front view → vertical height: 0.01-0.02 m
+ (based on its thickness in the image)
+
+ Output Example:
Category: cup
Description: shiny golden cup with floral design
Height: 0.1-0.15 m
diff --git a/requirements.txt b/requirements.txt
index 8310d08..79e459a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,7 +30,7 @@ realesrgan==0.3.0
pydantic==2.9.2
vtk==9.3.1
spaces
-utils3d@git+https://github.com/EasternJournalist/utils3d.git@9a4eb15e4021b67b12c460c7057d642626897ec8
+utils3d@git+https://github.com/EasternJournalist/utils3d.git#egg=9a4eb15
clip@git+https://github.com/openai/CLIP.git
kolors@git+https://github.com/Kwai-Kolors/Kolors.git#egg=038818d
segment-anything@git+https://github.com/facebookresearch/segment-anything.git#egg=dca509f