feat(urdf): Improve the scale restoration logic to make it more robust. (#17)

Improve the scale restoration logic to make it more robust.
This commit is contained in:
Xinjie 2025-06-27 00:39:42 +08:00 committed by GitHub
parent 52983c8de2
commit e8de0e44df
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 55 additions and 37 deletions

View File

@ -40,6 +40,8 @@ from common import (
)
with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
gr.HTML(image_css, visible=False)
gr.HTML(lighting_css, visible=False)
gr.Markdown(
"""
## ***EmbodiedGen***: Image-to-3D Asset
@ -54,21 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
<a href="https://github.com/HorizonRobotics/EmbodiedGen">
<img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
</a>
<a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
<a href="https://www.youtube.com/watch?v=rG4odybuJRk">
<img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
</a>
</p>
🖼 Generate physically plausible 3D asset from single input image.
""".format(
VERSION=VERSION
),
elem_classes=["header"],
)
gr.HTML(image_css)
gr.HTML(lighting_css)
with gr.Row():
with gr.Column(scale=2):
with gr.Tabs() as input_tabs:
@ -239,9 +238,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
)
gr.Markdown(
""" NOTE: If `Asset Attributes` are provided, the provided
properties will be used; otherwise, the GPT-preset properties
will be applied. \n
""" NOTE: If `Asset Attributes` are provided, it will guide
GPT to perform physical attributes restoration. \n
The `Download URDF` file is restored to the real scale and
has quality inspection, open with an editor to view details.
"""
@ -279,6 +277,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
examples_per_page=10,
)
with gr.Column(scale=1):
gr.Markdown("<br>")
video_output = gr.Video(
label="Generated 3D Asset",
autoplay=True,

View File

@ -40,6 +40,8 @@ from common import (
)
with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
gr.HTML(image_css, visible=False)
gr.HTML(lighting_css, visible=False)
gr.Markdown(
"""
## ***EmbodiedGen***: Text-to-3D Asset
@ -54,20 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
<a href="https://github.com/HorizonRobotics/EmbodiedGen">
<img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
</a>
<a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
<a href="https://www.youtube.com/watch?v=rG4odybuJRk">
<img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
</a>
</p>
📝 Create 3D assets from text descriptions for a wide range of geometry and styles.
""".format(
VERSION=VERSION
),
elem_classes=["header"],
)
gr.HTML(image_css)
gr.HTML(lighting_css)
with gr.Row():
with gr.Column(scale=1):
raw_image_cache = gr.Image(
@ -267,8 +267,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
visible=False,
)
gr.Markdown(
"The generated image may be of poor quality due to auto "
"segmentation. Try adjusting the text prompt or seed."
"Generated image may be of poor quality due to auto seg. "
"Retry by adjusting the text prompt or seed, or by switching the seg model in `Image Gen Settings`."
)
with gr.Row():
video_output = gr.Video(

View File

@ -50,6 +50,8 @@ def active_btn_by_content(mesh_content: gr.Model3D, text_content: gr.Textbox):
with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
gr.HTML(image_css, visible=False)
gr.HTML(lighting_css, visible=False)
gr.Markdown(
"""
## ***EmbodiedGen***: Texture Generation
@ -64,30 +66,33 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
<a href="https://github.com/HorizonRobotics/EmbodiedGen">
<img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
</a>
<a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
<a href="https://www.youtube.com/watch?v=rG4odybuJRk">
<img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
</a>
</p>
🎨 Generate visually rich textures for 3D mesh.
""".format(
VERSION=VERSION
),
elem_classes=["header"],
)
gr.HTML(image_css)
gr.HTML(lighting_css)
with gr.Row():
with gr.Column(scale=1):
gr.Markdown(
"You can select input in `Mesh Gallery` at page bottom."
)
mesh_input = gr.Model3D(
label="Upload Mesh File(.obj or .glb)", height=300
label="Upload Mesh File(.obj or .glb)", height=270
)
local_mesh = gr.Textbox(visible=False)
text_prompt = gr.Textbox(
label="Text Prompt (Chinese or English)",
placeholder="Input text prompt here",
)
gr.Markdown("<br>")
ip_image = gr.Image(
label="Reference Image(optional)",
format="png",
@ -97,8 +102,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
elem_classes=["image_fit"],
)
gr.Markdown(
"Note: The `reference image` is optional. If provided, please "
"increase the `Condition Scale` in Generation Settings."
"Note: The `reference image` is optional. If provided, "
"increase `Condition Scale` in Generation Settings."
)
with gr.Accordion(label="Generation Settings", open=False):
@ -139,12 +144,6 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
512, 2048, label="Video Resolution", value=512, step=256
)
generate_mv_btn = gr.Button(
"🎨 1. Generate MV Images(~1min)",
variant="primary",
interactive=False,
)
with gr.Column(scale=3):
with gr.Row():
image_sample1 = gr.Image(
@ -194,10 +193,10 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
visible=False,
)
gr.Markdown(
"Note: Select samples with consistent textures from various "
"perspectives and no obvious reflections."
)
# gr.Markdown(
# "Note: Select samples with consistent textures from various "
# "perspectives and no obvious reflections."
# )
with gr.Row():
with gr.Column(scale=1):
with gr.Row():
@ -222,6 +221,11 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
)
with gr.Column(scale=1):
generate_mv_btn = gr.Button(
"🎨 1. Generate MV Images(~1min)",
variant="primary",
interactive=False,
)
texture_bake_btn = gr.Button(
"🛠️ 2. Texture Baking(~2min)",
variant="primary",
@ -237,7 +241,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
mesh_output = gr.Model3D(
label="Mesh Edit Result",
clear_color=[0.8, 0.8, 0.8, 1],
height=380,
height=340,
interactive=False,
elem_id="lighter_mesh",
)
@ -246,7 +250,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
label="Mesh Edit Video",
autoplay=True,
loop=True,
height=380,
height=340,
)
with gr.Row():

View File

@ -53,6 +53,9 @@ __all__ = [
]
PROMPT_APPEND = "Full view of one {}, no cropping, centered, no occlusion, isolated product photo, matte, 3D style, on a plain clean surface"
def download_kolors_weights(local_dir: str = "weights/Kolors") -> None:
logger.info(f"Download kolors weights from huggingface...")
os.makedirs(local_dir, exist_ok=True)
@ -179,8 +182,9 @@ def text2img_gen(
ip_image_size: int = 512,
seed: int = None,
) -> list[Image.Image]:
prompt = "Single " + prompt + ", in the center of the image"
prompt += ", high quality, high resolution, best quality, white background, 3D style" # noqa
# prompt = "Single " + prompt + ", in the center of the image"
# prompt += ", high quality, high resolution, best quality, white background, 3D style" # noqa
prompt = PROMPT_APPEND.format(prompt.strip())
logger.info(f"Processing prompt: {prompt}")
generator = None

View File

@ -102,6 +102,7 @@ class URDFGenerator(object):
view_desc
+ """of the 3D object asset,
category: {category}.
You are an expert in 3D object analysis and physical property estimation.
Give the category of this object asset (within 3 words),
(if category is already provided, use it directly),
accurately describe this 3D object asset (within 15 words),
@ -109,9 +110,19 @@ class URDFGenerator(object):
weight range (unit: kilogram), the average static friction
coefficient of the object relative to rubber and the average
dynamic friction coefficient of the object relative to rubber.
Return response format as shown in Example.
Return response format as shown in Output Example.
Example:
IMPORTANT:
Input images are orthographic projections showing the front, left, right and back views;
the first image is always the front view. Use the object's pose and orientation in the
rendered images to estimate its **true vertical height as it appears in the image**,
not the real-world length or width of the object.
For example:
- A pen standing upright in the front view → vertical height: 0.15-0.2 m
- A pen lying horizontally in the front view → vertical height: 0.01-0.02 m
(based on its thickness in the image)
Output Example:
Category: cup
Description: shiny golden cup with floral design
Height: 0.1-0.15 m

View File

@ -30,7 +30,7 @@ realesrgan==0.3.0
pydantic==2.9.2
vtk==9.3.1
spaces
utils3d@git+https://github.com/EasternJournalist/utils3d.git@9a4eb15e4021b67b12c460c7057d642626897ec8
utils3d@git+https://github.com/EasternJournalist/utils3d.git#egg=9a4eb15
clip@git+https://github.com/openai/CLIP.git
kolors@git+https://github.com/Kwai-Kolors/Kolors.git#egg=038818d
segment-anything@git+https://github.com/facebookresearch/segment-anything.git#egg=dca509f