From e8de0e44df344a94fd96f6a016d63f843bf23fe3 Mon Sep 17 00:00:00 2001 From: Xinjie Date: Fri, 27 Jun 2025 00:39:42 +0800 Subject: [PATCH] feat(urdf): Improve the scale restoration logic to make it more robust.(#17) Improve the scale restoration logic to make it more robust. --- apps/image_to_3d.py | 13 ++++--- apps/text_to_3d.py | 12 +++---- apps/texture_edit.py | 42 +++++++++++++---------- embodied_gen/models/text_model.py | 8 +++-- embodied_gen/validators/urdf_convertor.py | 15 ++++++-- requirements.txt | 2 +- 6 files changed, 55 insertions(+), 37 deletions(-) diff --git a/apps/image_to_3d.py b/apps/image_to_3d.py index 039b953..752d031 100644 --- a/apps/image_to_3d.py +++ b/apps/image_to_3d.py @@ -40,6 +40,8 @@ from common import ( ) with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: + gr.HTML(image_css, visible=False) + gr.HTML(lighting_css, visible=False) gr.Markdown( """ ## ***EmbodiedGen***: Image-to-3D Asset @@ -54,21 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: 💻 GitHub - + 🎥 Video

🖼️ Generate physically plausible 3D asset from single input image. - """.format( VERSION=VERSION ), elem_classes=["header"], ) - gr.HTML(image_css) - gr.HTML(lighting_css) with gr.Row(): with gr.Column(scale=2): with gr.Tabs() as input_tabs: @@ -239,9 +238,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: ) gr.Markdown( - """ NOTE: If `Asset Attributes` are provided, the provided - properties will be used; otherwise, the GPT-preset properties - will be applied. \n + """ NOTE: If `Asset Attributes` are provided, it will guide + GPT to perform physical attributes restoration. \n The `Download URDF` file is restored to the real scale and has quality inspection, open with an editor to view details. """ @@ -279,6 +277,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: examples_per_page=10, ) with gr.Column(scale=1): + gr.Markdown("
") video_output = gr.Video( label="Generated 3D Asset", autoplay=True, diff --git a/apps/text_to_3d.py b/apps/text_to_3d.py index 21388f5..7bf8380 100644 --- a/apps/text_to_3d.py +++ b/apps/text_to_3d.py @@ -40,6 +40,8 @@ from common import ( ) with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: + gr.HTML(image_css, visible=False) + gr.HTML(lighting_css, visible=False) gr.Markdown( """ ## ***EmbodiedGen***: Text-to-3D Asset @@ -54,20 +56,18 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: 💻 GitHub - + 🎥 Video

📝 Create 3D assets from text descriptions for a wide range of geometry and styles. - """.format( VERSION=VERSION ), elem_classes=["header"], ) - gr.HTML(image_css) - gr.HTML(lighting_css) + with gr.Row(): with gr.Column(scale=1): raw_image_cache = gr.Image( @@ -267,8 +267,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: visible=False, ) gr.Markdown( - "The generated image may be of poor quality due to auto " - "segmentation. Try adjusting the text prompt or seed." + "Generated image may be poor quality due to auto seg." + "Retry by adjusting text prompt, seed or switch seg model in `Image Gen Settings`." ) with gr.Row(): video_output = gr.Video( diff --git a/apps/texture_edit.py b/apps/texture_edit.py index ca7d1d4..e505082 100644 --- a/apps/texture_edit.py +++ b/apps/texture_edit.py @@ -50,6 +50,8 @@ def active_btn_by_content(mesh_content: gr.Model3D, text_content: gr.Textbox): with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: + gr.HTML(image_css, visible=False) + gr.HTML(lighting_css, visible=False) gr.Markdown( """ ## ***EmbodiedGen***: Texture Generation @@ -64,30 +66,33 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: 💻 GitHub - + 🎥 Video

🎨 Generate visually rich textures for 3D mesh. - """.format( VERSION=VERSION ), elem_classes=["header"], ) - gr.HTML(image_css) - gr.HTML(lighting_css) + with gr.Row(): with gr.Column(scale=1): + gr.Markdown( + "You can select input in `Mesh Gallery` at page bottom." + ) mesh_input = gr.Model3D( - label="Upload Mesh File(.obj or .glb)", height=300 + label="Upload Mesh File(.obj or .glb)", height=270 ) local_mesh = gr.Textbox(visible=False) text_prompt = gr.Textbox( label="Text Prompt (Chinese or English)", placeholder="Input text prompt here", ) + gr.Markdown("
") + ip_image = gr.Image( label="Reference Image(optional)", format="png", @@ -97,8 +102,8 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: elem_classes=["image_fit"], ) gr.Markdown( - "Note: The `reference image` is optional. If provided, please " - "increase the `Condition Scale` in Generation Settings." + "Note: The `reference image` is optional. If provided, " + "increase `Condition Scale` in Generation Settings." ) with gr.Accordion(label="Generation Settings", open=False): @@ -139,12 +144,6 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: 512, 2048, label="Video Resolution", value=512, step=256 ) - generate_mv_btn = gr.Button( - "🎨 1. Generate MV Images(~1min)", - variant="primary", - interactive=False, - ) - with gr.Column(scale=3): with gr.Row(): image_sample1 = gr.Image( @@ -194,10 +193,10 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: visible=False, ) - gr.Markdown( - "Note: Select samples with consistent textures from various " - "perspectives and no obvious reflections." - ) + # gr.Markdown( + # "Note: Select samples with consistent textures from various " + # "perspectives and no obvious reflections." + # ) with gr.Row(): with gr.Column(scale=1): with gr.Row(): @@ -222,6 +221,11 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: ) with gr.Column(scale=1): + generate_mv_btn = gr.Button( + "🎨 1. Generate MV Images(~1min)", + variant="primary", + interactive=False, + ) texture_bake_btn = gr.Button( "🛠️ 2. 
Texture Baking(~2min)", variant="primary", @@ -237,7 +241,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: mesh_output = gr.Model3D( label="Mesh Edit Result", clear_color=[0.8, 0.8, 0.8, 1], - height=380, + height=340, interactive=False, elem_id="lighter_mesh", ) @@ -246,7 +250,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo: label="Mesh Edit Video", autoplay=True, loop=True, - height=380, + height=340, ) with gr.Row(): diff --git a/embodied_gen/models/text_model.py b/embodied_gen/models/text_model.py index 6379b16..7ea8c4c 100644 --- a/embodied_gen/models/text_model.py +++ b/embodied_gen/models/text_model.py @@ -53,6 +53,9 @@ __all__ = [ ] +PROMPT_APPEND = "Full view of one {}, no cropping, centered, no occlusion, isolated product photo, matte, 3D style, on a plain clean surface" + + def download_kolors_weights(local_dir: str = "weights/Kolors") -> None: logger.info(f"Download kolors weights from huggingface...") os.makedirs(local_dir, exist_ok=True) @@ -179,8 +182,9 @@ def text2img_gen( ip_image_size: int = 512, seed: int = None, ) -> list[Image.Image]: - prompt = "Single " + prompt + ", in the center of the image" - prompt += ", high quality, high resolution, best quality, white background, 3D style" # noqa + # prompt = "Single " + prompt + ", in the center of the image" + # prompt += ", high quality, high resolution, best quality, white background, 3D style" # noqa + prompt = PROMPT_APPEND.format(prompt.strip()) logger.info(f"Processing prompt: {prompt}") generator = None diff --git a/embodied_gen/validators/urdf_convertor.py b/embodied_gen/validators/urdf_convertor.py index a830519..076c01a 100644 --- a/embodied_gen/validators/urdf_convertor.py +++ b/embodied_gen/validators/urdf_convertor.py @@ -102,6 +102,7 @@ class URDFGenerator(object): view_desc + """of the 3D object asset, category: {category}. + You are an expert in 3D object analysis and physical property estimation. 
Give the category of this object asset (within 3 words), (if category is already provided, use it directly), accurately describe this 3D object asset (within 15 words), @@ -109,9 +110,19 @@ class URDFGenerator(object): weight range (unit: kilogram), the average static friction coefficient of the object relative to rubber and the average dynamic friction coefficient of the object relative to rubber. - Return response format as shown in Example. + Return response format as shown in Output Example. - Example: + IMPORTANT: + Input images are orthographic projections showing the front, left, right and back views, + the first image is always the front view. Use the object's pose and orientation in the + rendered images to estimate its **true vertical height as it appears in the image**, + not the real-world length or width of the object. + For example: + - A pen standing upright in the front view → vertical height: 0.15-0.2 m + - A pen lying horizontally in the front view → vertical height: 0.01-0.02 m + (based on its thickness in the image) + + Output Example: Category: cup Description: shiny golden cup with floral design Height: 0.1-0.15 m diff --git a/requirements.txt b/requirements.txt index 8310d08..79e459a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,7 +30,7 @@ realesrgan==0.3.0 pydantic==2.9.2 vtk==9.3.1 spaces -utils3d@git+https://github.com/EasternJournalist/utils3d.git@9a4eb15e4021b67b12c460c7057d642626897ec8 +utils3d@git+https://github.com/EasternJournalist/utils3d.git@9a4eb15 clip@git+https://github.com/openai/CLIP.git kolors@git+https://github.com/Kwai-Kolors/Kolors.git#egg=038818d segment-anything@git+https://github.com/facebookresearch/segment-anything.git#egg=dca509f