163 lines
5.9 KiB
Python
163 lines
5.9 KiB
Python
# Project EmbodiedGen
|
|
#
|
|
# Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
# implied. See the License for the specific language governing
|
|
# permissions and limitations under the License.
|
|
|
|
|
|
import os
from typing import Optional

import torch
from diffusers import AutoencoderKL, DiffusionPipeline, EulerDiscreteScheduler
from huggingface_hub import snapshot_download
from kolors.models.controlnet import ControlNetModel
from kolors.models.modeling_chatglm import ChatGLMModel
from kolors.models.tokenization_chatglm import ChatGLMTokenizer
from kolors.models.unet_2d_condition import UNet2DConditionModel
from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import (
    StableDiffusionXLControlNetImg2ImgPipeline,
)
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

from embodied_gen.models.text_model import download_kolors_weights
from embodied_gen.utils.log import logger
|
|
|
|
# Public API of this module: a single pipeline factory.
__all__ = [
    "build_texture_gen_pipe",
]
|
|
|
|
|
|
def build_texture_gen_pipe(
    base_ckpt_dir: str,
    controlnet_ckpt: Optional[str] = None,
    ip_adapt_scale: float = 0,
    device: str = "cuda",
) -> DiffusionPipeline:
    """Build and initialize the Kolors + ControlNet (optional IP-Adapter) texture generation pipeline.

    Loads Kolors tokenizer, text encoder (ChatGLM), VAE, UNet, scheduler and (optionally)
    a ControlNet checkpoint plus IP-Adapter vision encoder. If ``controlnet_ckpt`` is
    not provided, the default multi-view texture ControlNet weights are downloaded
    automatically from the hub. When ``ip_adapt_scale > 0`` an IP-Adapter vision
    encoder and its weights are also loaded and activated.

    Args:
        base_ckpt_dir (str):
            Root directory where Kolors (and optionally Kolors-IP-Adapter-Plus) weights
            are or will be stored. Required subfolders: ``Kolors/{text_encoder,vae,unet,scheduler}``.
        controlnet_ckpt (str, optional):
            Directory containing a ControlNet checkpoint (safetensors). If ``None``,
            downloads the default ``texture_gen_mv_v1`` snapshot.
        ip_adapt_scale (float, optional):
            Strength (>=0) of IP-Adapter conditioning. Set >0 to enable IP-Adapter;
            typical values: 0.4-0.8. Default: 0 (disabled).
        device (str, optional):
            Accelerator used as the execution device for model CPU offload
            (e.g. ``"cuda"``, ``"cuda:0"``). Default: ``"cuda"``.

    Returns:
        DiffusionPipeline: A configured
        ``StableDiffusionXLControlNetImg2ImgPipeline`` ready for multi-view texture
        generation (with optional IP-Adapter support).

    Example:
        Initialize pipeline with IP-Adapter enabled.
        ```python
        from embodied_gen.models.texture_model import build_texture_gen_pipe

        ip_adapt_scale = 0.7
        PIPELINE = build_texture_gen_pipe(
            base_ckpt_dir="./weights",
            ip_adapt_scale=ip_adapt_scale,
            device="cuda",
        )
        PIPELINE.set_ip_adapter_scale([ip_adapt_scale])
        ```
        Initialize pipeline without IP-Adapter.
        ```python
        from embodied_gen.models.texture_model import build_texture_gen_pipe

        PIPELINE = build_texture_gen_pipe(
            base_ckpt_dir="./weights",
            ip_adapt_scale=0,
            device="cuda",
        )
        ```
    """
    download_kolors_weights(f"{base_ckpt_dir}/Kolors")

    logger.info("Load Kolors weights...")
    # All sub-models are loaded in fp16 to halve memory footprint.
    tokenizer = ChatGLMTokenizer.from_pretrained(
        f"{base_ckpt_dir}/Kolors/text_encoder"
    )
    text_encoder = ChatGLMModel.from_pretrained(
        f"{base_ckpt_dir}/Kolors/text_encoder", torch_dtype=torch.float16
    ).half()
    vae = AutoencoderKL.from_pretrained(
        f"{base_ckpt_dir}/Kolors/vae", revision=None
    ).half()
    unet = UNet2DConditionModel.from_pretrained(
        f"{base_ckpt_dir}/Kolors/unet", revision=None
    ).half()
    scheduler = EulerDiscreteScheduler.from_pretrained(
        f"{base_ckpt_dir}/Kolors/scheduler"
    )

    if controlnet_ckpt is None:
        # Default multi-view texture ControlNet snapshot from the hub.
        suffix = "texture_gen_mv_v1"  # "geo_cond_mv"
        model_path = snapshot_download(
            repo_id="xinjjj/RoboAssetGen", allow_patterns=f"{suffix}/*"
        )
        controlnet_ckpt = os.path.join(model_path, suffix)

    controlnet = ControlNetModel.from_pretrained(
        controlnet_ckpt, use_safetensors=True
    ).half()

    # IP-Adapter model (only loaded when conditioning is enabled).
    image_encoder = None
    clip_image_processor = None
    if ip_adapt_scale > 0:
        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
            f"{base_ckpt_dir}/Kolors-IP-Adapter-Plus/image_encoder",
            # ignore_mismatched_sizes=True,
        ).to(dtype=torch.float16)
        # Kolors IP-Adapter-Plus was trained on 336x336 crops.
        ip_img_size = 336
        clip_image_processor = CLIPImageProcessor(
            size=ip_img_size, crop_size=ip_img_size
        )

    pipe = StableDiffusionXLControlNetImg2ImgPipeline(
        vae=vae,
        controlnet=controlnet,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
        image_encoder=image_encoder,
        feature_extractor=clip_image_processor,
        force_zeros_for_empty_prompt=False,
    )

    if ip_adapt_scale > 0:
        # Kolors UNet stores the projector under `encoder_hid_proj`; diffusers'
        # IP-Adapter loader expects it under `text_encoder_hid_proj`.
        if hasattr(pipe.unet, "encoder_hid_proj"):
            pipe.unet.text_encoder_hid_proj = pipe.unet.encoder_hid_proj
        pipe.load_ip_adapter(
            f"{base_ckpt_dir}/Kolors-IP-Adapter-Plus",
            subfolder="",
            weight_name=["ip_adapter_plus_general.bin"],
        )
        pipe.set_ip_adapter_scale([ip_adapt_scale])

    # Per diffusers docs, the pipeline must NOT be moved to the accelerator
    # (`pipe.to(device)`) before enabling model CPU offload — doing so negates
    # the memory savings and newer diffusers versions reject it. The target
    # accelerator is passed to the offload hook instead.
    pipe.enable_model_cpu_offload(device=device)

    return pipe
|