163 lines
5.9 KiB
Python
163 lines
5.9 KiB
Python
# Project EmbodiedGen
|
|
#
|
|
# Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
# implied. See the License for the specific language governing
|
|
# permissions and limitations under the License.
|
|
|
|
|
|
import os
from typing import Optional

import torch
from diffusers import AutoencoderKL, DiffusionPipeline, EulerDiscreteScheduler
from huggingface_hub import snapshot_download
from kolors.models.controlnet import ControlNetModel
from kolors.models.modeling_chatglm import ChatGLMModel
from kolors.models.tokenization_chatglm import ChatGLMTokenizer
from kolors.models.unet_2d_condition import UNet2DConditionModel
from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import (
    StableDiffusionXLControlNetImg2ImgPipeline,
)
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

from embodied_gen.models.text_model import download_kolors_weights
from embodied_gen.utils.log import logger
|
|
|
|
# Public API of this module: a single pipeline factory.
__all__ = [
    "build_texture_gen_pipe",
]
|
|
|
|
|
|
def build_texture_gen_pipe(
    base_ckpt_dir: str,
    controlnet_ckpt: Optional[str] = None,
    ip_adapt_scale: float = 0,
    device: str = "cuda",
) -> DiffusionPipeline:
    """Build and initialize the Kolors + ControlNet (optional IP-Adapter) texture generation pipeline.

    Loads Kolors tokenizer, text encoder (ChatGLM), VAE, UNet, scheduler and (optionally)
    a ControlNet checkpoint plus IP-Adapter vision encoder. If ``controlnet_ckpt`` is
    not provided, the default multi-view texture ControlNet weights are downloaded
    automatically from the hub. When ``ip_adapt_scale > 0`` an IP-Adapter vision
    encoder and its weights are also loaded and activated.

    Args:
        base_ckpt_dir (str):
            Root directory where Kolors (and optionally Kolors-IP-Adapter-Plus) weights
            are or will be stored. Required subfolders: ``Kolors/{text_encoder,vae,unet,scheduler}``.
        controlnet_ckpt (str, optional):
            Directory containing a ControlNet checkpoint (safetensors). If ``None``,
            downloads the default ``texture_gen_mv_v1`` snapshot.
        ip_adapt_scale (float, optional):
            Strength (>=0) of IP-Adapter conditioning. Set >0 to enable IP-Adapter;
            typical values: 0.4-0.8. Default: 0 (disabled).
        device (str, optional):
            Accelerator used as the execution device for model CPU offload
            (e.g. ``"cuda"``, ``"cuda:0"``). Default: ``"cuda"``.

    Returns:
        DiffusionPipeline: A configured
        ``StableDiffusionXLControlNetImg2ImgPipeline`` ready for multi-view texture
        generation (with optional IP-Adapter support).

    Example:
        Initialize pipeline with IP-Adapter enabled.
        ```python
        from embodied_gen.models.texture_model import build_texture_gen_pipe

        ip_adapt_scale = 0.7
        PIPELINE = build_texture_gen_pipe(
            base_ckpt_dir="./weights",
            ip_adapt_scale=ip_adapt_scale,
            device="cuda",
        )
        PIPELINE.set_ip_adapter_scale([ip_adapt_scale])
        ```
        Initialize pipeline without IP-Adapter.
        ```python
        from embodied_gen.models.texture_model import build_texture_gen_pipe

        PIPELINE = build_texture_gen_pipe(
            base_ckpt_dir="./weights",
            ip_adapt_scale=0,
            device="cuda",
        )
        ```
    """
    download_kolors_weights(f"{base_ckpt_dir}/Kolors")

    logger.info("Load Kolors weights...")
    # All sub-models are loaded in fp16 to halve memory footprint.
    tokenizer = ChatGLMTokenizer.from_pretrained(
        f"{base_ckpt_dir}/Kolors/text_encoder"
    )
    text_encoder = ChatGLMModel.from_pretrained(
        f"{base_ckpt_dir}/Kolors/text_encoder", torch_dtype=torch.float16
    ).half()
    vae = AutoencoderKL.from_pretrained(
        f"{base_ckpt_dir}/Kolors/vae", revision=None
    ).half()
    unet = UNet2DConditionModel.from_pretrained(
        f"{base_ckpt_dir}/Kolors/unet", revision=None
    ).half()
    scheduler = EulerDiscreteScheduler.from_pretrained(
        f"{base_ckpt_dir}/Kolors/scheduler"
    )

    if controlnet_ckpt is None:
        # Default multi-view texture ControlNet snapshot from the hub.
        suffix = "texture_gen_mv_v1"  # "geo_cond_mv"
        model_path = snapshot_download(
            repo_id="xinjjj/RoboAssetGen", allow_patterns=f"{suffix}/*"
        )
        controlnet_ckpt = os.path.join(model_path, suffix)

    controlnet = ControlNetModel.from_pretrained(
        controlnet_ckpt, use_safetensors=True
    ).half()

    # IP-Adapter model (only loaded when conditioning is enabled).
    image_encoder = None
    clip_image_processor = None
    if ip_adapt_scale > 0:
        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
            f"{base_ckpt_dir}/Kolors-IP-Adapter-Plus/image_encoder",
            # ignore_mismatched_sizes=True,
        ).to(dtype=torch.float16)
        # Kolors IP-Adapter-Plus was trained on 336x336 crops.
        ip_img_size = 336
        clip_image_processor = CLIPImageProcessor(
            size=ip_img_size, crop_size=ip_img_size
        )

    pipe = StableDiffusionXLControlNetImg2ImgPipeline(
        vae=vae,
        controlnet=controlnet,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
        image_encoder=image_encoder,
        feature_extractor=clip_image_processor,
        force_zeros_for_empty_prompt=False,
    )

    if ip_adapt_scale > 0:
        # Kolors UNet stores the projector under `encoder_hid_proj`; diffusers'
        # IP-Adapter loader expects it under `text_encoder_hid_proj`.
        if hasattr(pipe.unet, "encoder_hid_proj"):
            pipe.unet.text_encoder_hid_proj = pipe.unet.encoder_hid_proj
        pipe.load_ip_adapter(
            f"{base_ckpt_dir}/Kolors-IP-Adapter-Plus",
            subfolder="",
            weight_name=["ip_adapter_plus_general.bin"],
        )
        pipe.set_ip_adapter_scale([ip_adapt_scale])

    # Per diffusers docs, the pipeline must NOT be moved to the accelerator
    # (`pipe.to(device)`) before enabling model CPU offload — doing so negates
    # the memory savings and newer diffusers versions reject it. The target
    # accelerator is passed to the offload hook instead.
    pipe.enable_model_cpu_offload(device=device)

    return pipe
|