Combining community pipelines for image generation
Describe the bug
I cannot use both the Stable Diffusion XL reference and InstantID community pipelines together. I get 'FrozenDict' object has no attribute 'block_out_channels'.
Reproduction
import torch
from diffusers import ControlNetModel, UniPCMultistepScheduler
from stable_diffusion_xl_reference import StableDiffusionXLReferencePipeline
from pipeline_stable_diffusion_xl_instantid import StableDiffusionXLInstantIDPipeline, draw_kps

controlnet_path = 'path/to/instant/id'
# load IdentityNet
identityNet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)

pipe = StableDiffusionXLReferencePipeline.from_pretrained(
    "../path/to/model",
    torch_dtype=torch.float16,
    #use_safetensors=True,
    variant="fp16").to('cuda')
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe_instant = StableDiffusionXLInstantIDPipeline(
    pipe,
    #vae = pipe.vae,  # I tried both without and with the VAE
    text_encoder=pipe.text_encoder,
    text_encoder_2=pipe.text_encoder_2,
    tokenizer=pipe.tokenizer,
    tokenizer_2=pipe.tokenizer_2,
    unet=pipe.unet,
    scheduler=pipe.scheduler,
    feature_extractor=pipe.feature_extractor,
    controlnet=[identityNet],
)
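The traceback below points at the likely cause: pipe is passed as the first positional argument, which the pipeline's __init__ binds to vae (see the signature in the traceback), so self.vae.config is the pipeline's own FrozenDict config, which has no block_out_channels. For comparison, a minimal sketch of the same construction with the VAE passed explicitly (assuming the community pipeline accepts the same component names as StableDiffusionXLControlNetPipeline):

pipe_instant = StableDiffusionXLInstantIDPipeline(
    vae=pipe.vae,  # pass the VAE component itself, not the whole pipeline
    text_encoder=pipe.text_encoder,
    text_encoder_2=pipe.text_encoder_2,
    tokenizer=pipe.tokenizer,
    tokenizer_2=pipe.tokenizer_2,
    unet=pipe.unet,
    scheduler=pipe.scheduler,
    feature_extractor=pipe.feature_extractor,
    controlnet=[identityNet],
)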
Logs
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[3], line 28
     20 pipe = StableDiffusionXLReferencePipeline.from_pretrained(
     21     "../models/StableDiffusion/RealvisXLv40_lightning",
     22     torch_dtype=torch.float16,
     23     #use_safetensors=True,
     24     variant="fp16").to('cuda')
     26 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
---> 28 pipe_instant = StableDiffusionXLInstantIDPipeline(
     29     pipe,
     30     #vae = pipe.vae,
     31     text_encoder = pipe.text_encoder,
     32     text_encoder_2 = pipe.text_encoder_2,
     33     tokenizer = pipe.tokenizer,
     34     tokenizer_2 = pipe.tokenizer_2,
     35     unet = pipe.unet,
     36     scheduler = pipe.scheduler,
     37     #safety_checker = pipe.safety_checker,
     38     feature_extractor = pipe.feature_extractor,
     39     controlnet= [identityNet],
     40     #torch_dtype=torch.float16
     41 )
     44 """
     45 result_img = pipe_instant(ref_image=input_image,
     46     prompt="1girl",
   (...)
     55 result_img.show()
     56 """

File e:\conda\envs\rayban\lib\site-packages\diffusers\pipelines\controlnet\pipeline_controlnet_sd_xl.py:211, in StableDiffusionXLControlNetPipeline.__init__(self, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2, unet, controlnet, scheduler, force_zeros_for_empty_prompt, add_watermarker, feature_extractor, image_encoder)
    197 controlnet = MultiControlNetModel(controlnet)
    199 self.register_modules(
    200     vae=vae,
    201     text_encoder=text_encoder,
   (...)
    209     image_encoder=image_encoder,
    210 )
--> 211 self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    212 self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
    213 self.control_image_processor = VaeImageProcessor(
    214     vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
    215 )

AttributeError: 'FrozenDict' object has no attribute 'block_out_channels'
System Info
- diffusers version: 0.25.0
- Platform: Windows-10-10.0.19045-SP0
- Python version: 3.10.14
- PyTorch version (GPU?): 2.2.2 (True)
- Huggingface_hub version: 0.22.2
- Transformers version: 4.36.2
- Accelerate version: 0.29.2
- xFormers version: not installed
- Using GPU in script?: yes
- Using distributed or parallel set-up in script?: no
Who can help?
@yiyixuxu @sayakpaul @DN6 @stevhliu
I installed diffusers from source and this seems to work:
import torch
from diffusers import DiffusionPipeline, ControlNetModel
from diffusers import UniPCMultistepScheduler

controlnet_path = 'path/to/instant/id'
# load IdentityNet
identityNet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    custom_pipeline="stable_diffusion_xl_reference",
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16").to('cuda')
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe_instant = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    custom_pipeline="pipeline_stable_diffusion_xl_instantid",
    #vae = pipe.vae,  # I tried both without and with the VAE
    text_encoder=pipe.text_encoder,
    text_encoder_2=pipe.text_encoder_2,
    tokenizer=pipe.tokenizer,
    tokenizer_2=pipe.tokenizer_2,
    unet=pipe.unet,
    scheduler=pipe.scheduler,
    feature_extractor=pipe.feature_extractor,
    controlnet=identityNet,
)
You can create your InstantID pipeline from the SDXL reference pipeline with this script:
import torch
from diffusers import DiffusionPipeline, ControlNetModel
from diffusers import UniPCMultistepScheduler
# load IdentityNet
identityNet = ControlNetModel.from_pretrained("InstantX/InstantID", subfolder ="ControlNetModel", torch_dtype=torch.float16)
pipe = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
custom_pipeline="stable_diffusion_xl_reference",
torch_dtype=torch.float16,
use_safetensors=True,
variant="fp16").to('cuda')
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe_instant = DiffusionPipeline.from_pipe(
pipe,
custom_pipeline="pipeline_stable_diffusion_xl_instantid",
controlnet = identityNet,
)
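Since from_pipe reuses the components of pipe rather than reloading them, the two pipelines should share the same module objects; a quick sanity-check sketch:

# the reference pipeline and the InstantID pipeline share components
assert pipe_instant.unet is pipe.unet
assert pipe_instant.text_encoder is pipe.text_encoder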
@standardAI I did install diffusers from source, but then I had to downgrade due to an error with InstantID. When I generate using both the reference and InstantID arguments, and then with only the InstantID arguments, the reference image does not impact the generation.
image_plus_ref = pipe_instant(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=4,
    guidance_scale=1.2,
    image_proj_model_in_features=face_emb,
    image_embeds=face_emb,
    image=face_kps,
    controlnet_conditioning_scale=0.8,
    seed=42,
    reference_attn=True,
    reference_adain=True,
    ref_image=ref_image,
).images[0]
And
image = pipe_instant(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=4,
    guidance_scale=1.2,
    image_proj_model_in_features=face_emb,
    image_embeds=face_emb,
    image=face_kps,
    controlnet_conditioning_scale=0.8,
    seed=42,
).images[0]
(Note that I use a Lightning model.)
@yiyixuxu since I'm on diffusers==0.26.3, the from_pipe method does not seem to work.
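That fits with from_pipe only landing in a later diffusers release. A minimal sketch (my assumption: upgrading past 0.26.3 makes it available) to check whether the installed version exposes it:

import diffusers
from diffusers import DiffusionPipeline

print(diffusers.__version__)
# from_pipe does not exist on older releases; verify before using it
print(hasattr(DiffusionPipeline, "from_pipe"))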
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Please note that issues that do not follow the contributing guidelines are likely to be ignored.