Combining community pipelines for image generation
Describe the bug
I cannot use both the Stable Diffusion XL reference and InstantID community pipelines together. I get 'FrozenDict' object has no attribute 'block_out_channels'.
Reproduction
import torch
from diffusers import ControlNetModel, UniPCMultistepScheduler
from stable_diffusion_xl_reference import StableDiffusionXLReferencePipeline
from pipeline_stable_diffusion_xl_instantid import StableDiffusionXLInstantIDPipeline, draw_kps

controlnet_path = 'path/to/instant/id'
# load IdentityNet
identityNet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)

pipe = StableDiffusionXLReferencePipeline.from_pretrained(
    "../path/to/model",
    torch_dtype=torch.float16,
    #use_safetensors=True,
    variant="fp16").to('cuda')
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe_instant = StableDiffusionXLInstantIDPipeline(
    pipe,
    #vae = pipe.vae,  # I tried both without and with the VAE
    text_encoder=pipe.text_encoder,
    text_encoder_2=pipe.text_encoder_2,
    tokenizer=pipe.tokenizer,
    tokenizer_2=pipe.tokenizer_2,
    unet=pipe.unet,
    scheduler=pipe.scheduler,
    feature_extractor=pipe.feature_extractor,
    controlnet=[identityNet],
)
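The traceback below points at the likely cause: pipe is passed as the first positional argument, which the pipeline's __init__ binds to vae (see the signature in the traceback), so self.vae.config is the pipeline's own FrozenDict config, which has no block_out_channels. For comparison, a minimal sketch of the same construction with the VAE passed explicitly (assuming the community pipeline accepts the same component names as StableDiffusionXLControlNetPipeline):

pipe_instant = StableDiffusionXLInstantIDPipeline(
    vae=pipe.vae,  # pass the VAE component itself, not the whole pipeline
    text_encoder=pipe.text_encoder,
    text_encoder_2=pipe.text_encoder_2,
    tokenizer=pipe.tokenizer,
    tokenizer_2=pipe.tokenizer_2,
    unet=pipe.unet,
    scheduler=pipe.scheduler,
    feature_extractor=pipe.feature_extractor,
    controlnet=[identityNet],
)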
Logs
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[3], line 28
     20 pipe = StableDiffusionXLReferencePipeline.from_pretrained(
     21     "../models/StableDiffusion/RealvisXLv40_lightning",
     22     torch_dtype=torch.float16,
     23     #use_safetensors=True,
     24     variant="fp16").to('cuda')
     26 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
---> 28 pipe_instant = StableDiffusionXLInstantIDPipeline(
     29     pipe,
     30     #vae = pipe.vae,
     31     text_encoder = pipe.text_encoder,
     32     text_encoder_2 = pipe.text_encoder_2,
     33     tokenizer = pipe.tokenizer,
     34     tokenizer_2 = pipe.tokenizer_2,
     35     unet = pipe.unet,
     36     scheduler = pipe.scheduler,
     37     #safety_checker = pipe.safety_checker,
     38     feature_extractor = pipe.feature_extractor,
     39     controlnet= [identityNet],
     40     #torch_dtype=torch.float16
     41 )
     44 """
     45 result_img = pipe_instant(ref_image=input_image,
     46     prompt="1girl",
   (...)
     55 result_img.show()
     56 """

File e:\conda\envs\rayban\lib\site-packages\diffusers\pipelines\controlnet\pipeline_controlnet_sd_xl.py:211, in StableDiffusionXLControlNetPipeline.__init__(self, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2, unet, controlnet, scheduler, force_zeros_for_empty_prompt, add_watermarker, feature_extractor, image_encoder)
    197 controlnet = MultiControlNetModel(controlnet)
    199 self.register_modules(
    200     vae=vae,
    201     text_encoder=text_encoder,
   (...)
    209     image_encoder=image_encoder,
    210 )
--> 211 self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    212 self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
    213 self.control_image_processor = VaeImageProcessor(
    214     vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
    215 )

AttributeError: 'FrozenDict' object has no attribute 'block_out_channels'
System Info
- diffusers version: 0.25.0
- Platform: Windows-10-10.0.19045-SP0
- Python version: 3.10.14
- PyTorch version (GPU?): 2.2.2 (True)
- Huggingface_hub version: 0.22.2
- Transformers version: 4.36.2
- Accelerate version: 0.29.2
- xFormers version: not installed
- Using GPU in script?: yes
- Using distributed or parallel set-up in script?: no
Who can help?
@yiyixuxu @sayakpaul @DN6 @stevhliu
I installed diffusers from source and this seems to work:
import torch
from diffusers import DiffusionPipeline, ControlNetModel
from diffusers import UniPCMultistepScheduler

controlnet_path = 'path/to/instant/id'
# load IdentityNet
identityNet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    custom_pipeline="stable_diffusion_xl_reference",
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16").to('cuda')
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe_instant = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    custom_pipeline="pipeline_stable_diffusion_xl_instantid",
    #vae = pipe.vae,  # I tried both without and with the VAE
    text_encoder=pipe.text_encoder,
    text_encoder_2=pipe.text_encoder_2,
    tokenizer=pipe.tokenizer,
    tokenizer_2=pipe.tokenizer_2,
    unet=pipe.unet,
    scheduler=pipe.scheduler,
    feature_extractor=pipe.feature_extractor,
    controlnet=identityNet,
)
You can create your InstantID pipeline from the SDXL reference pipeline with this script:
import torch
from diffusers import DiffusionPipeline, ControlNetModel
from diffusers import UniPCMultistepScheduler
# load IdentityNet
identityNet = ControlNetModel.from_pretrained("InstantX/InstantID", subfolder ="ControlNetModel", torch_dtype=torch.float16)
pipe = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
custom_pipeline="stable_diffusion_xl_reference",
torch_dtype=torch.float16,
use_safetensors=True,
variant="fp16").to('cuda')
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe_instant = DiffusionPipeline.from_pipe(
pipe,
custom_pipeline="pipeline_stable_diffusion_xl_instantid",
controlnet = identityNet,
)
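Since from_pipe reuses the components of pipe rather than reloading them, the two pipelines should share the same module objects; a quick sanity-check sketch:

# the reference pipeline and the InstantID pipeline share components
assert pipe_instant.unet is pipe.unet
assert pipe_instant.text_encoder is pipe.text_encoder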
@standardAI I did install diffusers from source, but then I had to downgrade due to an error with InstantID. When I generate using both the reference and InstantID arguments, and then with only the InstantID arguments, the reference image does not impact the generation.
image_plus_ref = pipe_instant(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=4,
    guidance_scale=1.2,
    image_proj_model_in_features=face_emb,
    image_embeds=face_emb,
    image=face_kps,
    controlnet_conditioning_scale=0.8,
    seed=42,
    reference_attn=True,
    reference_adain=True,
    ref_image=ref_image,
).images[0]
And
image = pipe_instant(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=4,
    guidance_scale=1.2,
    image_proj_model_in_features=face_emb,
    image_embeds=face_emb,
    image=face_kps,
    controlnet_conditioning_scale=0.8,
    seed=42,
).images[0]
(Note that I use a Lightning model.)
@yiyixuxu since I'm on diffusers==0.26.3, the from_pipe method does not seem to work.
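That fits with from_pipe only landing in a later diffusers release. A minimal sketch (my assumption: upgrading past 0.26.3 makes it available) to check whether the installed version exposes it:

import diffusers
from diffusers import DiffusionPipeline

print(diffusers.__version__)
# from_pipe does not exist on older releases; verify before using it
print(hasattr(DiffusionPipeline, "from_pipe"))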
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Please note that issues that do not follow the contributing guidelines are likely to be ignored.