---
This issue has been resolved by using CLIP-ViT-H-14-laion2B-s32B-b79K as the image encoder. However, I have run into a new problem: when I call the model through this diffusers pipeline, the generated images look bad, while the images generated by the ControlNet IP-Adapter in sd-webui (A1111) are much better. I don't understand where the difference between the two lies.

A1111's output, for comparison: [image]
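For context, here is a minimal sketch of how I wire the ViT-H encoder into the pipeline (the base model id and the IP-Adapter weight name are placeholders for the checkpoints actually used; `models/image_encoder` in `h94/IP-Adapter` is a converted copy of CLIP-ViT-H-14-laion2B-s32B-b79K):

```python
import torch
from diffusers import StableDiffusionXLPipeline
from transformers import CLIPVisionModelWithProjection

# Converted copy of CLIP-ViT-H-14-laion2B-s32B-b79K shipped with the IP-Adapter repo
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "h94/IP-Adapter",
    subfolder="models/image_encoder",
    torch_dtype=torch.float16,
)

# Base checkpoint is a placeholder; it should be the same model A1111 runs
pipeline = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    image_encoder=image_encoder,
    torch_dtype=torch.float16,
).to("cuda")

# The "vit-h" adapters are the ones trained against this encoder; the exact
# weight name should match whatever is selected in A1111's ControlNet unit
pipeline.load_ip_adapter(
    "h94/IP-Adapter",
    subfolder="sdxl_models",
    weight_name="ip-adapter-plus_sdxl_vit-h.safetensors",
)
```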
---
I noticed that ControlNet has a cross-attention weight setting, and I am wondering whether I made a mistake when writing the equivalent in diffusers:

`pipeline.unet.encoder_hid_proj.image_projection_layers[0].clip_embeds = clip_embeds.to(dtype=torch.float16)`

Do you have any more suggestions?
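In case the weight setting itself is the difference: as far as I can tell, the closest diffusers equivalent of ControlNet's weight slider is the IP-Adapter scale. A minimal sketch, with `pipeline` being the same pipeline as above:

```python
# ControlNet's "weight" roughly corresponds to the IP-Adapter scale in diffusers.
# A1111 defaults to 1.0; lower values weaken the influence of the image prompt.
pipeline.set_ip_adapter_scale(1.0)
```

Recent diffusers releases also accept a nested dict here to scale individual attention layers, if finer control turns out to be needed.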
---
I did it this way, but there were errors.
```
clip_embeds shape: torch.Size([2, 1, 257, 1664])
id_embeds shape: torch.Size([2, 1, 1, 512])
  0%|          | 0/20 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/wlk/x04.py", line 80, in <module>
    images = pipeline(
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py", line 1209, in __call__
    noise_pred = self.unet(
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/diffusers/models/unets/unet_2d_condition.py", line 1157, in forward
    encoder_hidden_states = self.process_encoder_hidden_states(
  File "/opt/conda/lib/python3.10/site-packages/diffusers/models/unets/unet_2d_condition.py", line 1028, in process_encoder_hidden_states
    image_embeds = self.encoder_hid_proj(image_embeds)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/diffusers/models/embeddings.py", line 1268, in forward
    image_embed = image_projection_layer(image_embed)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/diffusers/models/embeddings.py", line 1151, in forward
    x = self.proj_in(x)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 116, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x512 and 1280x1280)
```
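Reading the last frame against the shapes printed above: `F.linear` receives a 2x512 matrix, i.e. the flattened 512-dim `id_embeds`, but the layer weight is 1280x1280, so the projection layer being called expects 1280-dim CLIP hidden states rather than face embeddings. That suggests either the loaded adapter's projection is a plain "plus" resampler instead of a FaceID one, or the embeds are being passed to the wrong place. (The printed `clip_embeds` are also 1664-wide, which looks like SDXL's default bigG encoder rather than the 1280-wide ViT-H these adapters are trained against.) For comparison, a sketch of the call pattern I believe a FaceID Plus adapter expects, reusing `pipeline`, `clip_embeds`, and `id_embeds` from above (the weight name and prompt are assumptions):

```python
# Sketch assuming the SDXL FaceID Plus v2 adapter; weight name is an assumption
pipeline.load_ip_adapter(
    "h94/IP-Adapter-FaceID",
    subfolder=None,
    weight_name="ip-adapter-faceid-plusv2_sdxl.bin",
    image_encoder_folder=None,  # CLIP features are attached manually below
)

# CLIP hidden states (e.g. [2, 257, 1280] from the ViT-H encoder: negative +
# positive for CFG) are attached to the projection layer, not passed to the call
proj = pipeline.unet.encoder_hid_proj.image_projection_layers[0]
proj.clip_embeds = clip_embeds.to(dtype=torch.float16)
proj.shortcut = True  # True for "plusv2" weights, False for plain "plus"

# Only the 512-dim insightface embeds go through ip_adapter_image_embeds,
# stacked as [negative, positive] for classifier-free guidance
images = pipeline(
    prompt="a photo of a person",  # placeholder prompt
    ip_adapter_image_embeds=[id_embeds.to(dtype=torch.float16, device="cuda")],
    num_inference_steps=20,
).images
```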