update stable diffusion v2 demo with new api (openvinotoolkit#1284)
eaidova authored Aug 30, 2023
1 parent 43a496d commit f8e9074
Showing 4 changed files with 97 additions and 157 deletions.
@@ -61,7 +61,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"!pip install -q \"diffusers>=0.14.0\" openvino-dev openvino \"transformers >= 4.25.1\" accelerate"
+"!pip install -q \"diffusers>=0.14.0\" \"openvino==2023.1.0dev20230811\" \"transformers >= 4.25.1\" accelerate"
 ]
 },
 {
@@ -142,18 +142,16 @@
 "source": [
 "Now that we've retrieved the three parts for both of these pipelines, we need to:\n",
 "\n",
-"1. Convert the original PyTorch models to ONNX format\n",
+"1. Convert the original PyTorch models to OpenVINO format using the Model Conversion API:\n",
 "\n",
 "```\n",
-"with torch.no_grad():\n",
-"    torch.onnx.export(model_part, image, onnx_path, input_names=[\n",
-"        '...'], output_names=['...'])\n",
+"ov_model_part = ov.convert_model(model_part, example_input=input_data)\n",
 "```\n",
 "\n",
-"2. Convert these ONNX models to OpenVINO IR format, using `mo` command-line tool:\n",
+"2. Save the OpenVINO models to disk:\n",
 "\n",
 "```\n",
-"!mo --input_model $onnx_file_path --output_dir $model_dir\n",
+"ov.save_model(ov_model_part, xml_file_path)\n",
 "```\n",
 "\n",
 "We can then run our Stable Diffusion v2 text-to-image and inpainting pipelines in OpenVINO on our own data!"
@@ -196,18 +194,17 @@
 }
 ],
 "source": [
-"from implementation.conversion_helper_utils import convert_txt_encoder_onnx_OV, convert_unet_onnx_OV\n",
-"from implementation.conversion_helper_utils import convert_vae_encoder_onnx_OV, convert_vae_decoder_onnx_OV\n",
+"from implementation.conversion_helper_utils import convert_encoder, convert_unet, convert_vae_decoder, convert_vae_encoder\n",
 "\n",
 "# Convert the Text-to-Image models from PyTorch -> Onnx -> OpenVINO\n",
 "# 1. Convert the Text Encoder\n",
-"txt_encoder_ov_path = convert_txt_encoder_onnx_OV(txt2img_model_dir, text_encoder)\n",
+"txt_encoder_ov_path = convert_encoder(text_encoder, txt2img_model_dir / \"text_encoder.xml\")\n",
 "# 2. Convert the U-NET\n",
-"unet_ov_path = convert_unet_onnx_OV(txt2img_model_dir, unet, num_channels=4, width=96, height=96)\n",
+"unet_ov_path = convert_unet(unet, txt2img_model_dir / \"unet.xml\", num_channels=4, width=96, height=96)\n",
 "# 3. Convert the VAE encoder\n",
-"vae_encoder_ov_path = convert_vae_encoder_onnx_OV(txt2img_model_dir, vae, width=768, height=768)\n",
+"vae_encoder_ov_path = convert_vae_encoder(vae, txt2img_model_dir / \"vae_encoder.xml\", width=768, height=768)\n",
 "# 4. Convert the VAE decoder\n",
-"vae_decoder_ov_path = convert_vae_decoder_onnx_OV(txt2img_model_dir, vae, width=96, height=96)"
+"vae_decoder_ov_path = convert_vae_decoder(vae, txt2img_model_dir / \"vae_decoder.xml\", width=96, height=96)"
 ]
 },
 {
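
The `.xml` paths returned by these helpers can then be loaded and compiled with the OpenVINO runtime. A sketch, assuming the paths from the cell above; the CPU target is an assumption here:

```python
import openvino as ov

core = ov.Core()
# compile_model accepts a path to the .xml file directly
compiled_text_encoder = core.compile_model(txt_encoder_ov_path, "CPU")
compiled_unet = core.compile_model(unet_ov_path, "CPU")
```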
implementation/conversion_helper_utils.py
@@ -2,83 +2,98 @@
 import gc
 import torch
 import numpy as np
-from openvino.tools.mo import convert_model
-from openvino.runtime import serialize
+import openvino as ov
 
 
-def convert_encoder_onnx(text_encoder: torch.nn.Module, onnx_path: Path):
+def cleanup_torchscript_cache():
     """
-    Convert Text Encoder model to ONNX.
-    Function accepts pipeline, prepares example inputs for ONNX conversion via torch.export.
+    Helper for removing cached model representation.
     """
+    torch._C._jit_clear_class_registry()
+    torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
+    torch.jit._state._clear_class_state()
+
+
+def convert_encoder(text_encoder: torch.nn.Module, ir_path: Path):
+    """
+    Convert Text Encoder model to IR.
+    Function accepts pipeline, prepares example inputs for conversion.
     Parameters:
         text_encoder (torch.nn.Module): text encoder PyTorch model
-        onnx_path (Path): File for storing onnx model
+        ir_path (Path): File for storing model
     Returns:
         None
     """
-    if not onnx_path.exists():
+    if not ir_path.exists():
         input_ids = torch.ones((1, 77), dtype=torch.long)
         # switch model to inference mode
         text_encoder.eval()
 
         # disable gradients calculation for reducing memory consumption
         with torch.no_grad():
-            # export model to ONNX format
-            torch.onnx._export(
+            # export model
+            ov_model = ov.convert_model(
                 text_encoder,  # model instance
-                input_ids,  # inputs for model tracing
-                onnx_path,  # output file for saving result
-                input_names=['tokens'],  # model input name for onnx representation
-                output_names=['last_hidden_state', 'pooler_out'],  # model output names for onnx representation
-                opset_version=14,  # onnx opset version for export
-                onnx_shape_inference=False
+                example_input=input_ids,  # example inputs for model tracing
+                input=([1, 77],)  # input shape for conversion
             )
-        print('Text Encoder successfully converted to ONNX')
+        ov.save_model(ov_model, ir_path)
+        del ov_model
+        cleanup_torchscript_cache()
+        print('Text Encoder successfully converted to IR')
 
 
-def convert_unet_onnx(unet: torch.nn.Module, onnx_path: Path, num_channels: int = 4, width: int = 64, height: int = 64):
+def convert_unet(unet: torch.nn.Module, ir_path: Path, num_channels: int = 4, width: int = 64, height: int = 64):
     """
-    Convert Unet model to ONNX, then IR format.
-    Function accepts pipeline, prepares example inputs for ONNX conversion via torch.export.
+    Convert Unet model to IR format.
+    Function accepts pipeline, prepares example inputs for conversion.
     Parameters:
         unet (torch.nn.Module): UNet PyTorch model
-        onnx_path (Path): File for storing onnx model
+        ir_path (Path): File for storing model
         num_channels (int, optional, 4): number of input channels
         width (int, optional, 64): input width
         height (int, optional, 64): input height
     Returns:
         None
     """
-    if not onnx_path.exists():
+    dtype_mapping = {
+        torch.float32: ov.Type.f32,
+        torch.float64: ov.Type.f64
+    }
+    if not ir_path.exists():
         # prepare inputs
         encoder_hidden_state = torch.ones((2, 77, 1024))
         latents_shape = (2, num_channels, width, height)
         latents = torch.randn(latents_shape)
         t = torch.from_numpy(np.array(1, dtype=np.float32))
 
-        # model size > 2Gb, it will be represented as onnx with external data files; we store it in a separate directory to avoid cluttering the current directory with many files
-        onnx_path.parent.mkdir(exist_ok=True, parents=True)
         unet.eval()
+        dummy_inputs = (latents, t, encoder_hidden_state)
+        input_info = []
+        for input_tensor in dummy_inputs:
+            shape = ov.PartialShape(tuple(input_tensor.shape))
+            element_type = dtype_mapping[input_tensor.dtype]
+            input_info.append((shape, element_type))
 
         with torch.no_grad():
-            torch.onnx._export(
+            ov_model = ov.convert_model(
                 unet,
-                (latents, t, encoder_hidden_state), str(onnx_path),
-                input_names=['latent_model_input', 't', 'encoder_hidden_states'],
-                output_names=['out_sample'],
-                onnx_shape_inference=False
+                example_input=dummy_inputs,
+                input=input_info
             )
-        print('U-Net successfully converted to ONNX')
+        ov.save_model(ov_model, ir_path)
+        del ov_model
+        cleanup_torchscript_cache()
+        print('U-Net successfully converted to IR')
 
 
-def convert_vae_encoder_onnx(vae: torch.nn.Module, onnx_path: Path, width: int = 512, height: int = 512):
+def convert_vae_encoder(vae: torch.nn.Module, ir_path: Path, width: int = 512, height: int = 512):
     """
-    Convert VAE model to ONNX, then IR format.
-    Function accepts pipeline, creates wrapper class for export of only the part necessary for inference,
-    prepares example inputs for ONNX conversion via torch.export.
+    Convert VAE encoder model to IR format.
+    Function accepts VAE model, creates wrapper class for export of only the part necessary for inference,
+    prepares example inputs for conversion.
     Parameters:
         vae (torch.nn.Module): VAE PyTorch model
-        onnx_path (Path): File for storing onnx model
+        ir_path (Path): File for storing model
         width (int, optional, 512): input width
         height (int, optional, 512): input height
     Returns:
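
Two details of the rewritten helpers are worth noting. First, `ov.convert_model` traces the module through TorchScript, and the traced class state stays cached in the process; calling `cleanup_torchscript_cache()` after each conversion releases that memory, which matters when converting several large models in one session. Second, the `input=` argument pins static input shapes, e.g. `([1, 77],)` fixes the text encoder to a single 77-token prompt. A sketch of the intended call, assuming the text encoder comes from the SD v2.1 checkpoint (the model ID and output path here are illustrative, not taken from the notebook):

```python
from pathlib import Path
from transformers import CLIPTextModel

# Assumption: the demo's text encoder is the CLIP text model from the SD v2.1 repo
text_encoder = CLIPTextModel.from_pretrained(
    "stabilityai/stable-diffusion-2-1", subfolder="text_encoder"
)
convert_encoder(text_encoder, Path("sd2.1/text_encoder.xml"))
```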
@@ -90,28 +105,28 @@ def __init__(self, vae):
             self.vae = vae
 
         def forward(self, image):
-            h = self.vae.encoder(image)
-            moments = self.vae.quant_conv(h)
-            return moments
+            return self.vae.encode(x=image)["latent_dist"].sample()
 
-    if not onnx_path.exists():
+    if not ir_path.exists():
         vae_encoder = VAEEncoderWrapper(vae)
         vae_encoder.eval()
         image = torch.zeros((1, 3, width, height))
         with torch.no_grad():
-            torch.onnx.export(vae_encoder, image, onnx_path, input_names=[
-                'init_image'], output_names=['image_latent'])
-        print('VAE encoder successfully converted to ONNX')
+            ov_model = ov.convert_model(vae_encoder, example_input=image, input=([1, 3, width, height],))
+        ov.save_model(ov_model, ir_path)
+        del ov_model
+        cleanup_torchscript_cache()
+        print('VAE encoder successfully converted to IR')
 
 
-def convert_vae_decoder_onnx(vae: torch.nn.Module, onnx_path: Path, width: int = 64, height: int = 64):
+def convert_vae_decoder(vae: torch.nn.Module, ir_path: Path, width: int = 64, height: int = 64):
     """
-    Convert VAE model to ONNX, then IR format.
-    Function accepts pipeline, creates wrapper class for export of only the part necessary for inference,
-    prepares example inputs for ONNX conversion via torch.export.
+    Convert VAE decoder model to IR format.
+    Function accepts VAE model, creates wrapper class for export of only the part necessary for inference,
+    prepares example inputs for conversion.
     Parameters:
-        vae:
-        onnx_path (Path): File for storing onnx model
+        vae (torch.nn.Module): VAE model
+        ir_path (Path): File for storing model
         width (int, optional, 64): input width
         height (int, optional, 64): input height
     Returns:
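
Design note on the new `VAEEncoderWrapper`: returning `latent_dist.sample()` bakes the reparameterized sampling (mean plus scaled Gaussian noise) into the traced graph, so the exported encoder emits a ready latent instead of the raw mean/log-variance moments the old pipeline had to split and sample in NumPy. A sketch of what the wrapper produces, restated here so it is self-contained (shapes assume the 512×512 defaults and the standard 8× spatial downscale of the SD v2 VAE; `vae` is the diffusers AutoencoderKL from the notebook):

```python
import torch

class VAEEncoderWrapper(torch.nn.Module):
    # same wrapper as in convert_vae_encoder, restated for a standalone sketch
    def __init__(self, vae):
        super().__init__()
        self.vae = vae

    def forward(self, image):
        return self.vae.encode(x=image)["latent_dist"].sample()

vae_encoder = VAEEncoderWrapper(vae).eval()
with torch.no_grad():
    latent = vae_encoder(torch.zeros((1, 3, 512, 512)))
print(latent.shape)  # expected: torch.Size([1, 4, 64, 64])
```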
@@ -123,82 +138,16 @@ def __init__(self, vae):
             self.vae = vae
 
         def forward(self, latents):
-            latents = 1 / 0.18215 * latents
             return self.vae.decode(latents)
 
-    if not onnx_path.exists():
+    if not ir_path.exists():
         vae_decoder = VAEDecoderWrapper(vae)
         latents = torch.zeros((1, 4, width, height))
 
         vae_decoder.eval()
         with torch.no_grad():
-            torch.onnx.export(vae_decoder, latents, onnx_path, input_names=[
-                'latents'], output_names=['sample'])
-        print('VAE decoder successfully converted to ONNX')
-
-
-# Helper code to convert models
-
-
-def convert_txt_encoder_onnx_OV(model_dir, text_encoder):
-    # Convert Text Encoder to ONNX then OpenVINO
-    txt_encoder_onnx_path = model_dir / 'text_encoder.onnx'
-    txt_encoder_ov_path = txt_encoder_onnx_path.with_suffix('.xml')
-
-    if not txt_encoder_ov_path.exists():
-        convert_encoder_onnx(text_encoder, txt_encoder_onnx_path)
-        txt_encoder_ov_model = convert_model(txt_encoder_onnx_path)
-        serialize(model=txt_encoder_ov_model, xml_path=str(txt_encoder_ov_path))
-    else:
-        print(f"Text encoder will be loaded from {txt_encoder_ov_path}")
-
-    del text_encoder
-    gc.collect()
-    return txt_encoder_ov_path
-
-
-def convert_unet_onnx_OV(model_dir, unet, num_channels=4, width=96, height=96):
-    # Convert U-Net to ONNX then OpenVINO
-    unet_onnx_path = model_dir / 'unet/unet.onnx'
-    unet_ov_path = unet_onnx_path.parents[1] / 'unet.xml'
-
-    if not unet_ov_path.exists():
-        convert_unet_onnx(unet, unet_onnx_path, num_channels=num_channels, width=width, height=height)
-        unet_ov_model = convert_model(unet_onnx_path)
-        serialize(model=unet_ov_model, xml_path=str(unet_ov_path))
-    else:
-        print(f"U-Net will be loaded from {unet_ov_path}")
-
-    del unet
-    gc.collect()
-    return unet_ov_path
-
-
-def convert_vae_encoder_onnx_OV(model_dir, vae, width=768, height=768):
-    # Converts the encoder VAE component to ONNX then OpenVINO
-    vae_encoder_onnx_path = model_dir / 'vae_encoder.onnx'
-    vae_encoder_ov_path = vae_encoder_onnx_path.with_suffix('.xml')
-
-    if not vae_encoder_ov_path.exists():
-        convert_vae_encoder_onnx(vae, vae_encoder_onnx_path, width=width, height=height)
-        encoder_ov_model = convert_model(vae_encoder_onnx_path)
-        serialize(model=encoder_ov_model, xml_path=str(vae_encoder_ov_path))
-    else:
-        print(f"VAE-Encoder will be loaded from {vae_encoder_ov_path}")
-    return vae_encoder_ov_path
-
-
-def convert_vae_decoder_onnx_OV(model_dir, vae, width=96, height=96):
-    # Converts the VAE decoder to ONNX then OpenVINO
-    vae_decoder_onnx_path = model_dir / 'vae_decoder.onnx'
-    vae_decoder_ov_path = vae_decoder_onnx_path.with_suffix('.xml')
-
-    if not vae_decoder_ov_path.exists():
-        convert_vae_decoder_onnx(vae, vae_decoder_onnx_path, width=width, height=height)
-        decoder_ov_model = convert_model(vae_decoder_onnx_path)
-        serialize(model=decoder_ov_model, xml_path=str(vae_decoder_ov_path))
-    else:
-        print(f"VAE decoder will be loaded from {vae_decoder_ov_path}")
-
-    del vae
-    gc.collect()
-    return vae_decoder_ov_path
+            ov_model = ov.convert_model(vae_decoder, example_input=latents, input=([1, 4, width, height],))
+        ov.save_model(ov_model, ir_path)
+        del ov_model
+        cleanup_torchscript_cache()
+        print('VAE decoder successfully converted to IR')
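
With the decoder wrapper no longer rescaling internally, the 0.18215 latent scale now follows one convention across the helper and the pipeline: multiply after encoding, divide before decoding. A sketch of the convention (the helper names are illustrative):

```python
import numpy as np

SCALE = 0.18215  # latent scaling factor used by SD v1/v2

def scale_for_unet(encoded: np.ndarray) -> np.ndarray:
    # the new encoder already returns a sampled latent; only the scale is applied
    return encoded * SCALE

def unscale_for_decoder(latents: np.ndarray) -> np.ndarray:
    # undo the scale before handing latents to the VAE decoder
    return latents * (1 / SCALE)
```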
(OVStableDiffusionInpaintingPipeline implementation file)
@@ -10,7 +10,7 @@
 from transformers import CLIPTokenizer
 from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from openvino.runtime import Model
+import openvino as ov
 
 
 def prepare_mask_and_masked_image(image: PIL.Image.Image, mask: PIL.Image.Image):
@@ -65,12 +65,12 @@ def prepare_mask_and_masked_image(image: PIL.Image.Image, mask: PIL.Image.Image):
 class OVStableDiffusionInpaintingPipeline(DiffusionPipeline):
     def __init__(
         self,
-        vae_decoder: Model,
-        text_encoder: Model,
+        vae_decoder: ov.Model,
+        text_encoder: ov.Model,
         tokenizer: CLIPTokenizer,
-        unet: Model,
+        unet: ov.Model,
         scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
-        vae_encoder: Model = None,
+        vae_encoder: ov.Model = None,
     ):
         """
         Pipeline for text-to-image generation using Stable Diffusion.
@@ -129,10 +129,8 @@ def prepare_mask_latents(
         mask = mask.numpy()
 
         # encode the mask image into latents space so we can concatenate it to the latents
-        moments = self.vae_encoder(masked_image)[self._vae_e_output]
-        mean, logvar = np.split(moments, 2, axis=1)
-        std = np.exp(logvar * 0.5)
-        masked_image_latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215
+        logits = self.vae_encoder(masked_image)[self._vae_e_output]
+        masked_image_latents = logits * 0.18215
 
         mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask
         masked_image_latents = (
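
For comparison, the block below reproduces the sampling the pipeline used to perform on the raw `moments` output of the old encoder. Because sampling now happens inside the exported graph, those four lines reduce to a single multiplication by 0.18215 (a reference sketch of the removed logic, not part of the updated code):

```python
import numpy as np

def sample_from_moments(moments: np.ndarray) -> np.ndarray:
    # old contract: mean and log-variance concatenated along axis 1
    mean, logvar = np.split(moments, 2, axis=1)
    std = np.exp(logvar * 0.5)
    return (mean + std * np.random.randn(*mean.shape)) * 0.18215
```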
@@ -260,7 +258,7 @@ def __call__(
                 **extra_step_kwargs,
             )["prev_sample"].numpy()
         # scale and decode the image latents with vae
-        image = self.vae_decoder(latents)[self._vae_d_output]
+        image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output]
 
         image = self.postprocess_image(image, meta, output_type)
         return {"sample": image}
@@ -353,10 +351,8 @@ def prepare_latents(self, image:PIL.Image.Image = None, latent_timestep:torch.Te
             noise = noise * self.scheduler.sigmas[0].numpy()
             return noise, {}
         input_image, meta = preprocess(image)
-        moments = self.vae_encoder(input_image)[self._vae_e_output]
-        mean, logvar = np.split(moments, 2, axis=1)
-        std = np.exp(logvar * 0.5)
-        latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215
+        latents = self.vae_encoder(input_image)[self._vae_e_output]
+        latents = latents * 0.18215
         latents = self.scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy()
         return latents, meta
 