update stable diffusion v2 demo with new api (openvinotoolkit#1284)
eaidova authored Aug 30, 2023
1 parent 43a496d commit f8e9074
Showing 4 changed files with 97 additions and 157 deletions.
@@ -61,7 +61,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"!pip install -q \"diffusers>=0.14.0\" openvino-dev openvino \"transformers >= 4.25.1\" accelerate"
+"!pip install -q \"diffusers>=0.14.0\" \"openvino==2023.1.0dev20230811\" \"transformers >= 4.25.1\" accelerate"
 ]
 },
 {
@@ -142,18 +142,16 @@
 "source": [
 "Now that we've retrieved the three parts for both of these pipelines, we need to:\n",
 "\n",
-"1. Convert the original PyTorch models to ONNX format\n",
+"1. Convert the original PyTorch models to OpenVINO format using the Model Conversion API:\n",
 "\n",
 "```\n",
-"with torch.no_grad():\n",
-"    torch.onnx.export(model_part, image, onnx_path, input_names=[\n",
-"        '...'], output_names=['...'])\n",
+"ov_model_part = ov.convert_model(model_part, example_input=input_data)\n",
 "```\n",
 "\n",
-"2. Convert these ONNX models to OpenVINO IR format, using `mo` command-line tool:\n",
+"2. Save the OpenVINO models to disk:\n",
 "\n",
 "```\n",
-"!mo --input_model $onnx_file_path --output_dir $model_dir\n",
+"ov.save_model(ov_model_part, xml_file_path)\n",
 "```\n",
 "\n",
 "We can then run our Stable Diffusion v2 text-to-image and inpainting pipelines in OpenVINO on our own data!"
@@ -196,18 +194,17 @@
 }
 ],
 "source": [
-"from implementation.conversion_helper_utils import convert_txt_encoder_onnx_OV, convert_unet_onnx_OV\n",
-"from implementation.conversion_helper_utils import convert_vae_encoder_onnx_OV, convert_vae_decoder_onnx_OV\n",
+"from implementation.conversion_helper_utils import convert_encoder, convert_unet, convert_vae_decoder, convert_vae_encoder\n",
 "\n",
 "# Convert the Text-to-Image models from PyTorch -> Onnx -> OpenVINO\n",
 "# 1. Convert the Text Encoder\n",
-"txt_encoder_ov_path = convert_txt_encoder_onnx_OV(txt2img_model_dir, text_encoder)\n",
+"txt_encoder_ov_path = convert_encoder(text_encoder, txt2img_model_dir / \"text_encoder.xml\")\n",
 "# 2. Convert the U-NET\n",
-"unet_ov_path = convert_unet_onnx_OV(txt2img_model_dir, unet, num_channels=4, width=96, height=96)\n",
+"unet_ov_path = convert_unet(unet, txt2img_model_dir / \"unet.xml\", num_channels=4, width=96, height=96)\n",
 "# 3. Convert the VAE encoder\n",
-"vae_encoder_ov_path = convert_vae_encoder_onnx_OV(txt2img_model_dir, vae, width=768, height=768)\n",
+"vae_encoder_ov_path = convert_vae_encoder(vae, txt2img_model_dir / \"vae_encoder.xml\", width=768, height=768)\n",
 "# 4. Convert the VAE decoder\n",
-"vae_decoder_ov_path = convert_vae_decoder_onnx_OV(txt2img_model_dir, vae, width=96, height=96)"
+"vae_decoder_ov_path = convert_vae_decoder(vae, txt2img_model_dir / \"vae_decoder.xml\", width=96, height=96)"
 ]
 },
 {
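
The `.xml` paths returned by these helpers can then be loaded and compiled with the OpenVINO runtime. A sketch, assuming the paths from the cell above; the CPU target is an assumption here:

```python
import openvino as ov

core = ov.Core()
# compile_model accepts a path to the .xml file directly
compiled_text_encoder = core.compile_model(txt_encoder_ov_path, "CPU")
compiled_unet = core.compile_model(unet_ov_path, "CPU")
```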
implementation/conversion_helper_utils.py
@@ -2,83 +2,98 @@
 import gc
 import torch
 import numpy as np
-from openvino.tools.mo import convert_model
-from openvino.runtime import serialize
+import openvino as ov
 
 
-def convert_encoder_onnx(text_encoder: torch.nn.Module, onnx_path: Path):
+def cleanup_torchscript_cache():
     """
-    Convert Text Encoder model to ONNX.
-    Function accepts pipeline, prepares example inputs for ONNX conversion via torch.export.
+    Helper for removing cached model representation.
     """
+    torch._C._jit_clear_class_registry()
+    torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
+    torch.jit._state._clear_class_state()
+
+
+def convert_encoder(text_encoder: torch.nn.Module, ir_path: Path):
+    """
+    Convert Text Encoder model to IR.
+    Function accepts pipeline, prepares example inputs for conversion.
     Parameters:
         text_encoder (torch.nn.Module): text encoder PyTorch model
-        onnx_path (Path): File for storing onnx model
+        ir_path (Path): File for storing model
     Returns:
         None
     """
-    if not onnx_path.exists():
+    if not ir_path.exists():
         input_ids = torch.ones((1, 77), dtype=torch.long)
         # switch model to inference mode
         text_encoder.eval()
 
         # disable gradients calculation for reducing memory consumption
         with torch.no_grad():
-            # export model to ONNX format
-            torch.onnx._export(
+            # export model
+            ov_model = ov.convert_model(
                 text_encoder,  # model instance
-                input_ids,  # inputs for model tracing
-                onnx_path,  # output file for saving result
-                input_names=['tokens'],  # model input name for onnx representation
-                output_names=['last_hidden_state', 'pooler_out'],  # model output names for onnx representation
-                opset_version=14,  # onnx opset version for export
-                onnx_shape_inference=False
+                example_input=input_ids,  # example inputs for model tracing
+                input=([1, 77],)  # input shape for conversion
             )
-        print('Text Encoder successfully converted to ONNX')
+        ov.save_model(ov_model, ir_path)
+        del ov_model
+        cleanup_torchscript_cache()
+        print('Text Encoder successfully converted to IR')
 
 
-def convert_unet_onnx(unet: torch.nn.Module, onnx_path: Path, num_channels: int = 4, width: int = 64, height: int = 64):
+def convert_unet(unet: torch.nn.Module, ir_path: Path, num_channels: int = 4, width: int = 64, height: int = 64):
     """
-    Convert Unet model to ONNX, then IR format.
-    Function accepts pipeline, prepares example inputs for ONNX conversion via torch.export.
+    Convert Unet model to IR format.
+    Function accepts pipeline, prepares example inputs for conversion.
     Parameters:
         unet (torch.nn.Module): UNet PyTorch model
-        onnx_path (Path): File for storing onnx model
+        ir_path (Path): File for storing model
         num_channels (int, optional, 4): number of input channels
         width (int, optional, 64): input width
         height (int, optional, 64): input height
     Returns:
         None
     """
-    if not onnx_path.exists():
+    dtype_mapping = {
+        torch.float32: ov.Type.f32,
+        torch.float64: ov.Type.f64
+    }
+    if not ir_path.exists():
         # prepare inputs
         encoder_hidden_state = torch.ones((2, 77, 1024))
         latents_shape = (2, num_channels, width, height)
         latents = torch.randn(latents_shape)
         t = torch.from_numpy(np.array(1, dtype=np.float32))
 
-        # model size > 2Gb, it will be represented as onnx with external data files; we store it in a separate directory to avoid cluttering the current directory with many files
-        onnx_path.parent.mkdir(exist_ok=True, parents=True)
         unet.eval()
+        dummy_inputs = (latents, t, encoder_hidden_state)
+        input_info = []
+        for input_tensor in dummy_inputs:
+            shape = ov.PartialShape(tuple(input_tensor.shape))
+            element_type = dtype_mapping[input_tensor.dtype]
+            input_info.append((shape, element_type))
 
         with torch.no_grad():
-            torch.onnx._export(
+            ov_model = ov.convert_model(
                 unet,
-                (latents, t, encoder_hidden_state), str(onnx_path),
-                input_names=['latent_model_input', 't', 'encoder_hidden_states'],
-                output_names=['out_sample'],
-                onnx_shape_inference=False
+                example_input=dummy_inputs,
+                input=input_info
             )
-        print('U-Net successfully converted to ONNX')
+        ov.save_model(ov_model, ir_path)
+        del ov_model
+        cleanup_torchscript_cache()
+        print('U-Net successfully converted to IR')
 
 
-def convert_vae_encoder_onnx(vae: torch.nn.Module, onnx_path: Path, width: int = 512, height: int = 512):
+def convert_vae_encoder(vae: torch.nn.Module, ir_path: Path, width: int = 512, height: int = 512):
     """
-    Convert VAE model to ONNX, then IR format.
-    Function accepts pipeline, creates wrapper class for export of only the part necessary for inference,
-    prepares example inputs for ONNX conversion via torch.export.
+    Convert VAE encoder model to IR format.
+    Function accepts VAE model, creates wrapper class for export of only the part necessary for inference,
+    prepares example inputs for conversion.
     Parameters:
         vae (torch.nn.Module): VAE PyTorch model
-        onnx_path (Path): File for storing onnx model
+        ir_path (Path): File for storing model
         width (int, optional, 512): input width
         height (int, optional, 512): input height
     Returns:
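
Two details of the rewritten helpers are worth noting. First, `ov.convert_model` traces the module through TorchScript, and the traced class state stays cached in the process; calling `cleanup_torchscript_cache()` after each conversion releases that memory, which matters when converting several large models in one session. Second, the `input=` argument pins static input shapes, e.g. `([1, 77],)` fixes the text encoder to a single 77-token prompt. A sketch of the intended call, assuming the text encoder comes from the SD v2.1 checkpoint (the model ID and output path here are illustrative, not taken from the notebook):

```python
from pathlib import Path
from transformers import CLIPTextModel

# Assumption: the demo's text encoder is the CLIP text model from the SD v2.1 repo
text_encoder = CLIPTextModel.from_pretrained(
    "stabilityai/stable-diffusion-2-1", subfolder="text_encoder"
)
convert_encoder(text_encoder, Path("sd2.1/text_encoder.xml"))
```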
@@ -90,28 +105,28 @@ def __init__(self, vae):
             self.vae = vae
 
         def forward(self, image):
-            h = self.vae.encoder(image)
-            moments = self.vae.quant_conv(h)
-            return moments
+            return self.vae.encode(x=image)["latent_dist"].sample()
 
-    if not onnx_path.exists():
+    if not ir_path.exists():
         vae_encoder = VAEEncoderWrapper(vae)
         vae_encoder.eval()
         image = torch.zeros((1, 3, width, height))
         with torch.no_grad():
-            torch.onnx.export(vae_encoder, image, onnx_path, input_names=[
-                'init_image'], output_names=['image_latent'])
-        print('VAE encoder successfully converted to ONNX')
+            ov_model = ov.convert_model(vae_encoder, example_input=image, input=([1, 3, width, height],))
+        ov.save_model(ov_model, ir_path)
+        del ov_model
+        cleanup_torchscript_cache()
+        print('VAE encoder successfully converted to IR')
 
 
-def convert_vae_decoder_onnx(vae: torch.nn.Module, onnx_path: Path, width: int = 64, height: int = 64):
+def convert_vae_decoder(vae: torch.nn.Module, ir_path: Path, width: int = 64, height: int = 64):
     """
-    Convert VAE model to ONNX, then IR format.
-    Function accepts pipeline, creates wrapper class for export of only the part necessary for inference,
-    prepares example inputs for ONNX conversion via torch.export.
+    Convert VAE decoder model to IR format.
+    Function accepts VAE model, creates wrapper class for export of only the part necessary for inference,
+    prepares example inputs for conversion.
     Parameters:
-        vae:
-        onnx_path (Path): File for storing onnx model
+        vae (torch.nn.Module): VAE model
+        ir_path (Path): File for storing model
         width (int, optional, 64): input width
         height (int, optional, 64): input height
     Returns:
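
Design note on the new `VAEEncoderWrapper`: returning `latent_dist.sample()` bakes the reparameterized sampling (mean plus scaled Gaussian noise) into the traced graph, so the exported encoder emits a ready latent instead of the raw mean/log-variance moments the old pipeline had to split and sample in NumPy. A sketch of what the wrapper produces, restated here so it is self-contained (shapes assume the 512×512 defaults and the standard 8× spatial downscale of the SD v2 VAE; `vae` is the diffusers AutoencoderKL from the notebook):

```python
import torch

class VAEEncoderWrapper(torch.nn.Module):
    # same wrapper as in convert_vae_encoder, restated for a standalone sketch
    def __init__(self, vae):
        super().__init__()
        self.vae = vae

    def forward(self, image):
        return self.vae.encode(x=image)["latent_dist"].sample()

vae_encoder = VAEEncoderWrapper(vae).eval()
with torch.no_grad():
    latent = vae_encoder(torch.zeros((1, 3, 512, 512)))
print(latent.shape)  # expected: torch.Size([1, 4, 64, 64])
```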
@@ -123,82 +138,16 @@ def __init__(self, vae):
             self.vae = vae
 
         def forward(self, latents):
-            latents = 1 / 0.18215 * latents
             return self.vae.decode(latents)
 
-    if not onnx_path.exists():
+    if not ir_path.exists():
         vae_decoder = VAEDecoderWrapper(vae)
         latents = torch.zeros((1, 4, width, height))
 
         vae_decoder.eval()
         with torch.no_grad():
-            torch.onnx.export(vae_decoder, latents, onnx_path, input_names=[
-                'latents'], output_names=['sample'])
-        print('VAE decoder successfully converted to ONNX')
-
-
-# Helper code to convert models
-
-
-def convert_txt_encoder_onnx_OV(model_dir, text_encoder):
-    # Convert Text Encoder to ONNX then OpenVINO
-    txt_encoder_onnx_path = model_dir / 'text_encoder.onnx'
-    txt_encoder_ov_path = txt_encoder_onnx_path.with_suffix('.xml')
-
-    if not txt_encoder_ov_path.exists():
-        convert_encoder_onnx(text_encoder, txt_encoder_onnx_path)
-        txt_encoder_ov_model = convert_model(txt_encoder_onnx_path)
-        serialize(model=txt_encoder_ov_model, xml_path=str(txt_encoder_ov_path))
-    else:
-        print(f"Text encoder will be loaded from {txt_encoder_ov_path}")
-
-    del text_encoder
-    gc.collect()
-    return txt_encoder_ov_path
-
-
-def convert_unet_onnx_OV(model_dir, unet, num_channels=4, width=96, height=96):
-    # Convert U-Net to ONNX then OpenVINO
-    unet_onnx_path = model_dir / 'unet/unet.onnx'
-    unet_ov_path = unet_onnx_path.parents[1] / 'unet.xml'
-
-    if not unet_ov_path.exists():
-        convert_unet_onnx(unet, unet_onnx_path, num_channels=num_channels, width=width, height=height)
-        unet_ov_model = convert_model(unet_onnx_path)
-        serialize(model=unet_ov_model, xml_path=str(unet_ov_path))
-    else:
-        print(f"U-Net will be loaded from {unet_ov_path}")
-
-    del unet
-    gc.collect()
-    return unet_ov_path
-
-
-def convert_vae_encoder_onnx_OV(model_dir, vae, width=768, height=768):
-    # Converts the encoder VAE component to ONNX then OpenVINO
-    vae_encoder_onnx_path = model_dir / 'vae_encoder.onnx'
-    vae_encoder_ov_path = vae_encoder_onnx_path.with_suffix('.xml')
-
-    if not vae_encoder_ov_path.exists():
-        convert_vae_encoder_onnx(vae, vae_encoder_onnx_path, width=width, height=height)
-        encoder_ov_model = convert_model(vae_encoder_onnx_path)
-        serialize(model=encoder_ov_model, xml_path=str(vae_encoder_ov_path))
-    else:
-        print(f"VAE-Encoder will be loaded from {vae_encoder_ov_path}")
-    return vae_encoder_ov_path
-
-
-def convert_vae_decoder_onnx_OV(model_dir, vae, width=96, height=96):
-    # Converts the VAE decoder to ONNX then OpenVINO
-    vae_decoder_onnx_path = model_dir / 'vae_decoder.onnx'
-    vae_decoder_ov_path = vae_decoder_onnx_path.with_suffix('.xml')
-
-    if not vae_decoder_ov_path.exists():
-        convert_vae_decoder_onnx(vae, vae_decoder_onnx_path, width=width, height=height)
-        decoder_ov_model = convert_model(vae_decoder_onnx_path)
-        serialize(model=decoder_ov_model, xml_path=str(vae_decoder_ov_path))
-    else:
-        print(f"VAE decoder will be loaded from {vae_decoder_ov_path}")
-
-    del vae
-    gc.collect()
-    return vae_decoder_ov_path
+            ov_model = ov.convert_model(vae_decoder, example_input=latents, input=([1, 4, width, height],))
+        ov.save_model(ov_model, ir_path)
+        del ov_model
+        cleanup_torchscript_cache()
+        print('VAE decoder successfully converted to IR')
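
With the decoder wrapper no longer rescaling internally, the 0.18215 latent scale now follows one convention across the helper and the pipeline: multiply after encoding, divide before decoding. A sketch of the convention (the helper names are illustrative):

```python
import numpy as np

SCALE = 0.18215  # latent scaling factor used by SD v1/v2

def scale_for_unet(encoded: np.ndarray) -> np.ndarray:
    # the new encoder already returns a sampled latent; only the scale is applied
    return encoded * SCALE

def unscale_for_decoder(latents: np.ndarray) -> np.ndarray:
    # undo the scale before handing latents to the VAE decoder
    return latents * (1 / SCALE)
```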
(OVStableDiffusionInpaintingPipeline implementation file)
@@ -10,7 +10,7 @@
 from transformers import CLIPTokenizer
 from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from openvino.runtime import Model
+import openvino as ov
 
 
 def prepare_mask_and_masked_image(image: PIL.Image.Image, mask: PIL.Image.Image):
@@ -65,12 +65,12 @@ def prepare_mask_and_masked_image(image: PIL.Image.Image, mask: PIL.Image.Image):
 class OVStableDiffusionInpaintingPipeline(DiffusionPipeline):
     def __init__(
         self,
-        vae_decoder: Model,
-        text_encoder: Model,
+        vae_decoder: ov.Model,
+        text_encoder: ov.Model,
         tokenizer: CLIPTokenizer,
-        unet: Model,
+        unet: ov.Model,
         scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
-        vae_encoder: Model = None,
+        vae_encoder: ov.Model = None,
     ):
         """
         Pipeline for text-to-image generation using Stable Diffusion.
@@ -129,10 +129,8 @@ def prepare_mask_latents(
         mask = mask.numpy()
 
         # encode the mask image into latents space so we can concatenate it to the latents
-        moments = self.vae_encoder(masked_image)[self._vae_e_output]
-        mean, logvar = np.split(moments, 2, axis=1)
-        std = np.exp(logvar * 0.5)
-        masked_image_latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215
+        logits = self.vae_encoder(masked_image)[self._vae_e_output]
+        masked_image_latents = logits * 0.18215
 
         mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask
         masked_image_latents = (
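
For comparison, the block below reproduces the sampling the pipeline used to perform on the raw `moments` output of the old encoder. Because sampling now happens inside the exported graph, those four lines reduce to a single multiplication by 0.18215 (a reference sketch of the removed logic, not part of the updated code):

```python
import numpy as np

def sample_from_moments(moments: np.ndarray) -> np.ndarray:
    # old contract: mean and log-variance concatenated along axis 1
    mean, logvar = np.split(moments, 2, axis=1)
    std = np.exp(logvar * 0.5)
    return (mean + std * np.random.randn(*mean.shape)) * 0.18215
```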
@@ -260,7 +258,7 @@ def __call__(
                 **extra_step_kwargs,
             )["prev_sample"].numpy()
         # scale and decode the image latents with vae
-        image = self.vae_decoder(latents)[self._vae_d_output]
+        image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output]
 
         image = self.postprocess_image(image, meta, output_type)
         return {"sample": image}
@@ -353,10 +351,8 @@ def prepare_latents(self, image:PIL.Image.Image = None, latent_timestep:torch.Te
             noise = noise * self.scheduler.sigmas[0].numpy()
             return noise, {}
         input_image, meta = preprocess(image)
-        moments = self.vae_encoder(input_image)[self._vae_e_output]
-        mean, logvar = np.split(moments, 2, axis=1)
-        std = np.exp(logvar * 0.5)
-        latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215
+        latents = self.vae_encoder(input_image)[self._vae_e_output]
+        latents = latents * 0.18215
         latents = self.scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy()
         return latents, meta
 