diff --git a/ai_edge_torch/generative/examples/stable_diffusion/pipeline.py b/ai_edge_torch/generative/examples/stable_diffusion/pipeline.py
index df7293e1..fb3b26ef 100644
--- a/ai_edge_torch/generative/examples/stable_diffusion/pipeline.py
+++ b/ai_edge_torch/generative/examples/stable_diffusion/pipeline.py
@@ -15,16 +15,16 @@
 import argparse
 import os
-from pathlib import Path
-from typing import Dict, Optional
+import pathlib
+from typing import Optional
 
-import ai_edge_torch.generative.examples.stable_diffusion.samplers as samplers
-from ai_edge_torch.generative.examples.stable_diffusion.tokenizer import Tokenizer  # NOQA
-import ai_edge_torch.generative.examples.stable_diffusion.util as util
-from ai_edge_torch.model import TfLiteModel
+import ai_edge_torch
+from ai_edge_torch.generative.examples.stable_diffusion import samplers
+from ai_edge_torch.generative.examples.stable_diffusion import tokenizer
+from ai_edge_torch.generative.examples.stable_diffusion import util
 import numpy as np
 from PIL import Image
-from tqdm import tqdm
+import tqdm
 
 
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument(
@@ -104,12 +104,12 @@ def __init__(
       diffusion_ckpt: str,
       decoder_ckpt: str
   ):
-    self.tokenizer = Tokenizer(tokenizer_vocab_dir)
-    self.clip = TfLiteModel.load(clip_ckpt)
-    self.decoder = TfLiteModel.load(decoder_ckpt)
-    self.diffusion = TfLiteModel.load(diffusion_ckpt)
+    self.tokenizer = tokenizer.Tokenizer(tokenizer_vocab_dir)
+    self.clip = ai_edge_torch.model.TfLiteModel.load(clip_ckpt)
+    self.decoder = ai_edge_torch.model.TfLiteModel.load(decoder_ckpt)
+    self.diffusion = ai_edge_torch.model.TfLiteModel.load(diffusion_ckpt)
     if encoder_ckpt is not None:
-      self.encoder = TfLiteModel.load(encoder_ckpt)
+      self.encoder = ai_edge_torch.model.TfLiteModel.load(encoder_ckpt)
 
 
 def run_tflite_pipeline(
@@ -128,48 +128,32 @@ def run_tflite_pipeline(
 ):
   """Run stable diffusion pipeline with tflite model.
 
-  model:
-
-    StableDiffsuion model.
-  prompt:
-    The prompt to guide the image generation.
-  output_path:
-    The path to the generated output image.
-  uncond_prompt:
-    The prompt not to guide the image generation.
-  cfg_scale:
-    Guidance scale of classifier-free guidance. Higher guidance scale encourages
-    to generate
-    images that are closely linked to the text `prompt`, usually at the expense
-    of lower
-    image quality.
-  height:
-    The height in pixels of the generated image.
-  width:
-    The width in pixels of the generated image.
-  sampler:
-    A sampler to be used to denoise the encoded image latents. Can be one of
-    `k_lms, `k_euler`,
-    or `k_euler_ancestral`.
-  n_inference_steps:
-    The number of denoising steps. More denoising steps usually lead to a higher
-    quality image at the
-    expense of slower inference. This parameter will be modulated by `strength`.
-  seed:
-    A seed to make generation deterministic.
-  strength:
-    Conceptually, indicates how much to transform the reference `input_image`.
-    Must be between 0 and 1.
-    `input_image` will be used as a starting point, adding more noise to it the
-    larger the `strength`.
-    The number of denoising steps depends on the amount of noise initially
-    added. When `strength` is 1,
-    added noise will be maximum and the denoising process will run for the full
-    number of iterations
-    specified in `n_inference_steps`. A value of 1, therefore, essentially
-    ignores `input_image`.
-  input_image:
-    Image which is served as the starting point for the image generation.
+  Args:
+    model: StableDiffusion model.
+    prompt: The prompt to guide the image generation.
+    output_path: The path to the generated output image.
+    uncond_prompt: The prompt not to guide the image generation.
+    cfg_scale: Guidance scale of classifier-free guidance. Higher guidance scale
+      encourages to generate images that are closely linked to the text
+      `prompt`, usually at the expense of lower image quality.
+    height: The height in pixels of the generated image.
+    width: The width in pixels of the generated image.
+    sampler: A sampler to be used to denoise the encoded image latents. Can be
+      one of `k_lms`, `k_euler`, or `k_euler_ancestral`.
+    n_inference_steps: The number of denoising steps. More denoising steps
+      usually lead to a higher quality image at the expense of slower inference.
+      This parameter will be modulated by `strength`.
+    seed: A seed to make generation deterministic.
+    strength: Conceptually, indicates how much to transform the reference
+      `input_image`. Must be between 0 and 1. `input_image` will be used as a
+      starting point, adding more noise to it the larger the `strength`. The
+      number of denoising steps depends on the amount of noise initially added.
+      When `strength` is 1, added noise will be maximum and the denoising
+      process will run for the full number of iterations specified in
+      `n_inference_steps`. A value of 1, therefore, essentially ignores
+      `input_image`.
+    input_image: Image which serves as the starting point for the image
+      generation.
   """
   if not 0 < strength < 1:
     raise ValueError('strength must be between 0 and 1')
@@ -202,7 +186,8 @@ def run_tflite_pipeline(
   context = np.concatenate([cond_context, uncond_context], axis=0)
   noise_shape = (1, 4, height // 8, width // 8)
 
-  # Initialization starts from input_image if any, otherwise, starts from a random sampling.
+  # Initialization starts from input_image if any, otherwise, starts from a
+  # random sampling.
   if input_image:
     if not hasattr(model, 'encoder'):
       raise AttributeError(
@@ -210,7 +195,6 @@ def run_tflite_pipeline(
           ' input_image.'
       )
     input_image = input_image.resize((width, height))
-    input_image_np = np.array(input_image).astype(np.float32)
     input_image_np = util.rescale(input_image, (0, 255), (-1, 1))
     input_image_np = util.move_channel(input_image_np, to='first')
     encoder_noise = np.random.normal(size=noise_shape).astype(np.float32)
@@ -223,8 +207,8 @@ def run_tflite_pipeline(
   latents *= sampler.initial_scale
 
   # Diffusion process.
-  timesteps = tqdm(sampler.timesteps)
-  for i, timestep in enumerate(timesteps):
+  timesteps = tqdm.tqdm(sampler.timesteps)
+  for timestep in timesteps:
     time_embedding = util.get_time_embedding(timestep)
 
     input_latents = latents * sampler.get_input_scale()
@@ -242,7 +226,7 @@ def run_tflite_pipeline(
 
   images = util.rescale(images, (-1, 1), (0, 255), clamp=True)
   images = util.move_channel(images, to='last')
   if not os.path.exists(output_path):
-    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
   Image.fromarray(images[0].astype(np.uint8)).save(output_path)
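
For reference, a minimal usage sketch of the refactored entry points (not part of the patch). It assumes the constructor shown above belongs to the example's `StableDiffusion` class, that `run_tflite_pipeline` accepts the keyword arguments documented in the new docstring with `sampler` passed as one of the documented string names, and that the checkpoint paths below are placeholders for already-converted .tflite artifacts; the exact signature and defaults are not visible in these hunks.

    import PIL.Image

    from ai_edge_torch.generative.examples.stable_diffusion import pipeline

    # Class name and keyword defaults are assumptions; paths are placeholders.
    model = pipeline.StableDiffusion(
        tokenizer_vocab_dir='/tmp/sd/tokenizer',
        clip_ckpt='/tmp/sd/clip.tflite',
        encoder_ckpt='/tmp/sd/encoder.tflite',  # only needed for input_image
        diffusion_ckpt='/tmp/sd/diffusion.tflite',
        decoder_ckpt='/tmp/sd/decoder.tflite',
    )

    # Text-to-image: the encoder checkpoint is never touched on this path.
    pipeline.run_tflite_pipeline(
        model,
        prompt='a photograph of an astronaut riding a horse',
        output_path='/tmp/sd/txt2img.jpg',
        sampler='k_euler',  # or 'k_lms' / 'k_euler_ancestral'
        n_inference_steps=20,
        seed=100,
    )

    # Image-to-image: strength must be strictly between 0 and 1, per the
    # validation at the top of run_tflite_pipeline.
    pipeline.run_tflite_pipeline(
        model,
        prompt='the same scene at sunset',
        output_path='/tmp/sd/img2img.jpg',
        sampler='k_euler',
        n_inference_steps=20,
        seed=100,
        input_image=PIL.Image.open('/tmp/sd/source.jpg'),
        strength=0.8,
    )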