anotherjesse · anaibol · Mar 24, 2023 · Mar 24, 2023 · Mar 24, 2023 · Mar 24, 2023
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -1 +1,5 @@
-weights
+weights
+.vscode
+.cog
+__pycache__/
+diffusers-cache
diff --git a/Dockerfile b/Dockerfile
diff --git a/README.md b/README.md
@@ -1,3 +1,9 @@
+# To get the weights
+
+```sh
+curl -L -o output.zip https://replicate.delivery/pbxt/4lrw9387HOaKKR7kHv5YX4N59OSP0ltIcfaYWdm2lHAGVCBIA/output.zip
+mkdir weights
+unzip output.zip -d weights
+```
+
 # dreambooth-builder
 
 This is a prototype to build a replicate model using existing replicate models as the base.
@@ -8,7 +14,7 @@ These concepts might be incorporated into cog/replicate's [dreambooth api](https
 
 Using `cog push` to build and push a model many times results in a lot of repeated work (downloading, building, ...), even though each model you build technically differs only in its weights.
 
-if we could take an existing popular stable diffusion model, and throw our weights and any customization to predict.py - our image should be much smaller/faster builds/... 
+If we could take an existing popular Stable Diffusion model and add just our weights and any customization to predict.py, our image should be much smaller, with faster builds, etc.
 
 we only need to upload our changes (weights & predict.py)
 

diff --git a/cog.yaml b/cog.yaml
@@ -0,0 +1,27 @@
+# Cog configuration for the predictor implemented in predict.py.
+build:
+  gpu: true
+  cuda: "11.7"
+  python_version: "3.10"
+  # OS-level libraries; presumably the shared objects opencv-python needs
+  # at import time -- TODO confirm.
+  system_packages:
+    - "libgl1-mesa-glx"
+    - "libglib2.0-0"
+  # Every Python dependency below is pinned to an exact version.
+  python_packages:
+    - "ipython==8.10.0"
+    - "Pillow==9.4.0"
+    - "numpy==1.24.2"
+    - "tqdm==4.64.1"
+    - "torch==1.12.1"
+    - "torchvision==0.13.1"
+    - "ftfy==6.1.1"
+    - "diffusers==0.11.1"
+    - "scipy==1.9.3"
+    - "transformers==4.25.1"
+    - "accelerate==0.15.0"
+    - "huggingface-hub==0.13.2"
+    - "opencv-python==4.7.0.68"
+
+  # Real-ESRGAN is installed straight from its Git repository; no revision
+  # is pinned, so the build picks up whatever the default branch holds.
+  run:
+    - pip install git+https://github.com/sberbank-ai/Real-ESRGAN.git
+
+# Entry point: the Predictor class in predict.py.
+predict: "predict.py:Predictor"
+# Name given to the built image (includes the registry to push to).
+image: "r8.im/andreasjansson/stable-diffusion-inpainting"
diff --git a/masks/.DS_Store b/masks/.DS_Store
diff --git a/masks/desktop-mask.jpg b/masks/desktop-mask.jpg
diff --git a/masks/mask.png b/masks/mask.png
diff --git a/masks/mask2.png b/masks/mask2.png
diff --git a/masks/that_would_be_great_mask.png b/masks/that_would_be_great_mask.png
diff --git a/memes/.DS_Store b/memes/.DS_Store
diff --git a/memes/desktop.jpg b/memes/desktop.jpg
diff --git a/memes/looking-back.png b/memes/looking-back.png
diff --git a/memes/person.png b/memes/person.png
diff --git a/memes/that_would_be_great.png b/memes/that_would_be_great.png
diff --git a/memes/tyrone_biggums.jpg b/memes/tyrone_biggums.jpg
diff --git a/predict.py b/predict.py
@@ -1,8 +1,11 @@
 import json
 import os
+from io import BytesIO
 from typing import List
 
 import torch
+import torch.nn as nn
+import numpy as np
 from cog import BasePredictor, Input, Path
 from diffusers import (
     StableDiffusionPipeline,
@@ -14,13 +17,19 @@
     EulerAncestralDiscreteScheduler,
     DPMSolverMultistepScheduler,
 )
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
 from diffusers.pipelines.stable_diffusion.safety_checker import (
     StableDiffusionSafetyChecker,
 )
 from PIL import Image
 from transformers import CLIPFeatureExtractor
+from RealESRGAN import RealESRGAN
 
+# PIL.ImageOps is imported explicitly for invert(), which is applied to the inpainting mask.
+import PIL.ImageOps
 
+MODEL_ID = "stabilityai/stable-diffusion-2-1"
+MODEL_CACHE = "diffusers-cache"
 SAFETY_MODEL_CACHE = "diffusers-cache"
 SAFETY_MODEL_ID = "CompVis/stable-diffusion-safety-checker"
 
@@ -44,9 +53,20 @@
 if not DEFAULT_PROMPT:
     DEFAULT_PROMPT = "a photo of an astronaut riding a horse on mars"
 
+SAFETY_MODEL_CACHE = "diffusers-cache"
+SAFETY_MODEL_ID = "CompVis/stable-diffusion-safety-checker"
+
 
 class Predictor(BasePredictor):
     def setup(self):
+        self.models = {}
+
+        for scale in [2, 4, 8]:
+            self.models[scale] = RealESRGAN("cuda", scale=scale)
+            self.models[scale].load_weights(
+                f"weights/RealESRGAN_x{scale}.pth", download=False
+            )
+
         """Load the model into memory to make running multiple predictions efficient"""
         print("Loading Safety pipeline...")
         self.safety_checker = StableDiffusionSafetyChecker.from_pretrained(
@@ -77,6 +97,19 @@ def setup(self):
             feature_extractor=self.txt2img_pipe.feature_extractor,
         ).to("cuda")
 
+        # Add this setup code for inpainting_pipe
+        print("Loading Inpainting pipeline...")
+        self.inpainting_pipe = StableDiffusionInpaintPipelineLegacy(
+            vae=self.img2img_pipe.vae,
+            text_encoder=self.img2img_pipe.text_encoder,
+            tokenizer=self.img2img_pipe.tokenizer,
+            unet=self.img2img_pipe.unet,
+            scheduler=self.img2img_pipe.scheduler,
+            safety_checker=self.img2img_pipe.safety_checker,
+            feature_extractor=self.img2img_pipe.feature_extractor,
+        ).to("cuda")
+
+    # predict() switches between txt2img, img2img and inpaint via `mode`; `mask` and `invert_mask` are used only in inpaint mode.
     @torch.inference_mode()
     def predict(
         self,
@@ -94,12 +127,14 @@ def predict(
         ),
         width: int = Input(
             description="Width of output image. Maximum size is 1024x768 or 768x1024 because of memory limits",
-            choices=[128, 256, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024],
+            choices=[128, 256, 384, 448, 512, 576,
+                     640, 704, 768, 832, 896, 960, 1024],
             default=DEFAULT_WIDTH,
         ),
         height: int = Input(
             description="Height of output image. Maximum size is 1024x768 or 768x1024 because of memory limits",
-            choices=[128, 256, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024],
+            choices=[128, 256, 384, 448, 512, 576,
+                     640, 704, 768, 832, 896, 960, 1024],
             default=DEFAULT_HEIGHT,
         ),
         prompt_strength: float = Input(
@@ -131,30 +166,80 @@ def predict(
             description="Choose a scheduler",
         ),
         seed: int = Input(
-            description="Random seed. Leave blank to randomize the seed", default=None
+            description="Random seed. Leave blank to randomize the seed",
+            default=None
         ),
         disable_safety_check: bool = Input(
-            description="Disable safety check. Use at your own risk!", default=False
+            description="Disable safety check. Use at your own risk!",
+            default=True
+        ),
+        mode: str = Input(
+            description="Choose the mode of operation: 'txt2img', 'img2img', or 'inpaint'.",
+            choices=["txt2img", "img2img", "inpaint"],
+            default="inpaint",
+        ),
+        mask: Path = Input(
+            description="Black and white image to use as mask. Required only in 'inpaint' mode. White pixels are inpainted and black pixels are preserved.",
+            default=None,
+        ),
+        invert_mask: bool = Input(
+            description="If this is true, then black pixels are inpainted and white pixels are preserved. Used only in 'inpaint' mode.",
+            default=False,
+        ),
+        upscale: int = Input(
+            choices=[2, 4, 8],
+            description="Upscaling factor",
+            default=4
         ),
     ) -> List[Path]:
         """Run a single prediction on the model"""
         if seed is None:
             seed = int.from_bytes(os.urandom(2), "big")
         print(f"Using seed: {seed}")
 
+        model = self.models[upscale]
+
         if width * height > 786432:
             raise ValueError(
                 "Maximum size is 1024x768 or 768x1024 pixels, because of memory limits. Please select a lower width or height."
             )
 
-        if image is not None:
+        if mode == "inpaint":
+            print("using inpaint")
+            pipe = self.inpainting_pipe
+
+            # Load the mask image and invert it if needed
+            image = Image.open(image).convert("RGB")
+            mask = Image.open(mask).convert("RGB")
+
+            if invert_mask:
+                mask = PIL.ImageOps.invert(mask)
+
+            if image.width % 8 != 0 or image.height % 8 != 0:
+                if mask.size == image.size:
+                    mask = crop(mask)
+                image = crop(image)
+
+            if mask.size != image.size:
+                print(
+                    f"WARNING: Mask size ({mask.width}, {mask.height}) is different to image size ({image.width}, {image.height}). Mask will be resized to image size."
+                )
+                mask = mask.resize(image.size)
+
+            extra_kwargs = {
+                "init_image": image,
+                "mask_image": mask,
+            }
+
+        if mode == "img2img":
             print("using img2img")
             pipe = self.img2img_pipe
             extra_kwargs = {
                 "image": Image.open(image).convert("RGB"),
                 "strength": prompt_strength,
             }
-        else:
+
+        if mode == "txt2img":
             print("using txt2img")
             pipe = self.txt2img_pipe
             extra_kwargs = {
@@ -172,23 +257,47 @@ def predict(
             pipe.safety_checker = self.safety_checker
 
         output = pipe(
-            prompt=[prompt] * num_outputs if prompt is not None else None,
-            negative_prompt=[negative_prompt] * num_outputs
-            if negative_prompt is not None
-            else None,
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_outputs,
             guidance_scale=guidance_scale,
             generator=generator,
             num_inference_steps=num_inference_steps,
             **extra_kwargs,
         )
 
+        # samples = [
+        # output.images[i]
+        # for i, nsfw_flag in enumerate(output.nsfw_content_detected)
+        # if not nsfw_flag
+        # ]
+
+        # if len(samples) == 0:
+        #     raise Exception(
+        #         f"NSFW content detected. Try running it again, or try a different prompt."
+        #     )
+
+        # if num_outputs > len(samples):
+        #     print(
+        #         f"NSFW content detected in {num_outputs - len(samples)} outputs, returning the remaining {len(samples)} images."
+        #     )
+
         output_paths = []
-        for i, sample in enumerate(output.images):
-            if output.nsfw_content_detected and output.nsfw_content_detected[i]:
-                continue
+        for i, image in enumerate(output.images):
+            if mode == "inpaint":
+                output_path = f"/tmp/out-{i}.png"
+                sr_image = model.predict(image)
+                sr_image.save(output_path)
+            if mode == "txt2img":
+                output_path = f"/tmp/out-{i}.png"
+                image.save(output_path)
+            if mode == "img2img":
+                output_path = f"/tmp/out-img2img-{i}.png"
+                sr_image = model.predict(image)
+                sr_image.save(output_path)
 
-            output_path = f"/tmp/out-{i}.png"
-            sample.save(output_path)
+            print(f"Creating output path: {mode}")
+            print(f"Creating output path: {output_path}")
             output_paths.append(Path(output_path))
 
         if len(output_paths) == 0:
@@ -208,3 +317,14 @@ def make_scheduler(name, config):
         "K_EULER_ANCESTRAL": EulerAncestralDiscreteScheduler.from_config(config),
         "DPMSolverMultistep": DPMSolverMultistepScheduler.from_config(config),
     }[name]
+
+
+def crop(image):
+    """Center-crop ``image`` so both dimensions are multiples of 8.
+
+    The surplus pixels (at most 7 per axis) are trimmed as evenly as
+    possible from opposite edges; when the surplus is odd, the extra
+    pixel is removed from the right/bottom side.
+
+    Args:
+        image: Object exposing ``width``, ``height`` and
+            ``crop((left, top, right, bottom))``; the caller in this
+            file passes PIL images.
+
+    Returns:
+        Whatever ``image.crop`` returns for the computed box.
+    """
+    # The remainder modulo 8 is exactly the number of pixels to discard.
+    surplus_w = image.width % 8
+    surplus_h = image.height % 8
+    x0 = surplus_w // 2
+    y0 = surplus_h // 2
+    x1 = x0 + (image.width - surplus_w)
+    y1 = y0 + (image.height - surplus_h)
+    return image.crop((x0, y0, x1, y1))
diff --git a/prompts.txt b/prompts.txt
@@ -0,0 +1,19 @@
+cog predict -i prompt="hairy face of a <1> man looking back over left shoulder, Distracted Boyfriend Meme" -i negative_prompt="Hand, 3d render, comic, bad art, body out of frame, cloned face, cross-eye, deformed, disfigured, extra fingers, extra limbs, fused fingers, malformed limbs, mutated hands, mutation, out of frame, Photoshop, plastic, poorly drawn face, poorly drawn feet, poorly drawn hands, tiling, too many fingers, ugly, video game, text, signature, watermark, letters" -i [email protected] -i [email protected]
+
+cog predict -i prompt="hairy detailed realistic meme of <1> man looking straight ahead, hyper detail, octane, perfect eyes, natural lips, face fits perfect into mask area, professional blending, clean lines, beautiful face, skin pores" -i negative_prompt="Hand, crooked, bad eyes, outside mask area, looking sideways, 3d render, comic, bad art, body out of frame, cloned face, cross-eye, deformed, disfigured, extra fingers, extra limbs, fused fingers, malformed limbs, mutated hands, mutation, out of frame, Photoshop, plastic, poorly drawn face, poorly drawn feet, poorly drawn hands, tiling, too many fingers, ugly, video game, text, signature, watermark, letters" -i [email protected] -i [email protected] -i num_inference_steps=60 -i guidance_scale=8.5
+
+cog predict -i prompt="clean up any weird artifacts" -i [email protected] -i prompt_strength=0.8 -i num_inference_steps=90 -i guidance_scale=8.5
+
+docker run -ti r8.im/bluematter/e3b559f0-c82d-11ed-b78f-cd99e16426bb@sha256:e2073ab4b2879c6f18f7ff1c6c2649a6e1c4cd14f659783da63be34b37514f33 /bin/bash
+
+cog predict -i prompt="a meme of <1> as distracted boyfriend meme looking back" -i negative_prompt="Hand, crooked, bad eyes, outside mask area, looking sideways, 3d render, comic, bad art, body out of frame, cloned face, cross-eye, deformed, disfigured, extra fingers, extra limbs, fused fingers, malformed limbs, mutated hands, mutation, out of frame, Photoshop, plastic, poorly drawn face, poorly drawn feet, poorly drawn hands, tiling, too many fingers, ugly, video game, text, signature, watermark, letters" -i [email protected] -i [email protected] -i num_inference_steps=60 -i guidance_scale=8.5
+
+// that would be great
+cog predict -i prompt="a <1> in the jungle" -i negative_prompt="Hand, crooked, bad eyes, outside mask area, looking sideways, 3d render, comic, bad art, body out of frame, cloned face, cross-eye, deformed, disfigured, extra fingers, extra limbs, fused fingers, malformed limbs, mutated hands, mutation, out of frame, Photoshop, plastic, poorly drawn face, poorly drawn feet, poorly drawn hands, tiling, too many fingers, ugly, video game, text, signature, watermark, letters" -i image=@memes/that_would_be_great.png -i mask=@masks/that_would_be_great_mask.png -i num_inference_steps=60 -i guidance_scale=8.5
+
+// professional
+cog predict -i prompt="centered closeup portrait of a meme of a <1> man in a professional setting wearing business casual, centered in frame, beautiful modern office, looks exactly like <1> man, photorealistc headshot, 100mm closeup DSLR photo by a professional photographer, ultra realistic pores, full face looking at camera, chest up, natural and cinematic lighting, golden hour" -i negative_prompt="selfie, Hand, crooked, bad eyes, outside mask area, looking sideways, 3d render, comic, bad art, body out of frame, cloned face, cross-eye, deformed, disfigured, extra fingers, extra limbs, fused fingers, malformed limbs, mutated hands, mutation, out of frame, Photoshop, plastic, poorly drawn face, poorly drawn feet, poorly drawn hands, tiling, too many fingers, ugly, video game, text, signature, watermark, letters" -i image=@memes/that_would_be_great.png -i mask=@masks/that_would_be_great_mask.png -i mode="txt2img" -i num_inference_steps=60 -i guidance_scale=6.5
+
+cog predict -i prompt="keep the original image with minor improvements specifically on <1> man face" -i [email protected] -i mode="img2img" -i num_inference_steps=60 -i guidance_scale=6.5
+
+cog predict -i prompt="hairy <1> man posing for a photo with original background, cinematic, hyper real pores" -i invert_mask=True -i [email protected] -i mode="img2img" -i [email protected] -i num_inference_steps=30 -i guidance_scale=4.5 -i prompt_strength=0.35
diff --git a/script/download-weights b/script/download-weights
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+"""Load the Real-ESRGAN weight files for every supported upscaling factor.
+
+Each model is asked to load ``weights/RealESRGAN_x{scale}.pth`` with
+``download=True`` (presumably fetching the file when it is not present
+locally -- confirm against the RealESRGAN package).
+"""
+
+from RealESRGAN import RealESRGAN
+
+# Same factors that predict.py loads at setup time.
+for scale in (2, 4, 8):
+    weights_path = f"weights/RealESRGAN_x{scale}.pth"
+    model = RealESRGAN("cuda", scale=scale)
+    model.load_weights(weights_path, download=True)
+
+# #!/usr/bin/env python
+
+# from predict import MODEL_CACHE, MODEL_ID, SAFETY_MODEL_ID
+# import os
+# import sys
+# import shutil
+# import torch
+
+# from diffusers import StableDiffusionPipeline
+# from diffusers.pipelines.stable_diffusion.safety_checker import \
+#     StableDiffusionSafetyChecker
+
+# # append project directory to path so predict.py can be imported
+# sys.path.append('.')
+# # sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+# # Create directories for both models
+# os.makedirs(MODEL_CACHE, exist_ok=True)
+# os.makedirs("diffusers-cache", exist_ok=True)
+
+# # Download weights for the safety checker model
+# safety_checker = StableDiffusionSafetyChecker.from_pretrained(
+#     SAFETY_MODEL_ID,
+#     cache_dir=MODEL_CACHE,
+# )
+
+# # Download weights for the main inpainting model from the first repository
+# pipe1 = StableDiffusionPipeline.from_pretrained(
+#     MODEL_ID,
+#     cache_dir=MODEL_CACHE,
+# )
+
+# # Download weights for the main inpainting model from the second repository
+# pipe2 = StableDiffusionPipeline.from_pretrained(
+#     "runwayml/stable-diffusion-inpainting",
+#     cache_dir="diffusers-cache",
+#     revision="fp16",
+#     torch_dtype=torch.float16,
+#     use_auth_token=sys.argv[1],
+# )