Hey!
I have been trying to optimize the inference pipeline using batch inference, but everything in the pipeline is written to process one image at a time. I managed to create batches of data, but the subsequent step of pasting the upscaled face back onto the original image takes an unusually long time.
My video is 15 seconds long and has 433 frames in total. I am running on Colab with a T4 GPU; the current inference time is 4 min 37 sec.
I am using the ONNX version of the model.
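For context, the ort_session used in the snippets below is an ONNX Runtime inference session, created roughly along these lines (the model path is a placeholder, and the provider order is what I would expect on a T4):

import onnxruntime as ort

# Placeholder model path; prefer the CUDA provider on a T4, fall back to CPU.
ort_session = ort.InferenceSession(
    'codeformer.onnx',
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])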
# -------------------- start to processing ---------------------
# No need to optimize this loop, done in 37 secs on T4.
# Convert the input_img_list to a tensor for batching.
def process_batch_for_inference(cropped_images_list):
    """Function to handle batch inference."""
    # print("Processing the cropped faces into batches for smooth inference!")
    print(f"Number of images being processed: {len(cropped_images_list)}")
    cropped_face_t = img2tensor(cropped_images_list, bgr2rgb=True, float32=True)
    cropped_face_t = torch.stack(cropped_face_t).to(device)
    normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
    print(f"Shape of cropped faces tensor: {cropped_face_t.shape}")

    try:
        forward_time = time.time()
        with torch.no_grad():
            ort_inputs = {ort_session.get_inputs()[0].name: cropped_face_t.cpu().numpy()}
            # print(f"shape of the inputs: {ort_inputs.shape}")
            ort_outs = ort_session.run(None, ort_inputs)
            output = torch.from_numpy(ort_outs[0])
            print(f"shape of the output: {output.shape}")
            restored_face = tensor2img(output, rgb2bgr=True, min_max=(-1, 1))
        assert type(restored_face) == list, f"Output should be a list, got {type(restored_face)}"
        assert all(x.ndim == 3 for x in restored_face), "Image should be 3-dimensional"
        print(f'Inference time: {time.time() - forward_time:.2f}s')
        del output
        torch.cuda.empty_cache()
    except Exception as error:
        print(f'Failed inference for CodeFormer: {error}')
        traceback.print_exc()
        restored_face = tensor2img(cropped_face_t, rgb2bgr=True, min_max=(-1, 1))

    # Rebinding the loop variable would be a no-op on the list; rebuild it instead.
    restored_face = [face.astype('uint8') for face in restored_face]
    for i in range(len(cropped_images_list)):
        face_helper.add_restored_face(restored_face[i], cropped_images_list[i])
This is my forward pass function.
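One thing worth noting about this function: ort_session.run takes NumPy arrays, so every batch makes a GPU -> CPU -> GPU round trip. If that copy turns out to matter, ONNX Runtime's I/O binding can keep the batch on the GPU. A minimal sketch, assuming the model has a single input and output and that the batch tensor is contiguous:

import numpy as np

def run_onnx_on_gpu(session, batch_t):
    # batch_t: contiguous float32 CUDA tensor, e.g. shape (N, 3, 512, 512)
    binding = session.io_binding()
    binding.bind_input(
        name=session.get_inputs()[0].name,
        device_type='cuda', device_id=0,
        element_type=np.float32,
        shape=tuple(batch_t.shape),
        buffer_ptr=batch_t.data_ptr())
    # Let ONNX Runtime allocate the output buffer on the GPU.
    binding.bind_output(session.get_outputs()[0].name, device_type='cuda')
    session.run_with_iobinding(binding)
    return binding.copy_outputs_to_cpu()[0]  # host NumPy array

Since the forward pass is not my main bottleneck, this is more of a nice-to-have than the fix I'm after.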
# Process each image in the current batch
imgs = []
all_cropped_images_list = []
affine_matrices = []
for i, img_path in enumerate(input_img_list):
    # clean all the intermediate results to process the next image
    face_helper.clean_all()

    if isinstance(img_path, str):
        img_name = os.path.basename(img_path)
        basename, ext = os.path.splitext(img_name)
        print(f'[{i+1}/{test_img_num}] Processing: {img_name}')
        img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    else:  # for video processing
        basename = str(i).zfill(6)
        img_name = f'{video_name}_{basename}' if input_video else basename
        print(f'[{i+1}/{test_img_num}] Processing: {img_name}')
        img = img_path

    if args.has_aligned:
        # the input faces are already cropped and aligned
        img = cv2.resize(img, (512, 512), interpolation=cv2.INTER_LINEAR)
        face_helper.is_gray = is_gray(img, threshold=10)
        if face_helper.is_gray:
            print('Grayscale input: True')
        face_helper.cropped_faces = [img]
    else:
        face_helper.read_image(img)
        imgs.append(face_helper.input_img)
        # get face landmarks for each face
        num_det_faces = face_helper.get_face_landmarks_5(
            only_center_face=args.only_center_face, resize=640, eye_dist_threshold=5)
        print(f'\tdetect {num_det_faces} faces')
        # align and warp each face
        # print(face_helper.all_landmarks_5)
        face_helper.align_warp_face()
        # clean_all() wipes the helper's state, so keep each frame's affine matrix
        affine_matrices.append(face_helper.affine_matrices[0])

    for idx, cropped_face in enumerate(face_helper.cropped_faces):
        all_cropped_images_list.append(cropped_face / 255.)

face_helper.affine_matrices = affine_matrices
print(len(affine_matrices))
print(len(face_helper.affine_matrices))
# Start of processing
print(f"Processing batches of images for cropping out faces")
batched_input_img_list = []
batch_num = 0
for cropped_images in all_cropped_images_list:
    batched_input_img_list.append(cropped_images)
    if len(batched_input_img_list) == args.batch_size:
        # Batch inference
        batch_num += 1
        print(f"Processing batch no {batch_num}")
        process_batch_for_inference(batched_input_img_list)
        # Clear the batched input list for the next batch
        batched_input_img_list = []

if len(batched_input_img_list) > 0 and len(batched_input_img_list) < args.batch_size:
    batch_num += 1
    print(f"Processing last batch {batch_num}")
    # Batch inference for the last (partial) batch
    process_batch_for_inference(batched_input_img_list)
    batched_input_img_list = []
This is how I create the batches to be processed.
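As an aside, the accumulate-and-flush loop above can be collapsed into plain list slicing, which also handles the final partial batch automatically (same names as above):

for batch_num, start in enumerate(range(0, len(all_cropped_images_list), args.batch_size), start=1):
    batch = all_cropped_images_list[start:start + args.batch_size]
    print(f"Processing batch no {batch_num} ({len(batch)} images)")
    process_batch_for_inference(batch)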
# restoring back the faces on the images.
print(f"length of imgs {len(imgs)}")
# for img in imgs:
# paste_back
if not args.has_aligned:
    # upsample the background
    if bg_upsampler is not None:
        # Now only support RealESRGAN for upsampling background
        # NOTE: 'img' here is still the last frame from the loop above
        bg_img = bg_upsampler.enhance(img, outscale=args.upscale)[0]
    else:
        bg_img = None
    # print(f"shape of affine matrices {len(face_helper.affine_matrices)}")
    face_helper.get_inverse_affine(None)
    # paste each restored face to the input image
    if args.face_upsample and face_upsampler is not None:
        restored_img = face_helper.paste_faces_to_input_image(
            upsample_img=bg_img, draw_box=args.draw_box, face_upsampler=face_upsampler)
    else:
        print(f"Length of restored faces {len(face_helper.restored_faces)}")
        print(f"Length of inverse affine matrices {len(face_helper.inverse_affine_matrices)}")
        if bg_img is not None:
            print(f"Shape of bg_img: {bg_img.shape}")
        restored_imgs = face_helper.paste_faces_to_input_image(
            upsample_img=bg_img, draw_box=args.draw_box)

assert type(restored_imgs) == list, f"Output should be a list, got {type(restored_imgs)}"
# save faces
assert len(all_cropped_images_list) == len(restored_imgs), \
    "Length of cropped faces and restored faces should be the same"
for idx, (cropped_face, restored_face) in enumerate(zip(all_cropped_images_list, restored_imgs)):
    # save cropped face
    if not args.has_aligned:
        save_crop_path = os.path.join(result_root, 'cropped_faces', f'{basename}_{idx:02d}.png')
        imwrite(cropped_face, save_crop_path)
    # save restored face
    if args.has_aligned:
        save_face_name = f'{basename}.png'
    else:
        save_face_name = f'{basename}_{idx:02d}.png'
    if args.suffix is not None:
        save_face_name = f'{save_face_name[:-4]}_{args.suffix}.png'
    save_restore_path = os.path.join(result_root, 'restored_faces', save_face_name)
    imwrite(restored_face, save_restore_path)

for restored_img in restored_imgs:
    # save restored img (without mutating basename inside the loop)
    if not args.has_aligned and restored_img is not None:
        save_name = f'{basename}_{args.suffix}' if args.suffix is not None else basename
        save_restore_path = os.path.join(result_root, 'final_results', f'{save_name}.png')
        imwrite(restored_img, save_restore_path)
And for some unknown reason this particular part takes around 2 to 3 minutes to complete. Has anyone worked out a faster pipeline for videos? Any suggestions for optimizing this? I'd appreciate the help. Thanks!
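In case it helps anyone reproduce this, a minimal way to see where those 2 to 3 minutes go is to time the two main stages of the paste-back separately; my suspicion is bg_upsampler.enhance, since RealESRGAN upsamples every full frame:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Print the wall-clock time spent inside the with-block.
    start = time.perf_counter()
    yield
    print(f'{label}: {time.perf_counter() - start:.2f}s')

# Wrapped around the calls from the snippet above:
with timed('background upsampling'):
    bg_img = bg_upsampler.enhance(img, outscale=args.upscale)[0]
with timed('paste faces'):
    restored_imgs = face_helper.paste_faces_to_input_image(
        upsample_img=bg_img, draw_box=args.draw_box)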