
Merge pull request #4 from mobiusml/basic_video_processing
Basic Video Processing
movchan74 authored Nov 14, 2023
2 parents 7d7ad12 + c665b76 commit 318cf4d
Showing 18 changed files with 1,214 additions and 121 deletions.
18 changes: 16 additions & 2 deletions aana/configs/endpoints.py
@@ -21,9 +21,23 @@
"blip2": [
Endpoint(
name="blip2_generate",
path="/blip2/generate_captions",
summary="Generate captions using BLIP2 OPT-2.7B",
path="/image/generate_captions",
summary="Generate captions for images using BLIP2 OPT-2.7B",
outputs=["captions_hf_blip2_opt_2_7b"],
),
Endpoint(
name="blip2_video_generate",
path="/video/generate_captions",
summary="Generate captions for videos using BLIP2 OPT-2.7B",
outputs=["video_captions_hf_blip2_opt_2_7b"],
),
],
"video": [
Endpoint(
name="video_extract_frames",
path="/video/extract_frames",
summary="Extract frames from a video",
outputs=["timestamps", "duration"],
)
],
}
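
For orientation, a hypothetical client call against the new video route might look like the sketch below. Only the paths and output names come from the endpoint config above; the server address, payload field names, and VideoParams contents are assumptions made purely for illustration.

    import requests  # assumed HTTP client, not part of this commit

    BASE_URL = "http://127.0.0.1:8000"  # assumed local deployment address

    # Assumed payload shape: a list of video inputs plus optional video params.
    payload = {
        "videos": [{"url": "https://example.com/sample.mp4"}],
        "video_params": {},
    }

    resp = requests.post(f"{BASE_URL}/video/generate_captions", json=payload)
    resp.raise_for_status()
    # The endpoint is declared with outputs=["video_captions_hf_blip2_opt_2_7b"],
    # so the captions should appear under that key in the response.
    print(resp.json())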
100 changes: 97 additions & 3 deletions aana/configs/pipeline.py
@@ -3,9 +3,12 @@
It is used to generate the pipeline and the API endpoints.
"""

from aana.models.pydantic.image_input import ImageListInput
from aana.models.pydantic.captions import CaptionsList, VideoCaptionsList
from aana.models.pydantic.image_input import ImageInputList
from aana.models.pydantic.prompt import Prompt
from aana.models.pydantic.sampling_params import SamplingParams
from aana.models.pydantic.video_input import VideoInputList
from aana.models.pydantic.video_params import VideoParams

# container data model
# we don't enforce this data model for now but it's a good reference for writing paths and flatten_by
@@ -17,13 +17,27 @@
# vllm_zephyr_7b_beta_output_stream: str
# vllm_zephyr_7b_beta_output: str
# image_batch: ImageBatch
# video_batch: VideoBatch
#
# class ImageBatch:
# images: list[Image]
#
# class Image:
# image: ImageInput
# caption_hf_blip2_opt_2_7b: str
#
# class VideoBatch:
# videos: list[Video]
# params: VideoParams
# class Video:
# video: VideoInput
# frames: Frame
# timestamps: Timestamps
# duration: float
# class Frame:
# image: Image
# caption_hf_blip2_opt_2_7b: str


# pipeline configuration

@@ -147,7 +164,7 @@
"name": "images",
"key": "images",
"path": "image_batch.images.[*].image",
"data_model": ImageListInput,
"data_model": ImageInputList,
}
],
},
@@ -161,14 +178,91 @@
"name": "images",
"key": "images",
"path": "image_batch.images.[*].image",
"data_model": ImageListInput,
"data_model": ImageInputList,
}
],
"outputs": [
{
"name": "captions_hf_blip2_opt_2_7b",
"key": "captions",
"path": "image_batch.images.[*].caption_hf_blip2_opt_2_7b",
"data_model": CaptionsList,
}
],
},
{
"name": "videos",
"type": "input",
"inputs": [],
"outputs": [
{
"name": "videos",
"key": "videos",
"path": "video_batch.videos.[*].video",
"data_model": VideoInputList,
}
],
},
{
"name": "video_params",
"type": "input",
"inputs": [],
"outputs": [
{
"name": "video_params",
"key": "video_params",
"path": "video_batch.params",
"data_model": VideoParams,
}
],
},
{
"name": "frame_extraction",
"type": "ray_task",
"function": "aana.utils.video.extract_frames_decord",
"batched": True,
"flatten_by": "video_batch.videos.[*]",
"inputs": [
{"name": "videos", "key": "video", "path": "video_batch.videos.[*].video"},
{"name": "video_params", "key": "params", "path": "video_batch.params"},
],
"outputs": [
{
"name": "frames",
"key": "frames",
"path": "video_batch.videos.[*].frames.[*].image",
},
{
"name": "timestamps",
"key": "timestamps",
"path": "video_batch.videos.[*].timestamp",
},
{
"name": "duration",
"key": "duration",
"path": "video_batch.videos.[*].duration",
},
],
},
{
"name": "hf_blip2_opt_2_7b_video",
"type": "ray_deployment",
"deployment_name": "hf_blip2_deployment_opt_2_7b",
"method": "generate_batch",
"flatten_by": "video_batch.videos.[*].frames.[*]",
"inputs": [
{
"name": "frames",
"key": "images",
"path": "video_batch.videos.[*].frames.[*].image",
}
],
"outputs": [
{
"name": "video_captions_hf_blip2_opt_2_7b",
"key": "captions",
"path": "video_batch.videos.[*].frames.[*].caption_hf_blip2_opt_2_7b",
"data_model": VideoCaptionsList,
}
],
},
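
To make the data flow of the new nodes concrete, here is a minimal sketch of the container model described in the comment block above and of the frame-extraction interface the frame_extraction node implies. The class and parameter names below are illustrative assumptions; the diff does not include aana/utils/video.py or the core Video model, so none of this should be read as the committed implementation.

    from __future__ import annotations

    from dataclasses import dataclass
    from typing import Any


    @dataclass
    class Frame:
        # Mirrors the commented container model: one extracted frame plus the
        # caption the BLIP2 deployment fills in later.
        image: Any
        caption_hf_blip2_opt_2_7b: str | None = None


    @dataclass
    class VideoItem:
        # Mirrors the commented "Video" entry: the input video, its extracted
        # frames, their timestamps (seconds), and the total duration.
        video: Any
        frames: list[Frame]
        timestamps: list[float]
        duration: float


    def extract_frames_decord(video: Any, params: Any) -> dict:
        """Sketch of the interface implied by the "frame_extraction" node.

        The node is a batched ray_task flattened over video_batch.videos.[*]
        with declared outputs "frames", "timestamps" and "duration". The
        committed implementation lives in aana.utils.video (not shown in this
        diff) and, judging by the name, decodes frames with decord; this body
        is a placeholder, not the real code.
        """
        raise NotImplementedError("illustrative sketch only")

Downstream, the hf_blip2_opt_2_7b_video node flattens video_batch.videos.[*].frames.[*] and sends the frames through the same hf_blip2_deployment_opt_2_7b used for still images, writing each caption back to caption_hf_blip2_opt_2_7b.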
35 changes: 35 additions & 0 deletions aana/exceptions/general.py
@@ -1,8 +1,10 @@
from mobius_pipeline.exceptions import BaseException
from typing import TYPE_CHECKING


if TYPE_CHECKING:
from aana.models.core.image import Image
from aana.models.core.video import Video


class InferenceException(BaseException):
@@ -94,3 +96,36 @@ def __init__(self, url: str):

def __reduce__(self):
return (self.__class__, (self.url,))


class VideoException(BaseException):
"""
Exception raised when working with videos.
Attributes:
video (Video): the video that caused the exception
"""

def __init__(self, video: "Video"):
"""
Initialize the exception.
Args:
video (Video): the video that caused the exception
"""
super().__init__(video=video)
self.video = video

def __reduce__(self):
return (self.__class__, (self.video,))


class VideoReadingException(VideoException):
"""
Exception raised when there is an error reading a video.
Attributes:
video (Video): the video that caused the exception
"""

pass
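
A brief usage sketch for the new exception types follows. The decode_or_raise helper is hypothetical; the video argument stands for whatever aana.models.core.video.Video instance triggered the failure (that module is only type-imported above, so its constructor is not shown). The __reduce__ overrides presumably keep the exceptions picklable, e.g. when they are raised inside Ray workers.

    from aana.exceptions.general import VideoReadingException


    def decode_or_raise(video):
        # Hypothetical caller: wrap low-level decoding errors in the new
        # exception type so downstream handlers can treat all video-related
        # failures through the VideoException hierarchy.
        try:
            ...  # decode frames here, e.g. via aana.utils.video
        except Exception as exc:
            raise VideoReadingException(video) from exc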