From a4c9cc15cfa03ba9c9bd43dde1f95ff363cd792a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?The=CC=81o=20Monnom?= <theo.monnom@outlook.com>
Date: Wed, 25 Oct 2023 14:39:35 -0700
Subject: [PATCH] wip python allocated video frames

---
 livekit-rtc/livekit/rtc/video_frame.py | 361 ++++++++++++++++++-------
 1 file changed, 266 insertions(+), 95 deletions(-)

diff --git a/livekit-rtc/livekit/rtc/video_frame.py b/livekit-rtc/livekit/rtc/video_frame.py
index 3be238ec..9ddd47d5 100644
--- a/livekit-rtc/livekit/rtc/video_frame.py
+++ b/livekit-rtc/livekit/rtc/video_frame.py
@@ -13,11 +13,13 @@
 # limitations under the License.
 
 import ctypes
+from typing import Union
 
 from ._ffi_client import FfiHandle, ffi_client
 from ._proto import ffi_pb2 as proto_ffi
 from ._proto import video_frame_pb2 as proto_video_frame
-from ._proto.video_frame_pb2 import VideoFormatType, VideoFrameBufferType, VideoRotation
+from ._proto.video_frame_pb2 import VideoFormatType, VideoFrameBufferType, VideoFrameReceived, VideoRotation
+from abc import ABC
 
 
 class VideoFrame:
@@ -29,30 +31,38 @@ def __init__(self, timestamp_us: int,
         self.rotation = rotation
 
 
-class VideoFrameBuffer:
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        self._info = owned_info.info
-        self._ffi_handle = FfiHandle(owned_info.handle.id)
+class VideoFrameBuffer(ABC):
+
+    def __init__(self,
+                 data: bytearray,
+                 width: int,
+                 height: int,
+                 buffer_type: VideoFrameBufferType.ValueType) -> None:
+        self._data = data
+        self._width = width
+        self._height = height
+        self._buffer_type = buffer_type
 
     @property
     def width(self) -> int:
-        return self._info.width
+        return self._width
 
     @property
     def height(self) -> int:
-        return self._info.height
+        return self._height
 
     @property
     def type(self) -> VideoFrameBufferType.ValueType:
-        return self._info.buffer_type
+        return self._buffer_type
 
+    # TODO(theomonnom): Need Rust modification
     def to_i420(self) -> 'I420Buffer':
         req = proto_ffi.FfiRequest()
         req.to_i420.yuv_handle = self._ffi_handle.handle
-
         resp = ffi_client.request(req)
         return I420Buffer(resp.to_i420.buffer)
 
+    # TODO(theomonnom): Need Rust modification
     def to_argb(self, dst: 'ArgbFrame') -> None:
         req = proto_ffi.FfiRequest()
         req.to_argb.buffer_handle = self._ffi_handle.handle
@@ -61,11 +71,10 @@ def to_argb(self, dst: 'ArgbFrame') -> None:
         req.to_argb.dst_stride = dst.width * 4
         req.to_argb.dst_width = dst.width
         req.to_argb.dst_height = dst.height
-
         ffi_client.request(req)
 
     @staticmethod
-    def create(owned_info: proto_video_frame.OwnedVideoFrameBuffer) \
+    def _from_owned_info(owned_info: proto_video_frame.OwnedVideoFrameBuffer) \
             -> 'VideoFrameBuffer':
         """
         Create the right class instance from the VideoFrameBufferInfo
@@ -92,155 +101,317 @@ def create(owned_info: proto_video_frame.OwnedVideoFrameBuffer) \
 
 # TODO(theomonnom): Ability to get GPU texture directly
 class NativeVideoFrameBuffer(VideoFrameBuffer):
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        super().__init__(owned_info)
+    def __init__(self, width: int, height: int) -> None:
+        super().__init__(bytearray(), width, height, VideoFrameBufferType.NATIVE)
 
 
-class PlanarYuvBuffer(VideoFrameBuffer):
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        super().__init__(owned_info)
+class PlanarYuvBuffer(VideoFrameBuffer, ABC):
+    def __init__(self,
+                 data: bytearray,
+                 width: int,
+                 height: int,
+                 buffer_type: VideoFrameBufferType.ValueType,
+                 stride_y: int,
+                 stride_u: int,
+                 stride_v: int,
+                 chroma_width: int,
+                 chroma_height: int) -> None:
+        super().__init__(data, width, height, buffer_type)
+        self._stride_y = stride_y
+        self._stride_u = stride_u
+        self._stride_v = stride_v
+        self._chroma_width = chroma_width
+        self._chroma_height = chroma_height
 
     @property
     def chroma_width(self) -> int:
-        return self._info.yuv.chroma_width
+        return self._chroma_width
 
     @property
     def chroma_height(self) -> int:
-        return self._info.yuv.chroma_height
+        return self._chroma_height
 
     @property
     def stride_y(self) -> int:
-        return self._info.yuv.stride_y
+        return self._stride_y
 
     @property
     def stride_u(self) -> int:
-        return self._info.yuv.stride_u
+        return self._stride_u
 
     @property
     def stride_v(self) -> int:
-        return self._info.yuv.stride_v
+        return self._stride_v
 
 
-class PlanarYuv8Buffer(PlanarYuvBuffer):
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        super().__init__(owned_info)
+class PlanarYuv8Buffer(PlanarYuvBuffer, ABC):
+    def __init__(self,
+                 data: bytearray,
+                 width: int,
+                 height: int,
+                 buffer_type: VideoFrameBufferType.ValueType,
+                 stride_y: int,
+                 stride_u: int,
+                 stride_v: int,
+                 chroma_width: int,
+                 chroma_height: int) -> None:
+        super().__init__(data, width, height, buffer_type, stride_y,
+                         stride_u, stride_v, chroma_width, chroma_height)
 
     @property
-    def data_y(self) -> ctypes.Array[ctypes.c_uint8]:
-        arr = ctypes.cast(self._info.yuv.data_y_ptr, ctypes.POINTER(
-            ctypes.c_uint8 * (self._info.yuv.stride_y * self._info.height))).contents
-        return arr
+    def data_y(self) -> memoryview:
+        return memoryview(self._data)[0:self._stride_y * self._height]
 
     @property
-    def data_u(self) -> ctypes.Array[ctypes.c_uint8]:
-        arr = ctypes.cast(self._info.yuv.data_u_ptr, ctypes.POINTER(
-            ctypes.c_uint8 * (self._info.yuv.stride_u *
-                              self._info.yuv.chroma_height))).contents
-        return arr
+    def data_u(self) -> memoryview:
+        return memoryview(self._data)[self._stride_y * self._height:
+                                      self._stride_y * self._height +
+                                      self._stride_u * self._chroma_height]
 
     @property
-    def data_v(self) -> ctypes.Array[ctypes.c_uint8]:
-        arr = ctypes.cast(self._info.yuv.data_v_ptr, ctypes.POINTER(
-            ctypes.c_uint8 * (self._info.yuv.stride_v *
-                              self._info.yuv.chroma_height))).contents
-        return arr
+    def data_v(self) -> memoryview:
+        return memoryview(self._data)[self._stride_y * self._height +
+                                      self._stride_u * self._chroma_height:
+                                      self._stride_y * self._height +
+                                      self._stride_u * self._chroma_height +
+                                      self._stride_v * self._chroma_height]
 
 
-class PlanarYuv16Buffer(PlanarYuvBuffer):
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        super().__init__(owned_info)
+class PlanarYuv16Buffer(PlanarYuvBuffer, ABC):
+    def __init__(self,
+                 data: bytearray,
+                 width: int,
+                 height: int,
+                 buffer_type: VideoFrameBufferType.ValueType,
+                 stride_y: int,
+                 stride_u: int,
+                 stride_v: int,
+                 chroma_width: int,
+                 chroma_height: int) -> None:
+        super().__init__(data, width, height, buffer_type, stride_y,
+                         stride_u, stride_v, chroma_width, chroma_height)
 
     @property
-    def data_y(self) -> ctypes.Array[ctypes.c_uint16]:
-        arr = ctypes.cast(self._info.yuv.data_y_ptr, ctypes.POINTER(
-            ctypes.c_uint16 * (self._info.yuv.stride_y // 2 *
-                               self._info.height))).contents
-        return arr
+    def data_y(self) -> memoryview:
+        return memoryview(self._data)[0:self._stride_y * self._height].cast('H')
 
     @property
-    def data_u(self) -> ctypes.Array[ctypes.c_uint16]:
-        arr = ctypes.cast(self._info.yuv.data_u_ptr, ctypes.POINTER(
-            ctypes.c_uint16 * (self._info.yuv.stride_u // 2 *
-                               self._info.yuv.chroma_height))).contents
-        return arr
+    def data_u(self) -> memoryview:
+        return memoryview(self._data)[self._stride_y * self._height:
+                                      self._stride_y * self._height +
+                                      self._stride_u * self._chroma_height].cast('H')
 
     @property
-    def data_v(self) -> ctypes.Array[ctypes.c_uint16]:
-        arr = ctypes.cast(self._info.yuv.data_v_ptr, ctypes.POINTER(
-            ctypes.c_uint16 * (self._info.yuv.stride_v // 2 *
-                               self._info.yuv.chroma_height))).contents
-        return arr
+    def data_v(self) -> memoryview:
+        return memoryview(self._data)[self._stride_y * self._height +
+                                      self._stride_u * self._chroma_height:
+                                      self._stride_y * self._height +
+                                      self._stride_u * self._chroma_height +
+                                      self._stride_v * self._chroma_height].cast('H')
 
 
-class BiplanaraYuv8Buffer(VideoFrameBuffer):
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        super().__init__(owned_info)
+class BiplanaraYuv8Buffer(VideoFrameBuffer, ABC):
+    def __init__(self,
+                 data: bytearray,
+                 width: int,
+                 height: int,
+                 buffer_type: VideoFrameBufferType.ValueType,
+                 stride_y: int,
+                 stride_uv: int,
+                 chroma_width: int,
+                 chroma_height: int) -> None:
+        super().__init__(data, width, height, buffer_type)
+        self._stride_y = stride_y
+        self._stride_uv = stride_uv
+        self._chroma_width = chroma_width
+        self._chroma_height = chroma_height
 
     @property
-    def data_y(self) -> ctypes.Array[ctypes.c_uint8]:
-        arr = ctypes.cast(self._info.bi_yuv.data_y_ptr, ctypes.POINTER(
-            ctypes.c_uint8 * (self._info.bi_yuv.stride_y * self._info.height))).contents
-        return arr
+    def chroma_width(self) -> int:
+        return self._chroma_width
 
     @property
-    def data_uv(self) -> ctypes.Array[ctypes.c_uint8]:
-        arr = ctypes.cast(self._info.bi_yuv.data_uv_ptr, ctypes.POINTER(
-            ctypes.c_uint8 * (self._info.bi_yuv.stride_uv *
-                              self._info.bi_yuv.chroma_height))).contents
-        return arr
+    def chroma_height(self) -> int:
+        return self._chroma_height
+
+    @property
+    def stride_y(self) -> int:
+        return self._stride_y
+
+    @property
+    def stride_uv(self) -> int:
+        return self._stride_uv
+
+    @property
+    def data_y(self) -> memoryview:
+        return memoryview(self._data)[0:self._stride_y * self._height]
+
+    @property
+    def data_uv(self) -> memoryview:
+        return memoryview(self._data)[self._stride_y * self._height:
+                                      self._stride_y * self._height +
+                                      self._stride_uv * self._chroma_height]
 
 
 class I420Buffer(PlanarYuv8Buffer):
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        super().__init__(owned_info)
+    def __init__(self,
+                 data: bytearray,
+                 width: int,
+                 height: int,
+                 stride_y: int,
+                 stride_u: int,
+                 stride_v: int) -> None:
+
+        if len(data) < I420Buffer.calc_data_size(height, stride_y, stride_u, stride_v):
+            raise ValueError(
+                'buffer too small for I420 data. Expected {} bytes, got {}.'.format(
+                    I420Buffer.calc_data_size(height, stride_y, stride_u, stride_v), len(data)))
+
+        chroma_width = (width + 1) // 2
+        chroma_height = (height + 1) // 2
+        super().__init__(data, width, height,
+                         VideoFrameBufferType.I420, stride_y, stride_u, stride_v, chroma_width, chroma_height)
+
+    @staticmethod
+    def calc_data_size(height: int, stride_y: int, stride_u: int, stride_v: int) -> int:
+        return stride_y * height + (stride_u + stride_v) * ((height + 1) // 2)
+
+    @staticmethod
+    def create(width: int, height: int) -> 'I420Buffer':
+        stride_y = width
+        stride_u = (width + 1) // 2
+        stride_v = (width + 1) // 2
+        data_size = I420Buffer.calc_data_size(
+            height, stride_y, stride_u, stride_v)
+        data = bytearray(data_size)
+        return I420Buffer(data, width, height, stride_y, stride_u, stride_v)
 
 
 class I420ABuffer(PlanarYuv8Buffer):
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        super().__init__(owned_info)
+    def __init__(self,
+                 data: bytearray,
+                 width: int,
+                 height: int,
+                 stride_y: int,
+                 stride_u: int,
+                 stride_v: int,
+                 stride_a: int) -> None:
+        chroma_width = (width + 1) // 2
+        chroma_height = (height + 1) // 2
+        super().__init__(data, width, height, VideoFrameBufferType.I420A,
+                         stride_y, stride_u, stride_v, chroma_width, chroma_height)
+        self._stride_a = stride_a
+
+    @staticmethod
+    def calc_data_size(height: int, stride_y: int, stride_u: int, stride_v: int, stride_a: int) -> int:
+        return (stride_y + stride_a) * height + (stride_u + stride_v) * ((height + 1) // 2)
+
+    @property
+    def stride_a(self) -> int:
+        return self._stride_a
 
     @property
-    def data_a(self) -> ctypes.Array[ctypes.c_uint8]:
-        arr = ctypes.cast(self._info.yuv.data_a_ptr, ctypes.POINTER(
-            ctypes.c_uint8 * (self._info.yuv.stride_a * self._info.height))).contents
-        return arr
+    def data_a(self) -> memoryview:
+        return memoryview(self._data)[self._stride_y * self._height +
+                                      self._stride_u * self._chroma_height +
+                                      self._stride_v * self._chroma_height:
+                                      self._stride_y * self._height +
+                                      self._stride_u * self._chroma_height +
+                                      self._stride_v * self._chroma_height +
+                                      self._stride_a * self._height]
 
 
 class I422Buffer(PlanarYuv8Buffer):
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        super().__init__(owned_info)
+    def __init__(self,
+                 data: bytearray,
+                 width: int,
+                 height: int,
+                 stride_y: int,
+                 stride_u: int,
+                 stride_v: int) -> None:
+        chroma_width = (width + 1) // 2
+        chroma_height = height
+        super().__init__(data, width, height, VideoFrameBufferType.I422,
+                         stride_y, stride_u, stride_v, chroma_width, chroma_height)
+
+    @staticmethod
+    def calc_data_size(height: int, stride_y: int, stride_u: int, stride_v: int) -> int:
+        return stride_y * height + stride_u * height + stride_v * height
+
 
 
 class I444Buffer(PlanarYuv8Buffer):
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        super().__init__(owned_info)
+    def __init__(self,
+                 data: bytearray,
+                 width: int,
+                 height: int,
+                 stride_y: int,
+                 stride_u: int,
+                 stride_v: int) -> None:
+        chroma_width = width
+        chroma_height = height
+        super().__init__(data, width, height, VideoFrameBufferType.I444,
+                         stride_y, stride_u, stride_v, chroma_width, chroma_height)
+
+    @staticmethod
+    def calc_data_size(height: int, stride_y: int, stride_u: int, stride_v: int) -> int:
+        return stride_y * height + stride_u * height + stride_v * height
 
 
 class I010Buffer(PlanarYuv16Buffer):
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        super().__init__(owned_info)
+    def __init__(self, data: bytearray,
+                 width: int,
+                 height: int,
+                 stride_y: int,
+                 stride_u: int,
+                 stride_v: int) -> None:
+        chroma_width = (width + 1) // 2
+        chroma_height = (height + 1) // 2
+        super().__init__(data, width, height, VideoFrameBufferType.I010,
+                         stride_y, stride_u, stride_v, chroma_width, chroma_height)
+
+    @staticmethod
+    def calc_data_size(height: int, stride_y: int, stride_u: int, stride_v: int) -> int:
+        return stride_y * height * 2 + stride_u * ((height + 1) // 2) * 2 + stride_v * ((height + 1) // 2) * 2
+
 
 
 class NV12Buffer(BiplanaraYuv8Buffer):
-    def __init__(self, owned_info: proto_video_frame.OwnedVideoFrameBuffer) -> None:
-        super().__init__(owned_info)
+    def __init__(self, data: bytearray,
+                 width: int,
+                 height: int,
+                 stride_y: int,
+                 stride_uv: int) -> None:
+        chroma_width = (width + 1) // 2
+        chroma_height = (height + 1) // 2
+        super().__init__(data, width, height, VideoFrameBufferType.NV12,
+                         stride_y, stride_uv, chroma_width, chroma_height)
+
+    @staticmethod
+    def calc_data_size(height: int, stride_y: int, stride_uv: int) -> int:
+        return stride_y * height + stride_uv * ((height + 1) // 2)
 
 
-class ArgbFrame:
-    """
-    Mainly used to simplify the usage of to_argb method
-    So the users don't need to deal with ctypes
-    """
 
+class ArgbFrame:
     def __init__(self,
+                 data: Union[bytes, bytearray, memoryview],
                  format: VideoFormatType.ValueType,
                  width: int,
-                 height: int) -> None:
+                 height: int,
+                 stride: int = 0) -> None:
+
+        if stride == 0:
+            stride = width * ctypes.sizeof(ctypes.c_uint32)
+
+        if len(data) < stride * height:
+            raise ValueError("data size does not match stride and height")
+
+        self._data = bytearray(data)
         self._format = format
-        self.width = width
-        self.height = height
-        self.data = (ctypes.c_uint8 * (width * height *
-                     ctypes.sizeof(ctypes.c_uint32)))()  # alloc frame
+        self.width = width  # stays public: VideoFrameBuffer.to_argb reads dst.width
+        self.height = height  # stays public: VideoFrameBuffer.to_argb reads dst.height
+        self._stride = stride
 
     def to_i420(self) -> I420Buffer:
         # TODO(theomonnom): avoid unnecessary buffer allocation
@@ -254,6 +425,6 @@ def to_i420(self) -> I420Buffer:
         res = ffi_client.request(req)
         return I420Buffer(res.to_i420.buffer)
 
-    @property
+    @property
     def format(self) -> VideoFormatType.ValueType:
         return self._format