From a2b1a88a22deba036b3a8caabe0a68e1dbfb4b4b Mon Sep 17 00:00:00 2001
From: yuygfgg <140488233+yuygfgg@users.noreply.github.com>
Date: Fri, 29 Nov 2024 09:49:45 +0800
Subject: [PATCH] Add support for MLProgram in ort_coreml (#116)

This enables fp16 computation on the ANE instead of assigning everything
to the CPU. Note, however, that MLProgram is not yet well supported and
covers far fewer ops than the regular NeuralNetwork format.
---
 .github/workflows/macos-ort.yml |  2 +-
 scripts/vsmlrt.py               |  2 ++
 vsort/README.md                 |  7 ++++++-
 vsort/vs_onnxruntime.cpp        | 26 ++++++++++++++++++++++++--
 4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/macos-ort.yml b/.github/workflows/macos-ort.yml
index 0714abf..3889eb3 100644
--- a/.github/workflows/macos-ort.yml
+++ b/.github/workflows/macos-ort.yml
@@ -103,7 +103,7 @@ jobs:
 
     - name: Setup ONNX Runtime
       run: |
-        curl -L -o ort.tgz https://github.com/microsoft/onnxruntime/releases/download/v1.19.2/onnxruntime-osx-arm64-1.19.2.tgz
+        curl -L -o ort.tgz https://github.com/microsoft/onnxruntime/releases/download/v1.20.0/onnxruntime-osx-arm64-1.20.0.tgz
         tar -xf ort.tgz
         mv onnxruntime-* onnxruntime
 
diff --git a/scripts/vsmlrt.py b/scripts/vsmlrt.py
index 232da5f..7598914 100644
--- a/scripts/vsmlrt.py
+++ b/scripts/vsmlrt.py
@@ -273,6 +273,7 @@ class ORT_COREML:
     verbosity: int = 0
     fp16: bool = False
     fp16_blacklist_ops: typing.Optional[typing.Sequence[str]] = None
+    ml_program: int = 0
 
     # internal backend attributes
     supports_onnx_serialization: bool = True
@@ -2483,6 +2484,7 @@ def _inference(
             fp16=backend.fp16,
             path_is_serialization=path_is_serialization,
             fp16_blacklist_ops=backend.fp16_blacklist_ops,
+            ml_program=backend.ml_program,
             **kwargs
         )
     elif isinstance(backend, Backend.ORT_CUDA):
diff --git a/vsort/README.md b/vsort/README.md
index cb5dbad..33ca0f6 100644
--- a/vsort/README.md
+++ b/vsort/README.md
@@ -27,7 +27,9 @@ Arguments:
   - `string provider`: Specifies the device to run the inference on.
     - `"CPU"` or `""`: pure CPU backend
     - `"CUDA"`: CUDA GPU backend, requires Nvidia Maxwell+ GPUs.
-  - `int device_id`: select the GPU device for the CUDA backend.
+    - `"DML"`: DirectML backend
+    - `"COREML"`: CoreML backend
+  - `int device_id`: select the GPU device for the CUDA backend.
   - `int verbosity`: specify the verbosity of logging, the default is warning.
     - 0: fatal error only, `ORT_LOGGING_LEVEL_FATAL`
     - 1: also errors, `ORT_LOGGING_LEVEL_ERROR`
@@ -40,6 +42,9 @@ Arguments:
   - `bint fp16`: whether to quantize model to fp16 for faster and memory efficient computation.
   - `bint path_is_serialization`: whether the `network_path` argument specifies an onnx serialization of type `bytes`.
   - `bint use_cuda_graph`: whether to use CUDA Graphs to improve performance and reduce CPU overhead in CUDA backend. Not all models are supported.
+  - `int ml_program`: select the model format used by the CoreML provider.
+    - 0: NeuralNetwork
+    - 1: MLProgram
 
 When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case.
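Usage note: the `ml_program` field added to the `ORT_COREML` backend above is plumbed straight through to the vsort plugin. A minimal sketch of how a script might opt in, assuming vsmlrt's generic `inference()` helper and a hypothetical `model.onnx` path:

```python
import vapoursynth as vs
from vsmlrt import Backend, inference

core = vs.core

# Hypothetical input clip; any clip in a format the model accepts works.
clip = core.std.BlankClip(format=vs.RGBS, width=1280, height=720)

# ml_program=1 requests the MLProgram model format, which permits fp16
# execution on the ANE; the default ml_program=0 keeps NeuralNetwork.
backend = Backend.ORT_COREML(fp16=True, ml_program=1)

output = inference(clip, network_path="model.onnx", backend=backend)
```

Pairing `ml_program=1` with `fp16=True` is what lets the network run in fp16 on the ANE, per the commit message.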
diff --git a/vsort/vs_onnxruntime.cpp b/vsort/vs_onnxruntime.cpp
index 9acedc5..d3cd173 100644
--- a/vsort/vs_onnxruntime.cpp
+++ b/vsort/vs_onnxruntime.cpp
@@ -454,6 +454,10 @@ struct vsOrtData {
     std::vector<VSNodeRef *> nodes;
     std::unique_ptr<VSVideoInfo> out_vi;
 
+#ifdef ENABLE_COREML
+    bool ml_program;
+#endif // ENABLE_COREML
+
     int overlap_w, overlap_h;
 
     OrtEnv * environment;
@@ -908,6 +912,19 @@ static void VS_CC vsOrtCreate(
     if (error) {
         verbosity = ORT_LOGGING_LEVEL_WARNING;
     }
+#ifdef ENABLE_COREML
+    auto ml_program = vsapi->propGetInt(in, "ml_program", 0, &error);
+
+    if (error) {
+        d->ml_program = false;
+    } else if (ml_program == 0) {
+        d->ml_program = false;
+    } else if (ml_program == 1) {
+        d->ml_program = true;
+    } else {
+        return set_error("\"ml_program\" must be 0 or 1");
+    }
+#endif // ENABLE_COREML
 
     // match verbosity of vs-trt
     verbosity = static_cast<OrtLoggingLevel>(4 - static_cast<int>(verbosity));
@@ -1232,10 +1249,12 @@ static void VS_CC vsOrtCreate(
     }
 #endif // ENABLE_CUDA
 #ifdef ENABLE_COREML
+    uint32_t coreml_flag = 0;
+    if (d->ml_program) coreml_flag |= 0x010; // COREML_FLAG_CREATE_MLPROGRAM
     if (d->backend == Backend::COREML) {
         checkError(OrtSessionOptionsAppendExecutionProvider_CoreML(
             session_options,
-            0
+            coreml_flag
         ));
     }
 #endif // ENABLE_COREML
@@ -1394,7 +1413,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
         "network_path:data;"
         "overlap:int[]:opt;"
         "tilesize:int[]:opt;"
-        "provider:data:opt;" // "": Default (CPU), "CUDA": CUDA
+        "provider:data:opt;" // "": Default (CPU), "CUDA": CUDA, "COREML": CoreML, "DML": DirectML
         "device_id:int:opt;"
         "num_streams:int:opt;"
         "verbosity:int:opt;"
@@ -1409,6 +1428,9 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
         "output_format:int:opt;"
         "tf32:int:opt;"
         "flexible_output_prop:data:opt;"
+#ifdef ENABLE_COREML
+        "ml_program:int:opt;"
+#endif // ENABLE_COREML
         , vsOrtCreate, nullptr, plugin
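For completeness, the same switch is reachable without vsmlrt through the `ml_program:int:opt` argument registered above. A sketch of a direct plugin call, assuming a CoreML-enabled build of vsort and the same hypothetical model path:

```python
import vapoursynth as vs

core = vs.core

# Hypothetical input clip, as in the previous sketch.
clip = core.std.BlankClip(format=vs.RGBS, width=1280, height=720)

# ml_program=1 sets COREML_FLAG_CREATE_MLPROGRAM (0x010) on the CoreML EP;
# values other than 0 or 1 make the filter error out.
output = core.ort.Model(
    clip,
    network_path="model.onnx",
    provider="COREML",
    fp16=True,
    ml_program=1,
)
```

Note that `ml_program` is only registered when the plugin is compiled with `ENABLE_COREML`, so builds without CoreML support will reject the argument.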