From 876bdd5a565ebd6dd1c896541c6ed4b8d464d58c Mon Sep 17 00:00:00 2001 From: MediaPipe Team Date: Mon, 7 Oct 2024 10:17:57 -0700 Subject: [PATCH] Add a flag to set the default number of XNNPACK threads XNNPACK defaults to 1 thread for gLinux runners. This is very slow for desktop runner apps, where usually a number of #cores/2 is a good value, but it's a) risky to change this default and b) there are situations where this is not wanted. PiperOrigin-RevId: 683230250 --- mediapipe/calculators/tensor/BUILD | 6 + .../tensor/inference_calculator_utils.cc | 12 +- .../tensor/inference_calculator_utils.h | 8 +- .../tensor/inference_calculator_utils_test.cc | 131 +++++++++++++----- 4 files changed, 116 insertions(+), 41 deletions(-) diff --git a/mediapipe/calculators/tensor/BUILD b/mediapipe/calculators/tensor/BUILD index 6dfb8b3785..4fa4faf399 100644 --- a/mediapipe/calculators/tensor/BUILD +++ b/mediapipe/calculators/tensor/BUILD @@ -453,6 +453,7 @@ cc_library_with_tflite( "//mediapipe/framework/port:status", "//mediapipe/framework/stream_handler:fixed_size_input_stream_handler", "//mediapipe/framework/tool:subgraph_expansion", + "//mediapipe/util:cpu_util", "@com_google_absl//absl/memory", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", @@ -816,6 +817,8 @@ cc_library( "//mediapipe/framework/formats:tensor", "//mediapipe/framework/port:ret_check", "//mediapipe/framework/port:status", + "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/log:absl_check", "@com_google_absl//absl/log:absl_log", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", @@ -841,9 +844,12 @@ cc_test( name = "inference_calculator_utils_test", srcs = ["inference_calculator_utils_test.cc"], deps = [ + ":inference_calculator_cc_proto", ":inference_calculator_utils", "//mediapipe/framework/formats:tensor", "//mediapipe/framework/port:gtest_main", + "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/log", "@com_google_absl//absl/log:absl_check", "@com_google_absl//absl/status", "@com_google_absl//absl/types:span", diff --git a/mediapipe/calculators/tensor/inference_calculator_utils.cc b/mediapipe/calculators/tensor/inference_calculator_utils.cc index 12aa179ae7..829bc676e5 100644 --- a/mediapipe/calculators/tensor/inference_calculator_utils.cc +++ b/mediapipe/calculators/tensor/inference_calculator_utils.cc @@ -20,6 +20,7 @@ #include #include +#include "absl/flags/flag.h" #include "absl/log/absl_log.h" #include "absl/status/status.h" #include "absl/status/statusor.h" @@ -38,15 +39,22 @@ #include "tensorflow/lite/portable_type_to_tflitetype.h" #include "tensorflow/lite/string_util.h" +ABSL_FLAG(int, xnnpack_default_num_threads, 0, + "Default number of xnnpack threads to use. 
If unset, determines a " + "good default number based on the platform."); + #if !defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__) #include "mediapipe/util/cpu_util.h" #endif // !__EMSCRIPTEN__ || __EMSCRIPTEN_PTHREADS__ namespace mediapipe { - namespace { int GetXnnpackDefaultNumThreads() { + int default_from_flag = absl::GetFlag(FLAGS_xnnpack_default_num_threads); + if (default_from_flag > 0) { + return default_from_flag; + } #if defined(MEDIAPIPE_ANDROID) || defined(MEDIAPIPE_IOS) || \ defined(__EMSCRIPTEN_PTHREADS__) constexpr int kMinNumThreadsByDefault = 1; @@ -216,7 +224,7 @@ absl::Status CopyTfLiteTensorToTensor(const TfLiteTensor& tflite_tensor, } // namespace int GetXnnpackNumThreads( - const bool opts_has_delegate, + bool opts_has_delegate, const mediapipe::InferenceCalculatorOptions::Delegate& opts_delegate) { static constexpr int kDefaultNumThreads = -1; if (opts_has_delegate && opts_delegate.has_xnnpack() && diff --git a/mediapipe/calculators/tensor/inference_calculator_utils.h b/mediapipe/calculators/tensor/inference_calculator_utils.h index 375e12efe7..917700d8e3 100644 --- a/mediapipe/calculators/tensor/inference_calculator_utils.h +++ b/mediapipe/calculators/tensor/inference_calculator_utils.h @@ -18,6 +18,7 @@ #include #include +#include "absl/flags/declare.h" #include "absl/status/status.h" #include "absl/status/statusor.h" #include "mediapipe/calculators/tensor/inference_calculator.pb.h" @@ -27,13 +28,16 @@ #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/util.h" +ABSL_DECLARE_FLAG(int, xnnpack_default_num_threads); + namespace mediapipe { // Returns number of threads to configure XNNPACK delegate with. // Returns user provided value if specified. Otherwise, tries to choose optimal -// number of threads depending on the device. +// number of threads depending on the device. The default can be overridden by +// setting the --xnnpack_default_num_threads flag. 
int GetXnnpackNumThreads( - const bool opts_has_delegate, + bool opts_has_delegate, const mediapipe::InferenceCalculatorOptions::Delegate& opts_delegate); absl::Status CopyCpuInputIntoTfLiteTensor(const Tensor& input_tensor, diff --git a/mediapipe/calculators/tensor/inference_calculator_utils_test.cc b/mediapipe/calculators/tensor/inference_calculator_utils_test.cc index 3ad8313ab3..10145929ab 100644 --- a/mediapipe/calculators/tensor/inference_calculator_utils_test.cc +++ b/mediapipe/calculators/tensor/inference_calculator_utils_test.cc @@ -23,9 +23,12 @@ #include #include +#include "absl/flags/flag.h" #include "absl/log/absl_check.h" +#include "absl/log/log.h" #include "absl/status/status.h" #include "absl/types/span.h" +#include "mediapipe/calculators/tensor/inference_calculator.pb.h" #include "mediapipe/framework/formats/tensor.h" #include "mediapipe/framework/port/gmock.h" #include "mediapipe/framework/port/gtest.h" @@ -40,6 +43,8 @@ namespace mediapipe { namespace { +constexpr int kDefaultNumXnnpackThreads = 1; + using ElementType = ::mediapipe::Tensor::ElementType; using ::testing::ElementsAreArray; using ::testing::HasSubstr; @@ -103,8 +108,60 @@ std::vector TfLiteInputTensorData(const Interpreter& interpreter, return std::vector(str.begin(), str.end()); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt32) { +class InferenceCalculatorUtilsTest : public ::testing::Test { + protected: + void TearDown() override { + absl::SetFlag(&FLAGS_xnnpack_default_num_threads, 0); + } +}; + +TEST_F(InferenceCalculatorUtilsTest, GetXnnpackNumThreadsReturnsDefault) { + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/false, + /*opts_delegate=*/{}), + kDefaultNumXnnpackThreads); +} + +TEST_F(InferenceCalculatorUtilsTest, GetXnnpackNumThreadsReturnsSetDefault) { + absl::SetFlag(&FLAGS_xnnpack_default_num_threads, 42); + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/false, + /*opts_delegate=*/{}), + 42); +} + +TEST_F(InferenceCalculatorUtilsTest, + GetXnnpackNumThreadsReturnsDefaultIfHasDelegateIsTrueButUnset) { + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/true, + /*opts_delegate=*/{}), + kDefaultNumXnnpackThreads); +} + +TEST_F(InferenceCalculatorUtilsTest, + GetXnnpackNumThreadsReturnsDefaultIfThreadsNotSpecified) { + mediapipe::InferenceCalculatorOptions::Delegate opts_delegate; + opts_delegate.mutable_xnnpack(); + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/true, opts_delegate), + kDefaultNumXnnpackThreads); +} + +TEST_F(InferenceCalculatorUtilsTest, + GetXnnpackNumThreadsReturnsSetNumberOfThreads) { + absl::SetFlag(&FLAGS_xnnpack_default_num_threads, 42); + mediapipe::InferenceCalculatorOptions::Delegate opts_delegate; + opts_delegate.mutable_xnnpack()->set_num_threads(43); + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/true, opts_delegate), + 43); +} + +TEST_F(InferenceCalculatorUtilsTest, + GetXnnpackNumThreadsReturnsDefaultIfHasDelegateIsFalse) { + mediapipe::InferenceCalculatorOptions::Delegate opts_delegate; + opts_delegate.mutable_xnnpack()->set_num_threads(44); + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/false, opts_delegate), + kDefaultNumXnnpackThreads); +} + +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt32) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteInt32, tensor_len, tensor_index, @@ -120,8 +177,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } 
-TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt64) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt64) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteInt64, tensor_len, tensor_index, @@ -137,8 +194,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForUInt8) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForUInt8) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteUInt8, tensor_len, tensor_index, @@ -154,8 +211,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt8) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt8) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteInt8, tensor_len, tensor_index, @@ -171,8 +228,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForFloat32) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForFloat32) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteFloat32, tensor_len, tensor_index, @@ -188,8 +245,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForString) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForString) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteString, tensor_len, tensor_index, @@ -205,8 +262,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorTypeMismatch) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorTypeMismatch) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteInt32, tensor_len, tensor_index, @@ -223,8 +280,8 @@ TEST(InferenceCalculatorUtilsTest, HasSubstr("Input and interpreter tensor type do not match")); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorSizeMismatch) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorSizeMismatch) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 5; AddInterpreterInput(kTfLiteFloat32, tensor_len, tensor_index, @@ -241,8 +298,8 @@ TEST(InferenceCalculatorUtilsTest, HasSubstr("TfLiteTensor and Tensor sizes do not match")); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorNullBuffer) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorNullBuffer) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; // Make TFLite interpreter's buffer null. 
@@ -259,8 +316,8 @@ TEST(InferenceCalculatorUtilsTest, EXPECT_THAT(status.message(), HasSubstr("TfLiteTensor data is null")); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorUnsupportedType) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorUnsupportedType) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; @@ -283,8 +340,8 @@ TEST(InferenceCalculatorUtilsTest, EXPECT_THAT(status.message(), HasSubstr("Unsupported input data type:")); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputWorksCorrectlyForFloat32) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputWorksCorrectlyForFloat32) { std::vector values{100.f, 200.f, 300.f, 400.f, 500.f, 600.f}; tflite::CastOpModel m({TensorType_INT32, {2, 3}}, @@ -300,8 +357,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputWorksCorrectlyForString) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputWorksCorrectlyForString) { std::vector values{'a', 'b', 'c', 'd'}; int values_len = values.size(); Tensor tensor(ElementType::kChar, Tensor::Shape({values_len})); @@ -323,8 +380,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputTypeMismatch) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputTypeMismatch) { std::vector values{100.f, 200.f, 300.f, 400.f, 500.f, 600.f}; tflite::CastOpModel m({TensorType_INT32, {2, 3}}, @@ -341,8 +398,8 @@ TEST(InferenceCalculatorUtilsTest, HasSubstr("Output and TfLiteTensor types do not match")); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputSizeMismatch) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputSizeMismatch) { std::vector values{100.f, 200.f, 300.f, 400.f, 500.f, 600.f}; tflite::CastOpModel m({TensorType_INT32, {2, 3}}, @@ -359,8 +416,8 @@ TEST(InferenceCalculatorUtilsTest, HasSubstr("TfLiteTensor and Tensor shape do not match")); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputNullBuffer) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputNullBuffer) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; // Make TFLite interpreter's buffer null. 
@@ -378,8 +435,8 @@ TEST(InferenceCalculatorUtilsTest, HasSubstr("TfLiteTensor tensor buffer is null")); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputUnsupportedType) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputUnsupportedType) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; @@ -402,7 +459,7 @@ TEST(InferenceCalculatorUtilsTest, EXPECT_THAT(status.message(), HasSubstr("Unsupported output data type:")); } -TEST(InferenceCalculatorUtilsTest, ConvertTfLiteTensorToFloat32) { +TEST_F(InferenceCalculatorUtilsTest, ConvertTfLiteTensorToFloat32) { const std::vector expected_values{100.f, 200.f, 300.f, 400.f, 500.f, 600.f}; @@ -420,7 +477,7 @@ TEST(InferenceCalculatorUtilsTest, ConvertTfLiteTensorToFloat32) { ElementsAreArray(expected_values)); } -TEST(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuWriteView) { +TEST_F(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuWriteView) { tflite::Interpreter interpreter; int tensor_index, tensor_size = 4; AddInterpreterInput(kTfLiteInt32, tensor_size, tensor_index, @@ -449,7 +506,7 @@ TEST(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuWriteView) { } } -TEST(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuReadView) { +TEST_F(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuReadView) { tflite::Interpreter interpreter; int tensor_index, tensor_size = 4; AddInterpreterInput(kTfLiteInt32, tensor_size, tensor_index, @@ -470,7 +527,7 @@ TEST(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuReadView) { ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, ShouldConfirmTfLiteMemoryAlignment) { +TEST_F(InferenceCalculatorUtilsTest, ShouldConfirmTfLiteMemoryAlignment) { std::vector values{1, 2, 3, 4}; int values_len = values.size(); Tensor tensor(ElementType::kInt32, Tensor::Shape({values_len}), @@ -481,7 +538,7 @@ TEST(InferenceCalculatorUtilsTest, ShouldConfirmTfLiteMemoryAlignment) { EXPECT_TRUE(IsAlignedWithTFLiteDefaultAlignment(read_view.buffer())); } -TEST(InferenceCalculatorUtilsTest, ShouldNotConfirmTfLiteMemoryAlignment) { +TEST_F(InferenceCalculatorUtilsTest, ShouldNotConfirmTfLiteMemoryAlignment) { std::vector values{1, 2, 3, 4}; int values_len = values.size(); Tensor tensor(ElementType::kInt32, Tensor::Shape({values_len}),
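
Usage sketch (illustration only, not part of the applied patch): with this change, a desktop runner can override the XNNPACK default thread count either on the command line (--xnnpack_default_num_threads=4) or programmatically via absl::SetFlag before the graph is built. The main() wrapper, the value 4, and the graph-running placeholder below are assumptions; a num_threads set explicitly in InferenceCalculatorOptions.delegate.xnnpack still takes precedence over this default, as the new tests show.

    // Hypothetical runner snippet; the flag itself is declared in
    // inference_calculator_utils.h via ABSL_DECLARE_FLAG in this patch.
    #include "absl/flags/flag.h"
    #include "absl/flags/parse.h"
    #include "mediapipe/calculators/tensor/inference_calculator_utils.h"

    int main(int argc, char** argv) {
      // Applies command-line flags such as --xnnpack_default_num_threads=4.
      absl::ParseCommandLine(argc, argv);
      // Or set the default programmatically before any InferenceCalculator
      // calls GetXnnpackNumThreads(). The value 4 is a placeholder.
      absl::SetFlag(&FLAGS_xnnpack_default_num_threads, 4);
      // ... build and run the MediaPipe graph as usual ...
      return 0;
    }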