From 876bdd5a565ebd6dd1c896541c6ed4b8d464d58c Mon Sep 17 00:00:00 2001 From: MediaPipe Team Date: Mon, 7 Oct 2024 10:17:57 -0700 Subject: [PATCH] Add a flag to set the default number of XNNPACK threads XNNPACK defaults to 1 thread for gLinux runners. This is very slow for desktop runner apps, where usually a number of #cores/2 is a good value, but it's a) risky to change this default and b) there are situations where this is not wanted. PiperOrigin-RevId: 683230250 --- mediapipe/calculators/tensor/BUILD | 6 + .../tensor/inference_calculator_utils.cc | 12 +- .../tensor/inference_calculator_utils.h | 8 +- .../tensor/inference_calculator_utils_test.cc | 131 +++++++++++++----- 4 files changed, 116 insertions(+), 41 deletions(-) diff --git a/mediapipe/calculators/tensor/BUILD b/mediapipe/calculators/tensor/BUILD index 6dfb8b3785..4fa4faf399 100644 --- a/mediapipe/calculators/tensor/BUILD +++ b/mediapipe/calculators/tensor/BUILD @@ -453,6 +453,7 @@ cc_library_with_tflite( "//mediapipe/framework/port:status", "//mediapipe/framework/stream_handler:fixed_size_input_stream_handler", "//mediapipe/framework/tool:subgraph_expansion", + "//mediapipe/util:cpu_util", "@com_google_absl//absl/memory", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", @@ -816,6 +817,8 @@ cc_library( "//mediapipe/framework/formats:tensor", "//mediapipe/framework/port:ret_check", "//mediapipe/framework/port:status", + "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/log:absl_check", "@com_google_absl//absl/log:absl_log", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", @@ -841,9 +844,12 @@ cc_test( name = "inference_calculator_utils_test", srcs = ["inference_calculator_utils_test.cc"], deps = [ + ":inference_calculator_cc_proto", ":inference_calculator_utils", "//mediapipe/framework/formats:tensor", "//mediapipe/framework/port:gtest_main", + "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/log", "@com_google_absl//absl/log:absl_check", "@com_google_absl//absl/status", "@com_google_absl//absl/types:span", diff --git a/mediapipe/calculators/tensor/inference_calculator_utils.cc b/mediapipe/calculators/tensor/inference_calculator_utils.cc index 12aa179ae7..829bc676e5 100644 --- a/mediapipe/calculators/tensor/inference_calculator_utils.cc +++ b/mediapipe/calculators/tensor/inference_calculator_utils.cc @@ -20,6 +20,7 @@ #include #include +#include "absl/flags/flag.h" #include "absl/log/absl_log.h" #include "absl/status/status.h" #include "absl/status/statusor.h" @@ -38,15 +39,22 @@ #include "tensorflow/lite/portable_type_to_tflitetype.h" #include "tensorflow/lite/string_util.h" +ABSL_FLAG(int, xnnpack_default_num_threads, 0, + "Default number of xnnpack threads to use. 
If unset, determines a " + "good default number based on the platform."); + #if !defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__) #include "mediapipe/util/cpu_util.h" #endif // !__EMSCRIPTEN__ || __EMSCRIPTEN_PTHREADS__ namespace mediapipe { - namespace { int GetXnnpackDefaultNumThreads() { + int default_from_flag = absl::GetFlag(FLAGS_xnnpack_default_num_threads); + if (default_from_flag > 0) { + return default_from_flag; + } #if defined(MEDIAPIPE_ANDROID) || defined(MEDIAPIPE_IOS) || \ defined(__EMSCRIPTEN_PTHREADS__) constexpr int kMinNumThreadsByDefault = 1; @@ -216,7 +224,7 @@ absl::Status CopyTfLiteTensorToTensor(const TfLiteTensor& tflite_tensor, } // namespace int GetXnnpackNumThreads( - const bool opts_has_delegate, + bool opts_has_delegate, const mediapipe::InferenceCalculatorOptions::Delegate& opts_delegate) { static constexpr int kDefaultNumThreads = -1; if (opts_has_delegate && opts_delegate.has_xnnpack() && diff --git a/mediapipe/calculators/tensor/inference_calculator_utils.h b/mediapipe/calculators/tensor/inference_calculator_utils.h index 375e12efe7..917700d8e3 100644 --- a/mediapipe/calculators/tensor/inference_calculator_utils.h +++ b/mediapipe/calculators/tensor/inference_calculator_utils.h @@ -18,6 +18,7 @@ #include #include +#include "absl/flags/declare.h" #include "absl/status/status.h" #include "absl/status/statusor.h" #include "mediapipe/calculators/tensor/inference_calculator.pb.h" @@ -27,13 +28,16 @@ #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/util.h" +ABSL_DECLARE_FLAG(int, xnnpack_default_num_threads); + namespace mediapipe { // Returns number of threads to configure XNNPACK delegate with. // Returns user provided value if specified. Otherwise, tries to choose optimal -// number of threads depending on the device. +// number of threads depending on the device. The default can be overridden by +// setting the --xnnpack_default_num_threads flag. 
int GetXnnpackNumThreads( - const bool opts_has_delegate, + bool opts_has_delegate, const mediapipe::InferenceCalculatorOptions::Delegate& opts_delegate); absl::Status CopyCpuInputIntoTfLiteTensor(const Tensor& input_tensor, diff --git a/mediapipe/calculators/tensor/inference_calculator_utils_test.cc b/mediapipe/calculators/tensor/inference_calculator_utils_test.cc index 3ad8313ab3..10145929ab 100644 --- a/mediapipe/calculators/tensor/inference_calculator_utils_test.cc +++ b/mediapipe/calculators/tensor/inference_calculator_utils_test.cc @@ -23,9 +23,12 @@ #include #include +#include "absl/flags/flag.h" #include "absl/log/absl_check.h" +#include "absl/log/log.h" #include "absl/status/status.h" #include "absl/types/span.h" +#include "mediapipe/calculators/tensor/inference_calculator.pb.h" #include "mediapipe/framework/formats/tensor.h" #include "mediapipe/framework/port/gmock.h" #include "mediapipe/framework/port/gtest.h" @@ -40,6 +43,8 @@ namespace mediapipe { namespace { +constexpr int kDefaultNumXnnpackThreads = 1; + using ElementType = ::mediapipe::Tensor::ElementType; using ::testing::ElementsAreArray; using ::testing::HasSubstr; @@ -103,8 +108,60 @@ std::vector TfLiteInputTensorData(const Interpreter& interpreter, return std::vector(str.begin(), str.end()); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt32) { +class InferenceCalculatorUtilsTest : public ::testing::Test { + protected: + void TearDown() override { + absl::SetFlag(&FLAGS_xnnpack_default_num_threads, 0); + } +}; + +TEST_F(InferenceCalculatorUtilsTest, GetXnnpackNumThreadsReturnsDefault) { + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/false, + /*opts_delegate=*/{}), + kDefaultNumXnnpackThreads); +} + +TEST_F(InferenceCalculatorUtilsTest, GetXnnpackNumThreadsReturnsSetDefault) { + absl::SetFlag(&FLAGS_xnnpack_default_num_threads, 42); + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/false, + /*opts_delegate=*/{}), + 42); +} + +TEST_F(InferenceCalculatorUtilsTest, + GetXnnpackNumThreadsReturnsDefaultIfHasDelegateIsTrueButUnset) { + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/true, + /*opts_delegate=*/{}), + kDefaultNumXnnpackThreads); +} + +TEST_F(InferenceCalculatorUtilsTest, + GetXnnpackNumThreadsReturnsDefaultIfThreadsNotSpecified) { + mediapipe::InferenceCalculatorOptions::Delegate opts_delegate; + opts_delegate.mutable_xnnpack(); + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/true, opts_delegate), + kDefaultNumXnnpackThreads); +} + +TEST_F(InferenceCalculatorUtilsTest, + GetXnnpackNumThreadsReturnsSetNumberOfThreads) { + absl::SetFlag(&FLAGS_xnnpack_default_num_threads, 42); + mediapipe::InferenceCalculatorOptions::Delegate opts_delegate; + opts_delegate.mutable_xnnpack()->set_num_threads(43); + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/true, opts_delegate), + 43); +} + +TEST_F(InferenceCalculatorUtilsTest, + GetXnnpackNumThreadsReturnsDefaultIfHasDelegateIsFalse) { + mediapipe::InferenceCalculatorOptions::Delegate opts_delegate; + opts_delegate.mutable_xnnpack()->set_num_threads(44); + EXPECT_EQ(GetXnnpackNumThreads(/*opts_has_delegate=*/false, opts_delegate), + kDefaultNumXnnpackThreads); +} + +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt32) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteInt32, tensor_len, tensor_index, @@ -120,8 +177,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } 
-TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt64) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt64) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteInt64, tensor_len, tensor_index, @@ -137,8 +194,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForUInt8) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForUInt8) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteUInt8, tensor_len, tensor_index, @@ -154,8 +211,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt8) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForInt8) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteInt8, tensor_len, tensor_index, @@ -171,8 +228,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForFloat32) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForFloat32) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteFloat32, tensor_len, tensor_index, @@ -188,8 +245,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorWorksCorrectlyForString) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorWorksCorrectlyForString) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteString, tensor_len, tensor_index, @@ -205,8 +262,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorTypeMismatch) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorTypeMismatch) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; AddInterpreterInput(kTfLiteInt32, tensor_len, tensor_index, @@ -223,8 +280,8 @@ TEST(InferenceCalculatorUtilsTest, HasSubstr("Input and interpreter tensor type do not match")); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorSizeMismatch) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorSizeMismatch) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 5; AddInterpreterInput(kTfLiteFloat32, tensor_len, tensor_index, @@ -241,8 +298,8 @@ TEST(InferenceCalculatorUtilsTest, HasSubstr("TfLiteTensor and Tensor sizes do not match")); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorNullBuffer) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorNullBuffer) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; // Make TFLite interpreter's buffer null. 
@@ -259,8 +316,8 @@ TEST(InferenceCalculatorUtilsTest, EXPECT_THAT(status.message(), HasSubstr("TfLiteTensor data is null")); } -TEST(InferenceCalculatorUtilsTest, - CopyCpuInputIntoInterpreterTensorUnsupportedType) { +TEST_F(InferenceCalculatorUtilsTest, + CopyCpuInputIntoInterpreterTensorUnsupportedType) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; @@ -283,8 +340,8 @@ TEST(InferenceCalculatorUtilsTest, EXPECT_THAT(status.message(), HasSubstr("Unsupported input data type:")); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputWorksCorrectlyForFloat32) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputWorksCorrectlyForFloat32) { std::vector values{100.f, 200.f, 300.f, 400.f, 500.f, 600.f}; tflite::CastOpModel m({TensorType_INT32, {2, 3}}, @@ -300,8 +357,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputWorksCorrectlyForString) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputWorksCorrectlyForString) { std::vector values{'a', 'b', 'c', 'd'}; int values_len = values.size(); Tensor tensor(ElementType::kChar, Tensor::Shape({values_len})); @@ -323,8 +380,8 @@ TEST(InferenceCalculatorUtilsTest, ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputTypeMismatch) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputTypeMismatch) { std::vector values{100.f, 200.f, 300.f, 400.f, 500.f, 600.f}; tflite::CastOpModel m({TensorType_INT32, {2, 3}}, @@ -341,8 +398,8 @@ TEST(InferenceCalculatorUtilsTest, HasSubstr("Output and TfLiteTensor types do not match")); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputSizeMismatch) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputSizeMismatch) { std::vector values{100.f, 200.f, 300.f, 400.f, 500.f, 600.f}; tflite::CastOpModel m({TensorType_INT32, {2, 3}}, @@ -359,8 +416,8 @@ TEST(InferenceCalculatorUtilsTest, HasSubstr("TfLiteTensor and Tensor shape do not match")); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputNullBuffer) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputNullBuffer) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; // Make TFLite interpreter's buffer null. 
@@ -378,8 +435,8 @@ TEST(InferenceCalculatorUtilsTest, HasSubstr("TfLiteTensor tensor buffer is null")); } -TEST(InferenceCalculatorUtilsTest, - CopyInterpreterTensorIntoCpuOutputUnsupportedType) { +TEST_F(InferenceCalculatorUtilsTest, + CopyInterpreterTensorIntoCpuOutputUnsupportedType) { tflite::Interpreter interpreter; int tensor_index, tensor_len = 4; @@ -402,7 +459,7 @@ TEST(InferenceCalculatorUtilsTest, EXPECT_THAT(status.message(), HasSubstr("Unsupported output data type:")); } -TEST(InferenceCalculatorUtilsTest, ConvertTfLiteTensorToFloat32) { +TEST_F(InferenceCalculatorUtilsTest, ConvertTfLiteTensorToFloat32) { const std::vector expected_values{100.f, 200.f, 300.f, 400.f, 500.f, 600.f}; @@ -420,7 +477,7 @@ TEST(InferenceCalculatorUtilsTest, ConvertTfLiteTensorToFloat32) { ElementsAreArray(expected_values)); } -TEST(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuWriteView) { +TEST_F(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuWriteView) { tflite::Interpreter interpreter; int tensor_index, tensor_size = 4; AddInterpreterInput(kTfLiteInt32, tensor_size, tensor_index, @@ -449,7 +506,7 @@ TEST(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuWriteView) { } } -TEST(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuReadView) { +TEST_F(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuReadView) { tflite::Interpreter interpreter; int tensor_index, tensor_size = 4; AddInterpreterInput(kTfLiteInt32, tensor_size, tensor_index, @@ -470,7 +527,7 @@ TEST(InferenceCalculatorUtilsTest, ShouldSetCustomAllocatorForCpuReadView) { ElementsAreArray(values)); } -TEST(InferenceCalculatorUtilsTest, ShouldConfirmTfLiteMemoryAlignment) { +TEST_F(InferenceCalculatorUtilsTest, ShouldConfirmTfLiteMemoryAlignment) { std::vector values{1, 2, 3, 4}; int values_len = values.size(); Tensor tensor(ElementType::kInt32, Tensor::Shape({values_len}), @@ -481,7 +538,7 @@ TEST(InferenceCalculatorUtilsTest, ShouldConfirmTfLiteMemoryAlignment) { EXPECT_TRUE(IsAlignedWithTFLiteDefaultAlignment(read_view.buffer())); } -TEST(InferenceCalculatorUtilsTest, ShouldNotConfirmTfLiteMemoryAlignment) { +TEST_F(InferenceCalculatorUtilsTest, ShouldNotConfirmTfLiteMemoryAlignment) { std::vector values{1, 2, 3, 4}; int values_len = values.size(); Tensor tensor(ElementType::kInt32, Tensor::Shape({values_len}),
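
Usage sketch (illustration only, not part of the applied patch): with this change, a desktop runner can override the XNNPACK default thread count either on the command line (--xnnpack_default_num_threads=4) or programmatically via absl::SetFlag before the graph is built. The main() wrapper, the value 4, and the graph-running placeholder below are assumptions; a num_threads set explicitly in InferenceCalculatorOptions.delegate.xnnpack still takes precedence over this default, as the new tests show.

    // Hypothetical runner snippet; the flag itself is declared in
    // inference_calculator_utils.h via ABSL_DECLARE_FLAG in this patch.
    #include "absl/flags/flag.h"
    #include "absl/flags/parse.h"
    #include "mediapipe/calculators/tensor/inference_calculator_utils.h"

    int main(int argc, char** argv) {
      // Applies command-line flags such as --xnnpack_default_num_threads=4.
      absl::ParseCommandLine(argc, argv);
      // Or set the default programmatically before any InferenceCalculator
      // calls GetXnnpackNumThreads(). The value 4 is a placeholder.
      absl::SetFlag(&FLAGS_xnnpack_default_num_threads, 4);
      // ... build and run the MediaPipe graph as usual ...
      return 0;
    }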