From 2c1ddb46fe5c37d8db546bc011dacbb83f0e2015 Mon Sep 17 00:00:00 2001 From: Viraj Wadhwa Date: Fri, 20 Dec 2024 16:42:48 -0700 Subject: [PATCH] (FIX) Changed test case B value with correct zp/scale values & corrected nuances in op implementation --- .../com.microsoft/dynamic_quantize_matmul.cpp | 18 +++++++++--------- .../tests/onnx_import_com_microsoft.in.cpp | 11 +++++------ 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/frontends/onnx/frontend/src/op/com.microsoft/dynamic_quantize_matmul.cpp b/src/frontends/onnx/frontend/src/op/com.microsoft/dynamic_quantize_matmul.cpp index e278a3a94e5034..5ac0705feb8b13 100644 --- a/src/frontends/onnx/frontend/src/op/com.microsoft/dynamic_quantize_matmul.cpp +++ b/src/frontends/onnx/frontend/src/op/com.microsoft/dynamic_quantize_matmul.cpp @@ -6,7 +6,7 @@ #include "exceptions.hpp" #include "openvino/frontend/exception.hpp" #include "openvino/op/subtract.hpp" -#include "openvino/op/divide.hpp" +#include "openvino/op/multiply.hpp" #include "openvino/op/matmul.hpp" #include "openvino/op/add.hpp" #include "openvino/op/convert.hpp" @@ -77,14 +77,14 @@ ov::OutputVector dynamic_quantize_matmul(const ov::frontend::onnx::Node& node) { } // At time of writing, ov::MatMul does not support int8/uint8 types. To get the correct output, we need to dequantize B. - // Technically this does not do DynamicQuantization, but is required for correct output of the operator - // B_dequantized = (B - b_zero_point) / b_scale - - // We also need to convert b_scale dtype to match B_dequantized, as OV "Divide" op requires both argument dtypes to match - - ov::Output B_dequantized = std::make_shared(B, b_zero_point); - B_dequantized = std::make_shared(B_dequantized, b_scale.get_element_type()); - B_dequantized = std::make_shared(B_dequantized, b_scale); + // Technically this does not do DynamicQuantization, but is required for correct output of the operator. It will implement A * B_dequantized + bias + // According to ONNX RT docs, they do linear quantization shown here https://tomwildenhain-microsoft.github.io/onnxruntime/docs/performance/quantization.html + // B_dequantized = (B - b_zero_point) * b_scale + + ov::Output B_dequantized = std::make_shared(B, b_scale.get_element_type()); + b_zero_point = std::make_shared(b_zero_point, b_scale.get_element_type()); + B_dequantized = std::make_shared(B_dequantized, b_zero_point); + B_dequantized = std::make_shared(B_dequantized, b_scale); // A, B are N-dimensional matrices. According to example ONNX models for this operator, the suboperations pass input A/B such that B's shape is already transposed. // E.g. https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.onnx diff --git a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp index cedef2acc9079d..f8b9904e609d08 100644 --- a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp +++ b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp @@ -1363,12 +1363,11 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_dynamic_quantize_matmul) { // Fill test case here const std::vector input_A{1.29292f, 2.47473f, 3.291903f, 4.1728945f, 5.213912f, 6.1293125f}; - const std::vector input_B{1, 2, 3, 125, 126, -128}; - const std::vector b_scale{6.28947502f}; - const std::vector b_zero_point{69}; + const std::vector input_B{-2, 29, 61, 61, 29, 125}; + const std::vector b_scale{0.003137f}; + const std::vector b_zero_point{-34}; - const std::vector expected{3.16867157e+02f, 3.40601959e+02f, -3.60632910e+03f, 6.47037888e+01f, 1.11719864e+02f, -6.54899121e+03f, - -7.19602890e+01f, -6.04708314e-01f, -9.75953906e+03f}; + const std::vector expected{0.8681802f, 0.7458673f, 1.6218146f, 1.5770973f, 1.4774824f, 3.0677009f, 2.3504133f, 2.2423527f, 4.611995f}; // add_input needs to be called in order of model inputs (order matters) test_case.add_input(Shape{3,2}, input_A); @@ -1379,7 +1378,7 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_dynamic_quantize_matmul) { test_case.add_expected_output(Shape{3,3}, expected); if (std::string("${BACKEND_NAME}") == std::string("IE_GPU")) { - test_case.run_with_tolerance_as_fp(0.0001f); + test_case.run_with_tolerance_as_fp(0.003f); } else { test_case.run(); }