From 2c1ddb46fe5c37d8db546bc011dacbb83f0e2015 Mon Sep 17 00:00:00 2001
From: Viraj Wadhwa <viraj.wadhwa@intel.com>
Date: Fri, 20 Dec 2024 16:42:48 -0700
Subject: [PATCH] (FIX) Changed test case B value with correct zp/scale values
 & corrected nuances in op implementation

---
 .../com.microsoft/dynamic_quantize_matmul.cpp  | 18 +++++++++---------
 .../tests/onnx_import_com_microsoft.in.cpp     | 11 +++++------
 2 files changed, 14 insertions(+), 15 deletions(-)
diff --git a/src/frontends/onnx/frontend/src/op/com.microsoft/dynamic_quantize_matmul.cpp b/src/frontends/onnx/frontend/src/op/com.microsoft/dynamic_quantize_matmul.cpp
index e278a3a94e5034..5ac0705feb8b13 100644
--- a/src/frontends/onnx/frontend/src/op/com.microsoft/dynamic_quantize_matmul.cpp
+++ b/src/frontends/onnx/frontend/src/op/com.microsoft/dynamic_quantize_matmul.cpp
@@ -6,7 +6,7 @@
 #include "exceptions.hpp"
 #include "openvino/frontend/exception.hpp"
 #include "openvino/op/subtract.hpp"
-#include "openvino/op/divide.hpp"
+#include "openvino/op/multiply.hpp"
 #include "openvino/op/matmul.hpp"
 #include "openvino/op/add.hpp"
 #include "openvino/op/convert.hpp"
@@ -77,14 +77,14 @@ ov::OutputVector dynamic_quantize_matmul(const ov::frontend::onnx::Node& node) {
     }
 
     // At time of writing, ov::MatMul does not support int8/uint8 types. To get the correct output, we need to dequantize B. 
-    // Technically this does not do DynamicQuantization, but is required for correct output of the operator
-    // B_dequantized = (B - b_zero_point) / b_scale
-    
-    // We also need to convert b_scale dtype to match B_dequantized, as OV "Divide" op requires both argument dtypes to match
-
-    ov::Output<ov::Node> B_dequantized = std::make_shared<v1::Subtract>(B, b_zero_point);
-    B_dequantized = std::make_shared<v0::Convert>(B_dequantized, b_scale.get_element_type());
-    B_dequantized = std::make_shared<v1::Divide>(B_dequantized, b_scale);
+    // Technically this does not perform DynamicQuantization, but it is required for the operator to produce correct output: the op is implemented as A * B_dequantized + bias.
+    // ONNX Runtime uses linear quantization, as described in its docs: https://tomwildenhain-microsoft.github.io/onnxruntime/docs/performance/quantization.html
+    // B_dequantized = (B - b_zero_point) * b_scale, with B and b_zero_point first converted to b_scale's element type
+
+    ov::Output<ov::Node> B_dequantized = std::make_shared<v0::Convert>(B, b_scale.get_element_type());
+    b_zero_point = std::make_shared<v0::Convert>(b_zero_point, b_scale.get_element_type());
+    B_dequantized = std::make_shared<v1::Subtract>(B_dequantized, b_zero_point);
+    B_dequantized = std::make_shared<v1::Multiply>(B_dequantized, b_scale);
 
     // A, B are N-dimensional matrices. According to example ONNX models for this operator, the suboperations pass input A/B such that B's shape is already transposed.
     // E.g. https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.onnx
diff --git a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
index cedef2acc9079d..f8b9904e609d08 100644
--- a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
+++ b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
@@ -1363,12 +1363,11 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_dynamic_quantize_matmul) {
 
     // Fill test case here
     const std::vector<float> input_A{1.29292f, 2.47473f, 3.291903f, 4.1728945f, 5.213912f, 6.1293125f};
-    const std::vector<int8_t> input_B{1, 2, 3, 125, 126, -128};
-    const std::vector<float> b_scale{6.28947502f};
-    const std::vector<int8_t> b_zero_point{69};
+    const std::vector<int8_t> input_B{-2, 29, 61, 61, 29, 125};
+    const std::vector<float> b_scale{0.003137f};
+    const std::vector<int8_t> b_zero_point{-34};
 
-    const std::vector<float> expected{3.16867157e+02f, 3.40601959e+02f, -3.60632910e+03f, 6.47037888e+01f, 1.11719864e+02f, -6.54899121e+03f,
-    -7.19602890e+01f, -6.04708314e-01f, -9.75953906e+03f};
+    const std::vector<float> expected{0.8681802f, 0.7458673f, 1.6218146f, 1.5770973f, 1.4774824f, 3.0677009f, 2.3504133f, 2.2423527f, 4.611995f};
 
     // add_input needs to be called in order of model inputs (order matters)
     test_case.add_input<float>(Shape{3,2}, input_A);
@@ -1379,7 +1378,7 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_dynamic_quantize_matmul) {
     test_case.add_expected_output<float>(Shape{3,3}, expected);
 
     if (std::string("${BACKEND_NAME}") == std::string("IE_GPU")) {
-        test_case.run_with_tolerance_as_fp(0.0001f);
+        test_case.run_with_tolerance_as_fp(0.003f);
     } else {
         test_case.run();
     }