Automated sync from github.com/tensorflow/tensorflow (#2445)
BUG=automated sync from upstream
NO_CHECK_TFLITE_FILES=automated sync from upstream
TFLM-bot authored Feb 6, 2024
1 parent 9d1dd7b commit 42f4bb8
Showing 6 changed files with 73 additions and 64 deletions.
48 changes: 48 additions & 0 deletions tensorflow/lite/kernels/internal/common.cc
@@ -17,6 +17,53 @@ limitations under the License.
 
 namespace tflite {
 
+// Single-rounding MultiplyByQuantizedMultiplier
+#if TFLITE_SINGLE_ROUNDING
+int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier,
+                                      int shift) {
+  TFLITE_DCHECK(quantized_multiplier >= 0);
+  TFLITE_DCHECK(shift >= -31 && shift <= 30);
+
+  const int64_t total_shift = 31 - shift;
+  const int64_t round = static_cast<int64_t>(1) << (total_shift - 1);
+  int64_t result = x * static_cast<int64_t>(quantized_multiplier) + round;
+  result = result >> total_shift;
+
+  TFLITE_DCHECK(result >= std::numeric_limits<int32_t>::min() &&
+                result <= std::numeric_limits<int32_t>::max());
+  return static_cast<int32_t>(result);
+}
+
+int32_t MultiplyByQuantizedMultiplier(int64_t x, int32_t quantized_multiplier,
+                                      int shift) {
+  // Inputs:
+  // - quantized_multiplier has fixed point at bit 31
+  // - shift is -31 to +7 (negative for right shift)
+  //
+  // Assumptions: The following input ranges are assumed
+  // - quantize_scale>=0 (the usual range is (1<<30) to (1<<31)-1)
+  // - scaling is chosen so final scaled result fits in int32_t
+  // - input x is in the range -(1<<47) <= x < (1<<47)
+  TFLITE_DCHECK(quantized_multiplier >= 0);
+  TFLITE_DCHECK(shift >= -31 && shift < 8);
+  TFLITE_DCHECK(x >= -(static_cast<int64_t>(1) << 47) &&
+                x < (static_cast<int64_t>(1) << 47));
+
+  const int32_t reduced_multiplier =
+      (quantized_multiplier < 0x7FFF0000)
+          ? ((quantized_multiplier + (1 << 15)) >> 16)
+          : 0x7FFF;
+  const int64_t total_shift = 15 - shift;
+  const int64_t round = static_cast<int64_t>(1) << (total_shift - 1);
+  int64_t result = x * static_cast<int64_t>(reduced_multiplier) + round;
+  result = result >> total_shift;
+
+  TFLITE_DCHECK(result >= std::numeric_limits<int32_t>::min() &&
+                result <= std::numeric_limits<int32_t>::max());
+  return static_cast<int32_t>(result);
+}
+// Double-rounding MultiplyByQuantizedMultiplier
+#else
 int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier,
                                       int shift) {
   using gemmlowp::RoundingDivideByPOT;
@@ -51,5 +98,6 @@ int32_t MultiplyByQuantizedMultiplier(int64_t x, int32_t quantized_multiplier,
   int32_t result = x >> total_shift;
   return result;
 }
+#endif  // TFLITE_SINGLE_ROUNDING
 
 }  // namespace tflite
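Note for readers: the functions added above consume a (quantized_multiplier, shift) pair that callers derive from a floating-point rescale factor, with scale roughly equal to quantized_multiplier * 2^(shift - 31). A rough, self-contained sketch of that decomposition (DecomposeScale is a hypothetical name used here for illustration; TFLite's own helper is QuantizeMultiplier in kernels/internal/quantization_util.h):

// Illustrative sketch, not part of this commit: decompose a positive float
// scale into the Q31 multiplier + shift pair consumed by
// MultiplyByQuantizedMultiplier.
#include <cmath>
#include <cstdint>

void DecomposeScale(double scale, int32_t* quantized_multiplier, int* shift) {
  if (scale == 0.0) {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
  const double q = std::frexp(scale, shift);  // scale == q * 2^shift, q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {  // q rounded up to 1.0: renormalize.
    q_fixed /= 2;
    ++*shift;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);  // in [1 << 30, 1 << 31)
}

// Example: with scale ~0.0008, shift comes out negative, and
//   int32_t out = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
// rescales an int32 accumulator by ~0.0008 with round-to-nearest.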
58 changes: 6 additions & 52 deletions tensorflow/lite/kernels/internal/common.h
@@ -257,24 +257,14 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size,
 #endif
 }
 
-// Single-rounding MultiplyByQuantizedMultiplier
-#if TFLITE_SINGLE_ROUNDING
-inline int32_t MultiplyByQuantizedMultiplier(int32_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  TFLITE_DCHECK(quantized_multiplier >= 0);
-  TFLITE_DCHECK(shift >= -31 && shift <= 30);
-
-  const int64_t total_shift = 31 - shift;
-  const int64_t round = static_cast<int64_t>(1) << (total_shift - 1);
-  int64_t result = x * static_cast<int64_t>(quantized_multiplier) + round;
-  result = result >> total_shift;
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int32_t x, int32_t quantized_multiplier, int shift);
 
-  TFLITE_DCHECK(result >= std::numeric_limits<int32_t>::min() &&
-                result <= std::numeric_limits<int32_t>::max());
-  return static_cast<int32_t>(result);
-}
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int64_t x, int32_t quantized_multiplier, int shift);
 
+// Single-rounding MultiplyByQuantizedMultiplier
+#if TFLITE_SINGLE_ROUNDING
 inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(
     int32_t x, int32_t quantized_multiplier, int shift) {
   TFLITE_DCHECK_LE(shift, 0);
@@ -287,36 +277,6 @@ inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
   return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
 }
 
-inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  // Inputs:
-  // - quantized_multiplier has fixed point at bit 31
-  // - shift is -31 to +7 (negative for right shift)
-  //
-  // Assumptions: The following input ranges are assumed
-  // - quantize_scale>=0 (the usual range is (1<<30) to (1<<31)-1)
-  // - scaling is chosen so final scaled result fits in int32_t
-  // - input x is in the range -(1<<47) <= x < (1<<47)
-  TFLITE_DCHECK(quantized_multiplier >= 0);
-  TFLITE_DCHECK(shift >= -31 && shift < 8);
-  TFLITE_DCHECK(x >= -(static_cast<int64_t>(1) << 47) &&
-                x < (static_cast<int64_t>(1) << 47));
-
-  const int32_t reduced_multiplier =
-      (quantized_multiplier < 0x7FFF0000)
-          ? ((quantized_multiplier + (1 << 15)) >> 16)
-          : 0x7FFF;
-  const int64_t total_shift = 15 - shift;
-  const int64_t round = static_cast<int64_t>(1) << (total_shift - 1);
-  int64_t result = x * static_cast<int64_t>(reduced_multiplier) + round;
-  result = result >> total_shift;
-
-  TFLITE_DCHECK(result >= std::numeric_limits<int32_t>::min() &&
-                result <= std::numeric_limits<int32_t>::max());
-  return static_cast<int32_t>(result);
-}
-
 #ifdef USE_NEON
 inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
     int32x4x4_t input_val, int32_t quantized_multiplier, int shift) {
@@ -366,12 +326,6 @@ inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
                                            quantized_multiplier);
 }
 
-TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
-    int32_t x, int32_t quantized_multiplier, int shift);
-
-TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
-    int64_t x, int32_t quantized_multiplier, int shift);
-
 #ifdef USE_NEON
 // Round uses ARM's rounding shift right.
 inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
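A note on the int64_t overload whose definition moved into common.cc: with |x| allowed up to 2^47, multiplying by a full 31-bit multiplier could overflow a signed 64-bit accumulator, so the implementation first folds the multiplier to 15 bits (reduced_multiplier), bounding the product by 47 + 15 = 62 bits at the cost of the multiplier's low 16 bits. A standalone sketch contrasting the two single-rounding paths (illustrative values only; MulQ31/MulQ15 restate the code from common.cc above and are not library functions):

#include <cstdint>
#include <cstdio>

static int32_t MulQ31(int32_t x, int32_t multiplier, int shift) {
  const int64_t total_shift = 31 - shift;
  const int64_t round = int64_t{1} << (total_shift - 1);
  return static_cast<int32_t>(
      (x * static_cast<int64_t>(multiplier) + round) >> total_shift);
}

static int32_t MulQ15(int64_t x, int32_t multiplier, int shift) {
  // Fold the Q31 multiplier to Q15 so a 47-bit x times a 15-bit multiplier
  // stays inside a signed 64-bit accumulator.
  const int32_t reduced =
      (multiplier < 0x7FFF0000) ? ((multiplier + (1 << 15)) >> 16) : 0x7FFF;
  const int64_t total_shift = 15 - shift;
  const int64_t round = int64_t{1} << (total_shift - 1);
  return static_cast<int32_t>(
      (x * static_cast<int64_t>(reduced) + round) >> total_shift);
}

int main() {
  const int32_t multiplier = 1518500250;  // ~0.707 in Q31
  const int shift = -3;
  // The two paths agree to within the precision lost by dropping the low
  // 16 multiplier bits.
  std::printf("%d vs %d\n", MulQ31(123456, multiplier, shift),
              MulQ15(int64_t{123456}, multiplier, shift));
  return 0;
}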
3 changes: 2 additions & 1 deletion tensorflow/lite/kernels/internal/portable_tensor_utils.h
@@ -241,7 +241,8 @@ void SparseMatrixBatchVectorMultiplyAccumulate1x16(
     const int32_t* __restrict__ indices, int m_rows, int m_cols,
     const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
     int n_batch, const int32_t input_offset, const int32_t output_multiplier,
-    const int32_t output_shift, const int32_t output_offset,
+    int32_t output_shift, const int32_t* per_channel_scale,
+    const int32_t* per_channel_shift, int32_t output_offset,
     const int32_t output_activation_min, const int32_t output_activation_max,
     int8_t* __restrict__ result);
 
@@ -157,7 +157,7 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
       *result += dotprod * batch_scaling_factor;
       ++result;
     }  // for row
-  } // for batch
+  }  // for batch
 }
 
 void PortableMatrixBatchVectorMultiplyAccumulate(
@@ -200,7 +200,7 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
       *result += dotprod * scale;
       ++result;
     }  // for row
-  } // for batch
+  }  // for batch
 }
 
 void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
@@ -232,7 +232,8 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
     const int32_t* __restrict__ indices, int m_rows, int m_cols,
     const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
     int n_batch, const int32_t input_offset, const int32_t output_multiplier,
-    const int32_t output_shift, const int32_t output_offset,
+    const int32_t output_shift, const int32_t* per_channel_scale,
+    const int32_t* per_channel_shift, const int32_t output_offset,
     const int32_t output_activation_min, const int32_t output_activation_max,
     int8_t* __restrict__ result) {
   const int kBlockSize = 16;
@@ -252,8 +253,10 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
         }
       }
      const int32_t bias_value = bias_vector != nullptr ? bias_vector[row] : 0;
-      dot_prod = MultiplyByQuantizedMultiplier(dot_prod + bias_value,
-                                               output_multiplier, output_shift);
+      dot_prod = MultiplyByQuantizedMultiplier(
+          dot_prod + bias_value,
+          per_channel_scale ? per_channel_scale[row] : output_multiplier,
+          per_channel_shift ? per_channel_shift[row] : output_shift);
       dot_prod += output_offset;
       result[batch * m_rows + row] =
           static_cast<int8_t>(ActivationFunctionWithMinMax(
@@ -319,14 +322,14 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate(
         for (int c = 0; c < kBlockSize; c++) {
           dotprod += (*row_ptr++) * (*vector_block_ptr++);
         }  // for block
-      } // for num_nonzero_blocks
+      }  // for num_nonzero_blocks
       float scaling_factor = batch_scaling_factor;
       if (per_channel_scale) {
         scaling_factor *= per_channel_scale[row];
       }
       result[batch * m_rows + row] += dotprod * scaling_factor;
     }  // for row
-  } // for batch
+  }  // for batch
 }
 
 template <typename T>
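The substantive change in the 1x16 kernel above is per-row requantization: when per_channel_scale/per_channel_shift are non-null they override the per-tensor output_multiplier/output_shift for each output row. A minimal sketch of that selection in isolation (RequantizeRow and its argument list are hypothetical, not part of this commit; only the two ternaries mirror the diff):

#include <algorithm>
#include <cstdint>

// Matches the declaration in common.h.
int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier,
                                      int shift);

void RequantizeRow(const int32_t* acc, int row, int n,
                   int32_t output_multiplier, int32_t output_shift,
                   const int32_t* per_channel_scale,
                   const int32_t* per_channel_shift, int32_t output_offset,
                   int32_t act_min, int32_t act_max, int8_t* out) {
  // Null per-channel arrays mean "use the per-tensor parameters everywhere",
  // exactly as in the 1x16 kernel.
  const int32_t multiplier =
      per_channel_scale ? per_channel_scale[row] : output_multiplier;
  const int32_t shift =
      per_channel_shift ? per_channel_shift[row] : output_shift;
  for (int i = 0; i < n; ++i) {
    const int32_t v =
        MultiplyByQuantizedMultiplier(acc[i], multiplier, shift) + output_offset;
    out[i] = static_cast<int8_t>(std::clamp(v, act_min, act_max));
  }
}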
@@ -116,14 +116,16 @@ void SparseMatrixBatchVectorMultiplyAccumulate1x16(
     const int32_t* __restrict__ indices, int m_rows, int m_cols,
     const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
     int n_batch, const int32_t input_offset, const int32_t output_multiplier,
-    const int32_t output_shift, const int32_t output_offset,
+    const int32_t output_shift, const int32_t* per_channel_scale,
+    const int32_t* per_channel_shift, const int32_t output_offset,
     const int32_t output_activation_min, const int32_t output_activation_max,
 
     int8_t* __restrict__ result) {
   PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
       matrix, segments, indices, m_rows, m_cols, vector, bias_vector, n_batch,
-      input_offset, output_multiplier, output_shift, output_offset,
-      output_activation_min, output_activation_max, result);
+      input_offset, output_multiplier, output_shift, per_channel_scale,
+      per_channel_shift, output_offset, output_activation_min,
+      output_activation_max, result);
 }
 
 void SparseMatrixBatchVectorMultiplyAccumulate(
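For callers of SparseMatrixBatchVectorMultiplyAccumulate1x16, the net effect is two pointer arguments spliced between output_shift and output_offset; passing nullptr for both keeps the old per-tensor behavior. A hedged usage sketch (all data values are made up, and the leading matrix/segments parameter lines, which fall outside the hunks shown, are reconstructed from the call site above):

#include <cstdint>

void SparseMatrixBatchVectorMultiplyAccumulate1x16(
    const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
    int n_batch, const int32_t input_offset, const int32_t output_multiplier,
    int32_t output_shift, const int32_t* per_channel_scale,
    const int32_t* per_channel_shift, int32_t output_offset,
    const int32_t output_activation_min, const int32_t output_activation_max,
    int8_t* __restrict__ result);

void Example() {
  // A 2x16 matrix holding one nonzero 16-wide block per row.
  int8_t matrix[32] = {};                // block-sparse weights (zeros for brevity)
  const int32_t segments[] = {0, 1, 2};  // row r owns blocks [segments[r], segments[r+1])
  const int32_t indices[] = {0, 0};      // both blocks sit at block-column 0
  int8_t vector[16] = {};                // one input batch
  int8_t result[2] = {};

  // Per-row requantization: row 0 scales by ~0.5 (1 << 30 in Q31),
  // row 1 by ~0.25 (1 << 29).
  const int32_t per_channel_scale[] = {1 << 30, 1 << 29};
  const int32_t per_channel_shift[] = {0, 0};

  SparseMatrixBatchVectorMultiplyAccumulate1x16(
      matrix, segments, indices, /*m_rows=*/2, /*m_cols=*/16, vector,
      /*bias_vector=*/nullptr, /*n_batch=*/1, /*input_offset=*/0,
      /*output_multiplier=*/1 << 30, /*output_shift=*/0, per_channel_scale,
      per_channel_shift, /*output_offset=*/0, /*output_activation_min=*/-128,
      /*output_activation_max=*/127, result);
}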
@@ -92,7 +92,8 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
     const int32_t* __restrict__ indices, int m_rows, int m_cols,
     const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
     int n_batch, const int32_t input_offset, const int32_t output_multiplier,
-    const int32_t output_shift, const int32_t output_offset,
+    int32_t output_shift, const int32_t* per_channel_scale,
+    const int32_t* per_channel_shift, int32_t output_offset,
     const int32_t output_activation_min, const int32_t output_activation_max,
     int8_t* __restrict__ result);
 
