Automated sync from github.com/tensorflow/tensorflow (#2445)
BUG=automated sync from upstream
NO_CHECK_TFLITE_FILES=automated sync from upstream
TFLM-bot authored Feb 6, 2024
1 parent 9d1dd7b commit 42f4bb8
Showing 6 changed files with 73 additions and 64 deletions.
48 changes: 48 additions & 0 deletions tensorflow/lite/kernels/internal/common.cc
@@ -17,6 +17,53 @@ limitations under the License.
 
 namespace tflite {
 
+// Single-rounding MultiplyByQuantizedMultiplier
+#if TFLITE_SINGLE_ROUNDING
+int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier,
+                                      int shift) {
+  TFLITE_DCHECK(quantized_multiplier >= 0);
+  TFLITE_DCHECK(shift >= -31 && shift <= 30);
+
+  const int64_t total_shift = 31 - shift;
+  const int64_t round = static_cast<int64_t>(1) << (total_shift - 1);
+  int64_t result = x * static_cast<int64_t>(quantized_multiplier) + round;
+  result = result >> total_shift;
+
+  TFLITE_DCHECK(result >= std::numeric_limits<int32_t>::min() &&
+                result <= std::numeric_limits<int32_t>::max());
+  return static_cast<int32_t>(result);
+}
+
+int32_t MultiplyByQuantizedMultiplier(int64_t x, int32_t quantized_multiplier,
+                                      int shift) {
+  // Inputs:
+  // - quantized_multiplier has fixed point at bit 31
+  // - shift is -31 to +7 (negative for right shift)
+  //
+  // Assumptions: The following input ranges are assumed
+  // - quantize_scale>=0 (the usual range is (1<<30) to (1<<31)-1)
+  // - scaling is chosen so final scaled result fits in int32_t
+  // - input x is in the range -(1<<47) <= x < (1<<47)
+  TFLITE_DCHECK(quantized_multiplier >= 0);
+  TFLITE_DCHECK(shift >= -31 && shift < 8);
+  TFLITE_DCHECK(x >= -(static_cast<int64_t>(1) << 47) &&
+                x < (static_cast<int64_t>(1) << 47));
+
+  const int32_t reduced_multiplier =
+      (quantized_multiplier < 0x7FFF0000)
+          ? ((quantized_multiplier + (1 << 15)) >> 16)
+          : 0x7FFF;
+  const int64_t total_shift = 15 - shift;
+  const int64_t round = static_cast<int64_t>(1) << (total_shift - 1);
+  int64_t result = x * static_cast<int64_t>(reduced_multiplier) + round;
+  result = result >> total_shift;
+
+  TFLITE_DCHECK(result >= std::numeric_limits<int32_t>::min() &&
+                result <= std::numeric_limits<int32_t>::max());
+  return static_cast<int32_t>(result);
+}
+// Double-rounding MultiplyByQuantizedMultiplier
+#else
 int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier,
                                       int shift) {
   using gemmlowp::RoundingDivideByPOT;
@@ -51,5 +98,6 @@ int32_t MultiplyByQuantizedMultiplier(int64_t x, int32_t quantized_multiplier,
   int32_t result = x >> total_shift;
   return result;
 }
+#endif  // TFLITE_SINGLE_ROUNDING
 
 }  // namespace tflite
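Note for readers: the functions added above consume a (quantized_multiplier, shift) pair that callers derive from a floating-point rescale factor, with scale roughly equal to quantized_multiplier * 2^(shift - 31). A rough, self-contained sketch of that decomposition (DecomposeScale is a hypothetical name used here for illustration; TFLite's own helper is QuantizeMultiplier in kernels/internal/quantization_util.h):

// Illustrative sketch, not part of this commit: decompose a positive float
// scale into the Q31 multiplier + shift pair consumed by
// MultiplyByQuantizedMultiplier.
#include <cmath>
#include <cstdint>

void DecomposeScale(double scale, int32_t* quantized_multiplier, int* shift) {
  if (scale == 0.0) {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
  const double q = std::frexp(scale, shift);  // scale == q * 2^shift, q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {  // q rounded up to 1.0: renormalize.
    q_fixed /= 2;
    ++*shift;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);  // in [1 << 30, 1 << 31)
}

// Example: with scale ~0.0008, shift comes out negative, and
//   int32_t out = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
// rescales an int32 accumulator by ~0.0008 with round-to-nearest.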
58 changes: 6 additions & 52 deletions tensorflow/lite/kernels/internal/common.h
@@ -257,24 +257,14 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size,
 #endif
 }
 
-// Single-rounding MultiplyByQuantizedMultiplier
-#if TFLITE_SINGLE_ROUNDING
-inline int32_t MultiplyByQuantizedMultiplier(int32_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  TFLITE_DCHECK(quantized_multiplier >= 0);
-  TFLITE_DCHECK(shift >= -31 && shift <= 30);
-
-  const int64_t total_shift = 31 - shift;
-  const int64_t round = static_cast<int64_t>(1) << (total_shift - 1);
-  int64_t result = x * static_cast<int64_t>(quantized_multiplier) + round;
-  result = result >> total_shift;
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int32_t x, int32_t quantized_multiplier, int shift);
 
-  TFLITE_DCHECK(result >= std::numeric_limits<int32_t>::min() &&
-                result <= std::numeric_limits<int32_t>::max());
-  return static_cast<int32_t>(result);
-}
+TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
+    int64_t x, int32_t quantized_multiplier, int shift);
 
+// Single-rounding MultiplyByQuantizedMultiplier
+#if TFLITE_SINGLE_ROUNDING
 inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(
     int32_t x, int32_t quantized_multiplier, int shift) {
   TFLITE_DCHECK_LE(shift, 0);
@@ -287,36 +277,6 @@ inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
   return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
 }
 
-inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
-                                             int32_t quantized_multiplier,
-                                             int shift) {
-  // Inputs:
-  // - quantized_multiplier has fixed point at bit 31
-  // - shift is -31 to +7 (negative for right shift)
-  //
-  // Assumptions: The following input ranges are assumed
-  // - quantize_scale>=0 (the usual range is (1<<30) to (1<<31)-1)
-  // - scaling is chosen so final scaled result fits in int32_t
-  // - input x is in the range -(1<<47) <= x < (1<<47)
-  TFLITE_DCHECK(quantized_multiplier >= 0);
-  TFLITE_DCHECK(shift >= -31 && shift < 8);
-  TFLITE_DCHECK(x >= -(static_cast<int64_t>(1) << 47) &&
-                x < (static_cast<int64_t>(1) << 47));
-
-  const int32_t reduced_multiplier =
-      (quantized_multiplier < 0x7FFF0000)
-          ? ((quantized_multiplier + (1 << 15)) >> 16)
-          : 0x7FFF;
-  const int64_t total_shift = 15 - shift;
-  const int64_t round = static_cast<int64_t>(1) << (total_shift - 1);
-  int64_t result = x * static_cast<int64_t>(reduced_multiplier) + round;
-  result = result >> total_shift;
-
-  TFLITE_DCHECK(result >= std::numeric_limits<int32_t>::min() &&
-                result <= std::numeric_limits<int32_t>::max());
-  return static_cast<int32_t>(result);
-}
-
 #ifdef USE_NEON
 inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
     int32x4x4_t input_val, int32_t quantized_multiplier, int shift) {
@@ -366,12 +326,6 @@ inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
                                            quantized_multiplier);
 }
 
-TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
-    int32_t x, int32_t quantized_multiplier, int shift);
-
-TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
-    int64_t x, int32_t quantized_multiplier, int shift);
-
 #ifdef USE_NEON
 // Round uses ARM's rounding shift right.
 inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
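A note on the int64_t overload whose definition moved into common.cc: with |x| allowed up to 2^47, multiplying by a full 31-bit multiplier could overflow a signed 64-bit accumulator, so the implementation first folds the multiplier to 15 bits (reduced_multiplier), bounding the product by 47 + 15 = 62 bits at the cost of the multiplier's low 16 bits. A standalone sketch contrasting the two single-rounding paths (illustrative values only; MulQ31/MulQ15 restate the code from common.cc above and are not library functions):

#include <cstdint>
#include <cstdio>

static int32_t MulQ31(int32_t x, int32_t multiplier, int shift) {
  const int64_t total_shift = 31 - shift;
  const int64_t round = int64_t{1} << (total_shift - 1);
  return static_cast<int32_t>(
      (x * static_cast<int64_t>(multiplier) + round) >> total_shift);
}

static int32_t MulQ15(int64_t x, int32_t multiplier, int shift) {
  // Fold the Q31 multiplier to Q15 so a 47-bit x times a 15-bit multiplier
  // stays inside a signed 64-bit accumulator.
  const int32_t reduced =
      (multiplier < 0x7FFF0000) ? ((multiplier + (1 << 15)) >> 16) : 0x7FFF;
  const int64_t total_shift = 15 - shift;
  const int64_t round = int64_t{1} << (total_shift - 1);
  return static_cast<int32_t>(
      (x * static_cast<int64_t>(reduced) + round) >> total_shift);
}

int main() {
  const int32_t multiplier = 1518500250;  // ~0.707 in Q31
  const int shift = -3;
  // The two paths agree to within the precision lost by dropping the low
  // 16 multiplier bits.
  std::printf("%d vs %d\n", MulQ31(123456, multiplier, shift),
              MulQ15(int64_t{123456}, multiplier, shift));
  return 0;
}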
3 changes: 2 additions & 1 deletion tensorflow/lite/kernels/internal/portable_tensor_utils.h
@@ -241,7 +241,8 @@ void SparseMatrixBatchVectorMultiplyAccumulate1x16(
     const int32_t* __restrict__ indices, int m_rows, int m_cols,
     const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
     int n_batch, const int32_t input_offset, const int32_t output_multiplier,
-    const int32_t output_shift, const int32_t output_offset,
+    int32_t output_shift, const int32_t* per_channel_scale,
+    const int32_t* per_channel_shift, int32_t output_offset,
     const int32_t output_activation_min, const int32_t output_activation_max,
     int8_t* __restrict__ result);
 
@@ -157,7 +157,7 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
       *result += dotprod * batch_scaling_factor;
       ++result;
     }  // for row
-  } // for batch
+  }  // for batch
 }
 
 void PortableMatrixBatchVectorMultiplyAccumulate(
@@ -200,7 +200,7 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
       *result += dotprod * scale;
       ++result;
     }  // for row
-  } // for batch
+  }  // for batch
 }
 
 void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
@@ -232,7 +232,8 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
     const int32_t* __restrict__ indices, int m_rows, int m_cols,
     const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
     int n_batch, const int32_t input_offset, const int32_t output_multiplier,
-    const int32_t output_shift, const int32_t output_offset,
+    const int32_t output_shift, const int32_t* per_channel_scale,
+    const int32_t* per_channel_shift, const int32_t output_offset,
     const int32_t output_activation_min, const int32_t output_activation_max,
     int8_t* __restrict__ result) {
   const int kBlockSize = 16;
@@ -252,8 +253,10 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
         }
       }
      const int32_t bias_value = bias_vector != nullptr ? bias_vector[row] : 0;
-      dot_prod = MultiplyByQuantizedMultiplier(dot_prod + bias_value,
-                                               output_multiplier, output_shift);
+      dot_prod = MultiplyByQuantizedMultiplier(
+          dot_prod + bias_value,
+          per_channel_scale ? per_channel_scale[row] : output_multiplier,
+          per_channel_shift ? per_channel_shift[row] : output_shift);
       dot_prod += output_offset;
       result[batch * m_rows + row] =
           static_cast<int8_t>(ActivationFunctionWithMinMax(
@@ -319,14 +322,14 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate(
         for (int c = 0; c < kBlockSize; c++) {
           dotprod += (*row_ptr++) * (*vector_block_ptr++);
         }  // for block
-      } // for num_nonzero_blocks
+      }  // for num_nonzero_blocks
       float scaling_factor = batch_scaling_factor;
       if (per_channel_scale) {
         scaling_factor *= per_channel_scale[row];
       }
       result[batch * m_rows + row] += dotprod * scaling_factor;
     }  // for row
-  } // for batch
+  }  // for batch
 }
 
 template <typename T>
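The substantive change in the 1x16 kernel above is per-row requantization: when per_channel_scale/per_channel_shift are non-null they override the per-tensor output_multiplier/output_shift for each output row. A minimal sketch of that selection in isolation (RequantizeRow and its argument list are hypothetical, not part of this commit; only the two ternaries mirror the diff):

#include <algorithm>
#include <cstdint>

// Matches the declaration in common.h.
int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier,
                                      int shift);

void RequantizeRow(const int32_t* acc, int row, int n,
                   int32_t output_multiplier, int32_t output_shift,
                   const int32_t* per_channel_scale,
                   const int32_t* per_channel_shift, int32_t output_offset,
                   int32_t act_min, int32_t act_max, int8_t* out) {
  // Null per-channel arrays mean "use the per-tensor parameters everywhere",
  // exactly as in the 1x16 kernel.
  const int32_t multiplier =
      per_channel_scale ? per_channel_scale[row] : output_multiplier;
  const int32_t shift =
      per_channel_shift ? per_channel_shift[row] : output_shift;
  for (int i = 0; i < n; ++i) {
    const int32_t v =
        MultiplyByQuantizedMultiplier(acc[i], multiplier, shift) + output_offset;
    out[i] = static_cast<int8_t>(std::clamp(v, act_min, act_max));
  }
}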
@@ -116,14 +116,16 @@ void SparseMatrixBatchVectorMultiplyAccumulate1x16(
     const int32_t* __restrict__ indices, int m_rows, int m_cols,
     const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
     int n_batch, const int32_t input_offset, const int32_t output_multiplier,
-    const int32_t output_shift, const int32_t output_offset,
+    const int32_t output_shift, const int32_t* per_channel_scale,
+    const int32_t* per_channel_shift, const int32_t output_offset,
     const int32_t output_activation_min, const int32_t output_activation_max,
 
     int8_t* __restrict__ result) {
   PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
       matrix, segments, indices, m_rows, m_cols, vector, bias_vector, n_batch,
-      input_offset, output_multiplier, output_shift, output_offset,
-      output_activation_min, output_activation_max, result);
+      input_offset, output_multiplier, output_shift, per_channel_scale,
+      per_channel_shift, output_offset, output_activation_min,
+      output_activation_max, result);
 }
 
 void SparseMatrixBatchVectorMultiplyAccumulate(
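For callers of SparseMatrixBatchVectorMultiplyAccumulate1x16, the net effect is two pointer arguments spliced between output_shift and output_offset; passing nullptr for both keeps the old per-tensor behavior. A hedged usage sketch (all data values are made up, and the leading matrix/segments parameter lines, which fall outside the hunks shown, are reconstructed from the call site above):

#include <cstdint>

void SparseMatrixBatchVectorMultiplyAccumulate1x16(
    const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
    int n_batch, const int32_t input_offset, const int32_t output_multiplier,
    int32_t output_shift, const int32_t* per_channel_scale,
    const int32_t* per_channel_shift, int32_t output_offset,
    const int32_t output_activation_min, const int32_t output_activation_max,
    int8_t* __restrict__ result);

void Example() {
  // A 2x16 matrix holding one nonzero 16-wide block per row.
  int8_t matrix[32] = {};                // block-sparse weights (zeros for brevity)
  const int32_t segments[] = {0, 1, 2};  // row r owns blocks [segments[r], segments[r+1])
  const int32_t indices[] = {0, 0};      // both blocks sit at block-column 0
  int8_t vector[16] = {};                // one input batch
  int8_t result[2] = {};

  // Per-row requantization: row 0 scales by ~0.5 (1 << 30 in Q31),
  // row 1 by ~0.25 (1 << 29).
  const int32_t per_channel_scale[] = {1 << 30, 1 << 29};
  const int32_t per_channel_shift[] = {0, 0};

  SparseMatrixBatchVectorMultiplyAccumulate1x16(
      matrix, segments, indices, /*m_rows=*/2, /*m_cols=*/16, vector,
      /*bias_vector=*/nullptr, /*n_batch=*/1, /*input_offset=*/0,
      /*output_multiplier=*/1 << 30, /*output_shift=*/0, per_channel_scale,
      per_channel_shift, /*output_offset=*/0, /*output_activation_min=*/-128,
      /*output_activation_max=*/127, result);
}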
@@ -92,7 +92,8 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
     const int32_t* __restrict__ indices, int m_rows, int m_cols,
     const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
     int n_batch, const int32_t input_offset, const int32_t output_multiplier,
-    const int32_t output_shift, const int32_t output_offset,
+    int32_t output_shift, const int32_t* per_channel_scale,
+    const int32_t* per_channel_shift, int32_t output_offset,
     const int32_t output_activation_min, const int32_t output_activation_max,
     int8_t* __restrict__ result);
 
