/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/xnnpack/runtime/utils/utils.h>

#include <executorch/runtime/platform/assert.h>

#include <cinttypes>

namespace executorch {
namespace backends {
namespace xnnpack {
namespace utils {

using executorch::aten::ScalarType;
using executorch::aten::Tensor;
using executorch::runtime::Error;

constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f;

Error ChooseQuantizationParams(
    float min,
    float max,
    int32_t qmin,
    int32_t qmax,
    QuantizationParams& result,
    bool preserve_sparsity = false,
    bool force_scale_power_of_two = false,
    bool reduce_range = false) {
  ET_CHECK_OR_RETURN_ERROR(
      min <= max,
      Internal,
      "In ChooseQuantizationParams, min should be less than or equal to max. min: %f, max: %f",
      min,
      max);

  if (reduce_range) {
    qmin = qmin / 2;
    qmax = qmax / 2;
  }
  if (min < 0 && max > 0 && preserve_sparsity) {
    int symmetric_qmin = -((qmax - qmin) / 2 + 1);
    int symmetric_qmax = (qmax - qmin) / 2;
    double max_scale =
        std::max(fabs(min / symmetric_qmin), fabs(max / symmetric_qmax));
    min = max_scale * symmetric_qmin;
    max = max_scale * symmetric_qmax;
  }

  // We extend the [min, max] interval to ensure that it contains 0.
  // Otherwise, we would not meet the requirement that 0 be an exactly
  // representable value.
  min = std::min(min, 0.f);
  max = std::max(max, 0.f);

  ET_CHECK_OR_RETURN_ERROR(
      qmin < qmax,
      Internal,
      "In ChooseQuantizationParams, qmin should be less than qmax");

  // Use double precision for intermediate computation but use single precision
  // in the final number to reflect the actual number used during quantization.
  double scale = (static_cast<double>(max) - min) / (qmax - qmin);
  // If scale is 0 or too small so that its reciprocal is infinity, we
  // arbitrarily adjust the scale to 0.1. We want to avoid scale's reciprocal
  // being infinity because some of the fbgemm code pre-computes scale's
  // reciprocal to do multiplication instead of division in the time-critical
  // part of the code.
  if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) {
    scale = 0.1;
  }
  ET_CHECK_OR_RETURN_ERROR(
      scale > 0, Internal, "quantization scale should be > 0");

  if (force_scale_power_of_two) {
    if (scale < 1) {
      scale = 1.0 / (1 << static_cast<int>(floor(log(1.0 / scale) / log(2))));
    } else {
      scale = 1 << static_cast<int>(ceil(log(scale) / log(2)));
    }
  }

  // Cut off small scale
  if (scale < SMALL_SCALE_THRESHOLD) {
    float org_scale = scale;
    scale = SMALL_SCALE_THRESHOLD;
    // Adjust the min and max based on the new scale
    if (min == 0.0f) {
      max = SMALL_SCALE_THRESHOLD * (qmax - qmin);
    } else if (max == 0.0f) {
      min = -SMALL_SCALE_THRESHOLD * (qmax - qmin);
    } else {
      float amplifier = SMALL_SCALE_THRESHOLD / org_scale;
      min *= amplifier;
      max *= amplifier;
    }
  }

  // Zero-point computation.
  // First the initial floating-point computation. The zero point can be
  // determined by solving an affine equation for any known pair
  // (real value, corresponding quantized value).
  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
  // The arithmetic error on the zero point computed from either pair
  // will be roughly machine_epsilon * (sum of absolute values of terms),
  // so we want to use the variant that adds the smaller terms.
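  //
  // Worked example (illustrative; the values are assumptions, not from the
  // original source): for min = -1.0f, max = 1.0f, qmin = 0, qmax = 255,
  // scale = 2.0 / 255 ≈ 0.0078431. Then
  //   zero_point_from_min = 0 - (-1.0 / 0.0078431) ≈ 127.5
  //   zero_point_from_max = 255 - (1.0 / 0.0078431) ≈ 127.5
  // Both pairs agree here, and the nudging step below rounds 127.5 to 128
  // via nearbyint under the default round-to-nearest-even mode.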
  double zero_point_from_min = qmin - min / static_cast<double>(scale);
  double zero_point_from_max = qmax - max / static_cast<double>(scale);
  double zero_point_from_min_error =
      std::abs(qmin) - std::abs(min / static_cast<double>(scale));
  double zero_point_from_max_error =
      std::abs(qmax) - std::abs(max / static_cast<double>(scale));
  double initial_zero_point =
      zero_point_from_min_error < zero_point_from_max_error
      ? zero_point_from_min
      : zero_point_from_max;

  // For symmetric quantization (preserve_sparsity == true), we force
  // zero_point to be a middle value between qmin and qmax.
  // If either min or max is 0, then we just use 0 as zero_point.
  if (min < 0 && max > 0 && preserve_sparsity) {
    initial_zero_point = static_cast<double>(qmin + qmax) / 2;
  }

  // Now we need to nudge the zero point to be an integer
  // (our zero points are integers, and this is motivated by the requirement
  // to be able to represent the real value "0" exactly as a quantized value,
  // which is required in multiple places, for example in Im2col with zero
  // padding).
  int32_t nudged_zero_point = 0;
  if (initial_zero_point < qmin) {
    nudged_zero_point = qmin;
  } else if (initial_zero_point > qmax) {
    nudged_zero_point = qmax;
  } else {
    nudged_zero_point = nearbyint(initial_zero_point);
  }

  result.scale = scale;
  result.zero_point = nudged_zero_point;
  return Error::Ok;
}

Error GenerateRequantizationScale(
    const Tensor& weight_scales,
    float input_scale,
    float output_scale,
    std::vector<float>& requant_scales) {
  // Since the weight scales are allocated with padding,
  // weight_scales.numel() gives us the padded number of elements.
  const auto num_output_channels_padded = weight_scales.numel();
  const float* weight_scales_data = weight_scales.const_data_ptr<float>();
  if (static_cast<int64_t>(requant_scales.size()) <
      num_output_channels_padded) {
    requant_scales.resize(num_output_channels_padded);
  }
  for (int i = 0; i < num_output_channels_padded; ++i) {
    const auto inverse_output_scale = 1.f / output_scale;
    requant_scales[i] =
        (weight_scales_data[i] * input_scale) * inverse_output_scale;
    ET_CHECK_OR_RETURN_ERROR(
        requant_scales[i] > 0.0f && std::isnormal(requant_scales[i]),
        Internal,
        "failed to create op with requantization scale");
  }
  return Error::Ok;
}

std::pair<float, float> GetMinMax(const Tensor& ft) {
  float min = std::numeric_limits<float>::max();
  float max = -std::numeric_limits<float>::max();
  ET_CHECK_MSG(
      ft.scalar_type() == ScalarType::Float,
      "Expected float tensor but got %" PRId8,
      static_cast<int8_t>(ft.scalar_type()));
  const float* d = ft.const_data_ptr<float>();
  for (int i = 0; i < ft.numel(); ++i) {
    min = (d[i] < min) ? d[i] : min;
    max = (d[i] > max) ? d[i] : max;
  }
  return std::pair<float, float>(min, max);
}

#ifdef __aarch64__
template <>
uint8x8_t vqmov<uint8x8_t>(int16x8_t vraw) {
  return vqmovun_s16(vraw);
}

template <>
int8x8_t vqmov<int8x8_t>(int16x8_t vraw) {
  return vqmovn_s16(vraw);
}

template <>
void vst1<uint8_t, uint8x8_t>(uint8_t* out, uint8x8_t vout) {
  vst1_u8(out, vout);
}

template <>
void vst1<int8_t, int8x8_t>(int8_t* out, int8x8_t vout) {
  vst1_s8(out, vout);
}

template <>
void quantize_tensor_arm64_q8_wrapper<uint8_t>(
    const float* __restrict__ in,
    uint8_t* __restrict__ out,
    const int64_t N,
    const float scale,
    const int32_t zero_point) {
  quantize_tensor_arm64_q8<uint8_t, uint8x8_t>(in, out, N, scale, zero_point);
}

template <>
void quantize_tensor_arm64_q8_wrapper<int8_t>(
    const float* __restrict__ in,
    int8_t* __restrict__ out,
    const int64_t N,
    const float scale,
    const int32_t zero_point) {
  quantize_tensor_arm64_q8<int8_t, int8x8_t>(in, out, N, scale, zero_point);
}
#endif

} // namespace utils
} // namespace xnnpack
} // namespace backends
} // namespace executorch
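
// Usage sketch (illustrative only, not part of the original file): assuming
// `weights` is a float Tensor in scope and QuantizationParams is the struct
// declared in utils.h with the `scale` and `zero_point` members assigned
// above, a caller could derive asymmetric uint8 quantization parameters and
// per-channel requantization scales roughly like this:
//
//   using namespace executorch::backends::xnnpack::utils;
//   using executorch::runtime::Error;
//
//   const auto [min, max] = GetMinMax(weights);
//   QuantizationParams qparams;
//   Error err = ChooseQuantizationParams(
//       min, max, /*qmin=*/0, /*qmax=*/255, qparams);
//   if (err != Error::Ok) {
//     // Handle the failure (e.g., propagate the error).
//   }
//
//   // With per-channel weight scales (a float Tensor `weight_scales`),
//   // fold input and output scales into one multiplier per channel:
//   std::vector<float> requant_scales;
//   err = GenerateRequantizationScale(
//       weight_scales, /*input_scale=*/0.5f, /*output_scale=*/0.25f,
//       requant_scales);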