runtime/utils/utils.cpp

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/xnnpack/runtime/utils/utils.h>
#include <executorch/runtime/platform/assert.h>
#include <cinttypes>
#include <cmath>

namespace executorch {
namespace backends {
namespace xnnpack {
namespace utils {

using executorch::aten::ScalarType;
using executorch::aten::Tensor;
using executorch::runtime::Error;

// Lower bound for the quantization scale: ChooseQuantizationParams replaces
// any computed scale below this value with the threshold itself and widens
// the [min, max] range to match.
// NOTE(review): the value is close to 2^-14 (~6.1e-5), the smallest normal
// half-precision float; presumably that is where it comes from -- confirm.
constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f;

/**
 * Chooses an affine quantization scale and integer zero point that map the
 * real interval [min, max] onto the quantized interval [qmin, qmax].
 *
 * @param min Smallest real value to represent; must be <= max.
 * @param max Largest real value to represent.
 * @param qmin Smallest quantized value (halved when reduce_range is set).
 * @param qmax Largest quantized value (halved when reduce_range is set).
 * @param result Receives the chosen scale and zero point on success.
 * @param preserve_sparsity When the range straddles zero, symmetrize it so
 *     that the zero point is the midpoint of [qmin, qmax].
 * @param force_scale_power_of_two Round the scale to a power of two (down
 *     to the nearest power when scale < 1, up otherwise).
 * @param reduce_range Halve qmin and qmax before computing anything else.
 * @return Error::Ok on success; Error::Internal when min > max, when
 *     qmin >= qmax (after range reduction), or when the scale is not > 0.
 */
Error ChooseQuantizationParams(
    float min,
    float max,
    int32_t qmin,
    int32_t qmax,
    QuantizationParams& result,
    bool preserve_sparsity = false,
    bool force_scale_power_of_two = false,
    bool reduce_range = false) {
  ET_CHECK_OR_RETURN_ERROR(
      min <= max,
      Internal,
      "In ChooseQuantizationParams, min should be less than or equal to max. min: %f, max: %f",
      min,
      max);

  if (reduce_range) {
    qmin = qmin / 2;
    qmax = qmax / 2;
  }
  if (min < 0 && max > 0 && preserve_sparsity) {
    // Stretch the real range so that it is symmetric in quantized steps
    // around zero.
    int symmetric_qmin = -((qmax - qmin) / 2 + 1);
    int symmetric_qmax = (qmax - qmin) / 2;
    double max_scale =
        std::max(fabs(min / symmetric_qmin), fabs(max / symmetric_qmax));
    min = max_scale * symmetric_qmin;
    max = max_scale * symmetric_qmax;
  }

  // We extend the [min, max] interval to ensure that it contains 0.
  // Otherwise, we would not meet the requirement that 0 be an exactly
  // representable value.
  min = std::min(min, 0.f);
  max = std::max(max, 0.f);

  ET_CHECK_OR_RETURN_ERROR(
      qmin < qmax,
      Internal,
      "In ChooseQuantizationParams, qmin should be less than qmax");

  // Use double precision for intermediate computation but use single precision
  // in final number to reflect the actual number used during quantization.
  double scale = (static_cast<double>(max) - min) / (qmax - qmin);
  // If scale is 0 or too small so its reciprocal is infinity, we arbitrarily
  // adjust the scale to 0.1 . We want to avoid scale's reciprocal being
  // infinity because some of fbgemm code pre-computes scale's reciprocal to do
  // multiplication instead of division in the time critical part of code.
  if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) {
    scale = 0.1;
  }
  ET_CHECK_OR_RETURN_ERROR(
      scale > 0, Internal, "quantization scale should be > 0");

  if (force_scale_power_of_two) {
    // Build 2^exponent with std::ldexp instead of an integer shift: the
    // exponent can reach or exceed the width of int for very small or very
    // large scales, where `1 << exponent` is not a valid shift. For exponents
    // in [0, 30] the result is bit-identical to the shift-based form.
    if (scale < 1) {
      const int exponent = static_cast<int>(floor(log(1.0 / scale) / log(2)));
      scale = std::ldexp(1.0, -exponent);
    } else {
      const int exponent = static_cast<int>(ceil(log(scale) / log(2)));
      scale = std::ldexp(1.0, exponent);
    }
  }

  // Cut off small scale
  if (scale < SMALL_SCALE_THRESHOLD) {
    float org_scale = scale;
    scale = SMALL_SCALE_THRESHOLD;
    // Adjust the min and max based on the new scale
    if (min == 0.0f) {
      max = SMALL_SCALE_THRESHOLD * (qmax - qmin);
    } else if (max == 0.0f) {
      min = -SMALL_SCALE_THRESHOLD * (qmax - qmin);
    } else {
      // Range straddles zero: grow both ends by the same factor.
      float amplifier = SMALL_SCALE_THRESHOLD / org_scale;
      min *= amplifier;
      max *= amplifier;
    }
  }

  // Zero-point computation.
  // First the initial floating-point computation. The zero-point can be
  // determined from solving an affine equation for any known pair
  // (real value, corresponding quantized value).
  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
  // The arithmetic error on the zero point computed from either pair
  // will be roughly machine_epsilon * (sum of absolute values of terms)
  // so we want to use the variant that adds the smaller terms.
  // NOTE(review): the two error estimates below subtract the magnitudes,
  // whereas the explanation above talks about their sum. Left unchanged to
  // keep the existing numerics -- confirm against the reference
  // implementation before altering.
  double zero_point_from_min = qmin - min / static_cast<double>(scale);
  double zero_point_from_max = qmax - max / static_cast<double>(scale);
  double zero_point_from_min_error =
      std::abs(qmin) - std::abs(min / static_cast<double>(scale));
  double zero_point_from_max_error =
      std::abs(qmax) - std::abs(max / static_cast<double>(scale));
  double initial_zero_point =
      zero_point_from_min_error < zero_point_from_max_error
      ? zero_point_from_min
      : zero_point_from_max;

  // for symmetric quantization (preserve_sparsity == true), we force zero_point
  // to be a middle value between qmin and qmax.
  // If either min or max is 0, then we just use 0 as zero_point.
  if (min < 0 && max > 0 && preserve_sparsity) {
    initial_zero_point = static_cast<double>(qmin + qmax) / 2;
  }

  // Now we need to nudge the zero point to be an integer
  // (our zero points are integer, and this is motivated by the requirement
  // to be able to represent the real value "0" exactly as a quantized value,
  // which is required in multiple places, for example in Im2col with zero
  // padding).
  int32_t nudged_zero_point = 0;
  if (initial_zero_point < qmin) {
    nudged_zero_point = qmin;
  } else if (initial_zero_point > qmax) {
    nudged_zero_point = qmax;
  } else {
    nudged_zero_point = nearbyint(initial_zero_point);
  }

  result.scale = scale;
  result.zero_point = nudged_zero_point;
  return Error::Ok;
}

/**
 * Fills requant_scales with one requantization scale per (padded) output
 * channel, computed as weight_scale * input_scale / output_scale.
 *
 * requant_scales is grown when it holds fewer entries than weight_scales
 * has elements; it is never shrunk.
 *
 * @return Error::Ok on success, or Error::Internal as soon as a computed
 *     scale is not a strictly positive normal float.
 */
Error GenerateRequantizationScale(
    const Tensor& weight_scales,
    float input_scale,
    float output_scale,
    std::vector<float>& requant_scales) {
  // The weight scale tensor is allocated with padding, so numel() already
  // reports the padded channel count.
  const auto padded_channel_count = weight_scales.numel();
  const float* per_channel_scale = weight_scales.const_data_ptr<float>();

  const bool too_small =
      static_cast<int64_t>(requant_scales.size()) < padded_channel_count;
  if (too_small) {
    requant_scales.resize(padded_channel_count);
  }

  // The reciprocal does not depend on the channel; compute it once.
  const float output_scale_reciprocal = 1.f / output_scale;

  for (int channel = 0; channel < padded_channel_count; ++channel) {
    const float requant_scale =
        (per_channel_scale[channel] * input_scale) * output_scale_reciprocal;
    requant_scales[channel] = requant_scale;
    ET_CHECK_OR_RETURN_ERROR(
        requant_scale > 0.0f && std::isnormal(requant_scale),
        Internal,
        "failed to create op with requantization scale");
  }
  return Error::Ok;
}

/**
 * Scans a float tensor and returns its (minimum, maximum) element values.
 *
 * Aborts via ET_CHECK_MSG when the tensor's scalar type is not Float. For a
 * tensor with no elements the result is
 * (numeric_limits<float>::max(), -numeric_limits<float>::max()).
 */
std::pair<float, float> GetMinMax(const Tensor& ft) {
  ET_CHECK_MSG(
      ft.scalar_type() == ScalarType::Float,
      "Expected float tensor but got %" PRId8,
      static_cast<int8_t>(ft.scalar_type()));

  // Start from an inverted range so the first element replaces both bounds.
  float lowest = std::numeric_limits<float>::max();
  float highest = -std::numeric_limits<float>::max();

  const float* values = ft.const_data_ptr<float>();
  const auto count = ft.numel();
  for (int idx = 0; idx < count; ++idx) {
    const float current = values[idx];
    if (current < lowest) {
      lowest = current;
    }
    if (current > highest) {
      highest = current;
    }
  }
  return {lowest, highest};
}

#ifdef __aarch64__
// vqmov specialization for unsigned 8-bit output lanes: forwards to the NEON
// intrinsic vqmovun_s16, which (per the Arm intrinsics reference) narrows
// eight signed 16-bit lanes to unsigned 8-bit lanes with saturation.
template <>
uint8x8_t vqmov<uint8x8_t>(int16x8_t vraw) {
  return vqmovun_s16(vraw);
}

// vqmov specialization for signed 8-bit output lanes: forwards to the NEON
// intrinsic vqmovn_s16, which (per the Arm intrinsics reference) narrows
// eight signed 16-bit lanes to signed 8-bit lanes with saturation.
template <>
int8x8_t vqmov<int8x8_t>(int16x8_t vraw) {
  return vqmovn_s16(vraw);
}

// vst1 specialization for uint8_t output: stores the eight lanes of vout to
// out using the NEON intrinsic vst1_u8.
template <>
void vst1<uint8_t, uint8x8_t>(uint8_t* out, uint8x8_t vout) {
  vst1_u8(out, vout);
}

// vst1 specialization for int8_t output: stores the eight lanes of vout to
// out using the NEON intrinsic vst1_s8.
template <>
void vst1<int8_t, int8x8_t>(int8_t* out, int8x8_t vout) {
  vst1_s8(out, vout);
}

// uint8_t specialization of the arm64 quantization wrapper: selects the
// uint8x8_t vector type and forwards all arguments unchanged to
// quantize_tensor_arm64_q8 (defined outside this file section).
template <>
void quantize_tensor_arm64_q8_wrapper<uint8_t>(
    const float* __restrict__ in,
    uint8_t* __restrict__ out,
    const int64_t N,
    const float scale,
    const int32_t zero_point) {
  quantize_tensor_arm64_q8<uint8_t, uint8x8_t>(in, out, N, scale, zero_point);
}

// int8_t specialization of the arm64 quantization wrapper: selects the
// int8x8_t vector type and forwards all arguments unchanged to
// quantize_tensor_arm64_q8 (defined outside this file section).
template <>
void quantize_tensor_arm64_q8_wrapper<int8_t>(
    const float* __restrict__ in,
    int8_t* __restrict__ out,
    const int64_t N,
    const float scale,
    const int32_t zero_point) {
  quantize_tensor_arm64_q8<int8_t, int8x8_t>(in, out, N, scale, zero_point);
}
#endif

} // namespace utils
} // namespace xnnpack
} // namespace backends
} // namespace executorch