/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/xnnpack/runtime/utils/utils.h>
#include <executorch/runtime/platform/assert.h>
#include <cinttypes>

namespace executorch {
namespace backends {
namespace xnnpack {
namespace utils {

using executorch::aten::ScalarType;
using executorch::aten::Tensor;
using executorch::runtime::Error;

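// Smallest quantization scale we allow; ChooseQuantizationParams below clamps
// smaller scales up to this value and rescales [min, max] to match.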
constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f;

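// Chooses an affine quantization scale and zero point mapping real values in
// [min, max] onto the quantized range [qmin, qmax], so that
// real_value ~= scale * (quantized_value - zero_point) and the real value 0
// is exactly representable.
//
// Illustrative example (hypothetical values): with qmin = 0, qmax = 255 and
// an observed range [-1.0f, 2.0f], the scale is (2.0 - (-1.0)) / 255
// ~= 0.01176 and the zero point comes out to 85.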
Error ChooseQuantizationParams(
    float min,
    float max,
    int32_t qmin,
    int32_t qmax,
    QuantizationParams& result,
    bool preserve_sparsity = false,
    bool force_scale_power_of_two = false,
    bool reduce_range = false) {
  ET_CHECK_OR_RETURN_ERROR(
      min <= max,
      Internal,
      "In ChooseQuantizationParams, min should be less than or equal to max. min: %f, max: %f",
      min,
      max);

  if (reduce_range) {
    qmin = qmin / 2;
    qmax = qmax / 2;
  }
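  // For symmetric quantization (preserve_sparsity == true) with a range that
  // straddles zero, widen [min, max] so that zero lands at the midpoint of
  // the quantized range.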
  if (min < 0 && max > 0 && preserve_sparsity) {
    int symmetric_qmin = -((qmax - qmin) / 2 + 1);
    int symmetric_qmax = (qmax - qmin) / 2;
    double max_scale =
        std::max(fabs(min / symmetric_qmin), fabs(max / symmetric_qmax));
    min = max_scale * symmetric_qmin;
    max = max_scale * symmetric_qmax;
  }

  // We extend the [min, max] interval to ensure that it contains 0.
  // Otherwise, we would not meet the requirement that 0 be an exactly
  // representable value.
  min = std::min(min, 0.f);
  max = std::max(max, 0.f);

  ET_CHECK_OR_RETURN_ERROR(
      qmin < qmax,
      Internal,
      "In ChooseQuantizationParams, qmin should be less than qmax");

  // Use double precision for intermediate computation but use single precision
  // in final number to reflect the actual number used during quantization.
  double scale = (static_cast<double>(max) - min) / (qmax - qmin);
  // If the scale is 0 or so small that its reciprocal is infinity, we
  // arbitrarily adjust the scale to 0.1. We want to avoid the scale's
  // reciprocal being infinity because some fbgemm code pre-computes the
  // reciprocal to do multiplication instead of division in the time-critical
  // part of the code.
  if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) {
    scale = 0.1;
  }
  ET_CHECK_OR_RETURN_ERROR(
      scale > 0, Internal, "quantization scale should be > 0");

  if (force_scale_power_of_two) {
    if (scale < 1) {
      scale = 1.0 / (1 << static_cast<int>(floor(log(1.0 / scale) / log(2))));
    } else {
      scale = 1 << static_cast<int>(ceil(log(scale) / log(2)));
    }
  }

  // Cut off small scale
  if (scale < SMALL_SCALE_THRESHOLD) {
    float org_scale = scale;
    scale = SMALL_SCALE_THRESHOLD;
    // Adjust the min and max based on the new scale
    if (min == 0.0f) {
      max = SMALL_SCALE_THRESHOLD * (qmax - qmin);
    } else if (max == 0.0f) {
      min = -SMALL_SCALE_THRESHOLD * (qmax - qmin);
    } else {
      float amplifier = SMALL_SCALE_THRESHOLD / org_scale;
      min *= amplifier;
      max *= amplifier;
    }
  }

  // Zero-point computation.
  // First the initial floating-point computation. The zero-point can be
  // determined from solving an affine equation for any known pair
  // (real value, corresponding quantized value).
  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
  // The arithmetic error on the zero point computed from either pair
  // will be roughly machine_epsilon * (sum of absolute values of terms)
  // so we want to use the variant that adds the smaller terms.
  double zero_point_from_min = qmin - min / static_cast<double>(scale);
  double zero_point_from_max = qmax - max / static_cast<double>(scale);
  double zero_point_from_min_error =
      std::abs(qmin) + std::abs(min / static_cast<double>(scale));
  double zero_point_from_max_error =
      std::abs(qmax) + std::abs(max / static_cast<double>(scale));
  double initial_zero_point =
      zero_point_from_min_error < zero_point_from_max_error
      ? zero_point_from_min
      : zero_point_from_max;

  // For symmetric quantization (preserve_sparsity == true), we force zero_point
  // to be a middle value between qmin and qmax.
  // If either min or max is 0, then we just use 0 as zero_point.
  if (min < 0 && max > 0 && preserve_sparsity) {
    initial_zero_point = static_cast<double>(qmin + qmax) / 2;
  }

  // Now we need to nudge the zero point to be an integer
  // (our zero points are integer, and this is motivated by the requirement
  // to be able to represent the real value "0" exactly as a quantized value,
  // which is required in multiple places, for example in Im2col with zero
  // padding).
  int32_t nudged_zero_point = 0;
  if (initial_zero_point < qmin) {
    nudged_zero_point = qmin;
  } else if (initial_zero_point > qmax) {
    nudged_zero_point = qmax;
  } else {
    nudged_zero_point = nearbyint(initial_zero_point);
  }

  result.scale = scale;
  result.zero_point = nudged_zero_point;
  return Error::Ok;
}

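// Computes per-output-channel requantization scales:
//   requant_scales[i] = (weight_scales[i] * input_scale) / output_scale,
// resizing requant_scales if it is smaller than the (padded) channel count.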
Error GenerateRequantizationScale(
    const Tensor& weight_scales,
    float input_scale,
    float output_scale,
    std::vector<float>& requant_scales) {
  // Since the weight scales are allocated with padding,
  // weight_scales.numel() gives us the padded number of elements.
  const auto num_output_channels_padded = weight_scales.numel();
  const float* weight_scales_data = weight_scales.const_data_ptr<float>();
  if (static_cast<int64_t>(requant_scales.size()) <
      num_output_channels_padded) {
    requant_scales.resize(num_output_channels_padded);
  }
  for (int i = 0; i < num_output_channels_padded; ++i) {
    const auto inverse_output_scale = 1.f / output_scale;
    requant_scales[i] =
        (weight_scales_data[i] * input_scale) * inverse_output_scale;
    ET_CHECK_OR_RETURN_ERROR(
        requant_scales[i] > 0.0f && std::isnormal(requant_scales[i]),
        Internal,
        "failed to create op with requantization scale");
  }
  return Error::Ok;
}

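// Returns the (min, max) over the elements of a float tensor; aborts via
// ET_CHECK_MSG if the tensor's scalar type is not Float.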
std::pair<float, float> GetMinMax(const Tensor& ft) {
  float min = std::numeric_limits<float>::max();
  float max = -std::numeric_limits<float>::max();
  ET_CHECK_MSG(
      ft.scalar_type() == ScalarType::Float,
      "Expected float tensor but got %" PRId8,
      static_cast<int8_t>(ft.scalar_type()));
  const float* d = ft.const_data_ptr<float>();
  for (int i = 0; i < ft.numel(); ++i) {
    min = (d[i] < min) ? d[i] : min;
    max = (d[i] > max) ? d[i] : max;
  }
  return std::pair<float, float>(min, max);
}
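
// Usage sketch (illustrative only; assumes `input` is a Float Tensor):
//   std::pair<float, float> range = GetMinMax(input);
//   QuantizationParams qparams;
//   Error err = ChooseQuantizationParams(
//       range.first, range.second, /*qmin=*/0, /*qmax=*/255, qparams);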

#ifdef __aarch64__
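// NEON helpers used by the templated quantization kernel: vqmov narrows eight
// int16 lanes to eight 8-bit lanes with saturation (vqmovun_s16 for uint8,
// vqmovn_s16 for int8), and vst1 stores the resulting eight lanes to memory.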
template <>
uint8x8_t vqmov<uint8x8_t>(int16x8_t vraw) {
  return vqmovun_s16(vraw);
}

template <>
int8x8_t vqmov<int8x8_t>(int16x8_t vraw) {
  return vqmovn_s16(vraw);
}

template <>
void vst1<uint8_t, uint8x8_t>(uint8_t* out, uint8x8_t vout) {
  vst1_u8(out, vout);
}

template <>
void vst1<int8_t, int8x8_t>(int8_t* out, int8x8_t vout) {
  vst1_s8(out, vout);
}

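// Explicit specializations that dispatch the templated NEON quantization
// kernel for uint8 and int8 outputs respectively.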
template <>
void quantize_tensor_arm64_q8_wrapper<uint8_t>(
    const float* __restrict__ in,
    uint8_t* __restrict__ out,
    const int64_t N,
    const float scale,
    const int32_t zero_point) {
  quantize_tensor_arm64_q8<uint8_t, uint8x8_t>(in, out, N, scale, zero_point);
}

template <>
void quantize_tensor_arm64_q8_wrapper<int8_t>(
    const float* __restrict__ in,
    int8_t* __restrict__ out,
    const int64_t N,
    const float scale,
    const int32_t zero_point) {
  quantize_tensor_arm64_q8<int8_t, int8x8_t>(in, out, N, scale, zero_point);
}
#endif

} // namespace utils
} // namespace xnnpack
} // namespace backends
} // namespace executorch