xref: /aosp_15_r20/external/XNNPACK/src/math/cvt-f32-qu8-neon.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <assert.h>
7 #include <stddef.h>
8 #include <stdint.h>
9 
10 #include <arm_neon.h>
11 
12 #include <xnnpack/math-stubs.h>
13 
14 
// Converts floats to unsigned 8-bit quantized values, eight elements per
// iteration, using the "magic bias" trick with NEON intrinsics.
//
//   n                 - amount of output to produce; must be a multiple of
//                       8 * sizeof(uint8_t) (checked by assert only). It is
//                       decremented by that amount for every 8 bytes stored.
//   input             - source floats; 8 are read per iteration.
//   output            - destination bytes; 8 are written per iteration.
//   output_zero_point - value added to every rounded input before the result
//                       is saturated to [0, 255].
void xnn_math_f32_qu8_cvt__neon(
    size_t n,
    const float* input,
    uint8_t* output,
    uint8_t output_zero_point)
{
  assert(n % (8 * sizeof(uint8_t)) == 0);

  // 12582912.0f is 1.5 * 2^23 and has the bit pattern 0x4B400000. Adding it to
  // a float of small magnitude yields a sum whose spacing is 1.0, so the
  // addition itself rounds the input to an integer that ends up in the low
  // mantissa bits of the sum.
  const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f);
  // Subtracting (bias bits - zero point) from the sum's bit pattern removes the
  // bias and adds the zero point in a single integer operation.
  const int32x4_t vmagic_bias_less_zero_point =
      vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) output_zero_point);

  while (n != 0) {
    // Load two groups of four floats and apply the bias.
    const float32x4_t vbiased_lo = vaddq_f32(vld1q_f32(input), vmagic_bias);
    const float32x4_t vbiased_hi = vaddq_f32(vld1q_f32(input + 4), vmagic_bias);
    input += 8;

    // Treat the biased floats as integers; the saturating subtract keeps
    // far-out-of-range bit patterns from wrapping around.
    const int32x4_t vacc_lo =
        vqsubq_s32(vreinterpretq_s32_f32(vbiased_lo), vmagic_bias_less_zero_point);
    const int32x4_t vacc_hi =
        vqsubq_s32(vreinterpretq_s32_f32(vbiased_hi), vmagic_bias_less_zero_point);

    // Narrow with saturation: int32 -> int16, then int16 -> uint8.
    const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi));
    vst1_u8(output, vqmovun_s16(vacc));
    output += 8;

    n -= 8 * sizeof(uint8_t);
  }
}
44