xref: /aosp_15_r20/external/XNNPACK/src/qu8-vcvt/gen/vcvt-armsimd32-x4.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-vcvt/armsimd32.c.in
//   Generator: tools/xngen
//
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_acle.h>

#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>
#include <xnnpack/unaligned.h>
#include <xnnpack/vcvt.h>

void xnn_qu8_vcvt_ukernel__armsimd32_x4(
    size_t n,
    const uint8_t* x,
    uint8_t* y,
    const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
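  // The negated input zero point is packed into both 16-bit halves of a
  // uint16x2_t so a single UXTAB16 can subtract it from two elements at once;
  // vbias seeds each multiply-accumulate with the requantization bias, which
  // carries the output zero point term.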
  const uint16x2_t vminus_input_zero_point = (uint16x2_t) params->armsimd32.minus_input_zero_point;
  const int32_t vbias = params->armsimd32.bias;
  const int32_t vmultiplier = params->armsimd32.multiplier;
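  // Main loop: requantize four uint8 elements per iteration.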
  for (; n >= 4 * sizeof(uint8_t); n -= 4 * sizeof(uint8_t)) {
    const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);
    x += 4;

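    // UXTAB16 zero-extends bytes 0 and 2 into 16-bit lanes and adds them to
    // the packed negated zero point, computing (x - input_zero_point) for
    // elements 0/2; rotating the word by 8 bits first does the same for 1/3.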
    const uint16x2_t vx02 = __uxtab16(vminus_input_zero_point, vx0123);
    const uint16x2_t vx13 = __uxtab16(vminus_input_zero_point, __ror(vx0123, 8));

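    // SMLAWB/SMLAWT multiply the 32-bit multiplier by the bottom/top signed
    // 16-bit lane, take the high 32 bits of the 48-bit product, and add vbias.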
    int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
    int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
    int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
    int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);

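    // Arithmetic shift right by 1, then USAT saturates to the uint8 range [0, 255].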
    vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
    vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
    vacc2 = __usat(math_asr_s32(vacc2, 1), 8);
    vacc3 = __usat(math_asr_s32(vacc3, 1), 8);

    y[0] = (uint8_t) vacc0;
    y[1] = (uint8_t) vacc1;
    y[2] = (uint8_t) vacc2;
    y[3] = (uint8_t) vacc3;
    y += 4;
  }
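  // Tail: 1 to 3 elements remain. The full 4-byte load may read past the end
  // of x; the XNN_OOB_READS annotation on the kernel accounts for this.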
  if XNN_UNLIKELY(n != 0) {
    const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);

    const uint16x2_t vx02 = __uxtab16(vminus_input_zero_point, vx0123);
    const uint16x2_t vx13 = __uxtab16(vminus_input_zero_point, __ror(vx0123, 8));

    int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
    int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
    const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);

    vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
    vacc1 = __usat(math_asr_s32(vacc1, 1), 8);

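    // If two elements remain, store them and shift element 2's accumulator
    // into vacc0 so the final single-element store below picks it up.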
    if (n & (2 * sizeof(uint8_t))) {
      y[0] = (uint8_t) vacc0;
      y[1] = (uint8_t) vacc1;
      vacc0 = __usat(math_asr_s32(vacc2, 1), 8);
      y += 2;
    }
    if (n & (1 * sizeof(uint8_t))) {
      y[0] = (uint8_t) vacc0;
    }
  }
}