xref: /aosp_15_r20/external/XNNPACK/src/qu8-vcvt/gen/vcvt-armsimd32-x8.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-vcvt/armsimd32.c.in
//   Generator: tools/xngen
//
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_acle.h>

#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>
#include <xnnpack/unaligned.h>
#include <xnnpack/vcvt.h>

xnn_qu8_vcvt_ukernel__armsimd32_x8(size_t n,const uint8_t * x,uint8_t * y,const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])20 void xnn_qu8_vcvt_ukernel__armsimd32_x8(
21     size_t n,
22     const uint8_t* x,
23     uint8_t* y,
24     const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
25 {
26   const uint16x2_t vminus_input_zero_point = (uint16x2_t) params->armsimd32.minus_input_zero_point;
27   const int32_t vbias = params->armsimd32.bias;
28   const int32_t vmultiplier = params->armsimd32.multiplier;
29   for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
30     const uint8x4_t vx0123 = (uint8x4_t) unaligned_indexed_load_u32(x, 0);
31     const uint8x4_t vx4567 = (uint8x4_t) unaligned_indexed_load_u32(x, 1);
32     x += 8;
33 
34     const uint16x2_t vx02 = __uxtab16(vminus_input_zero_point, vx0123);
35     const uint16x2_t vx13 = __uxtab16(vminus_input_zero_point, __ror(vx0123, 8));
36     const uint16x2_t vx46 = __uxtab16(vminus_input_zero_point, vx4567);
37     const uint16x2_t vx57 = __uxtab16(vminus_input_zero_point, __ror(vx4567, 8));
38 
39     int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
40     int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
41     int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
42     int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);
43     int32_t vacc4 = __smlawb(vmultiplier, vx46, vbias);
44     int32_t vacc5 = __smlawb(vmultiplier, vx57, vbias);
45     int32_t vacc6 = __smlawt(vmultiplier, vx46, vbias);
46     int32_t vacc7 = __smlawt(vmultiplier, vx57, vbias);
47 
48     vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
49     vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
50     vacc2 = __usat(math_asr_s32(vacc2, 1), 8);
51     vacc3 = __usat(math_asr_s32(vacc3, 1), 8);
52     vacc4 = __usat(math_asr_s32(vacc4, 1), 8);
53     vacc5 = __usat(math_asr_s32(vacc5, 1), 8);
54     vacc6 = __usat(math_asr_s32(vacc6, 1), 8);
55     vacc7 = __usat(math_asr_s32(vacc7, 1), 8);
56 
57     y[0] = (uint8_t) vacc0;
58     y[1] = (uint8_t) vacc1;
59     y[2] = (uint8_t) vacc2;
60     y[3] = (uint8_t) vacc3;
61     y[4] = (uint8_t) vacc4;
62     y[5] = (uint8_t) vacc5;
63     y[6] = (uint8_t) vacc6;
64     y[7] = (uint8_t) vacc7;
65     y += 8;
66   }
67   for (; n >= 4 * sizeof(uint8_t); n -= 4 * sizeof(uint8_t)) {
68     const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);
69     x += 4;
70 
71     const uint16x2_t vx02 = __uxtab16(vminus_input_zero_point, vx0123);
72     const uint16x2_t vx13 = __uxtab16(vminus_input_zero_point, __ror(vx0123, 8));
73 
74     int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
75     int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
76     int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
77     int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);
78 
79     vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
80     vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
81     vacc2 = __usat(math_asr_s32(vacc2, 1), 8);
82     vacc3 = __usat(math_asr_s32(vacc3, 1), 8);
83 
84     y[0] = (uint8_t) vacc0;
85     y[1] = (uint8_t) vacc1;
86     y[2] = (uint8_t) vacc2;
87     y[3] = (uint8_t) vacc3;
88     y += 4;
89   }
90   if XNN_UNLIKELY(n != 0) {
91     const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);
92 
93     const uint16x2_t vx02 = __uxtab16(vminus_input_zero_point, vx0123);
94     const uint16x2_t vx13 = __uxtab16(vminus_input_zero_point, __ror(vx0123, 8));
95 
96     int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
97     int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
98     const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
99 
100     vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
101     vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
102 
103     if (n & (2 * sizeof(uint8_t))) {
104       y[0] = (uint8_t) vacc0;
105       y[1] = (uint8_t) vacc1;
106       vacc0 = __usat(math_asr_s32(vacc2, 1), 8);
107       y += 2;
108     }
109     if (n & (1 * sizeof(uint8_t))) {
110       y[0] = (uint8_t) vacc0;
111     }
112   }
113 }
114