// Auto-generated file. Do not edit!
//   Template: src/qs8-vlrelu/armsimd32.c.in
//   Generator: tools/xngen
//
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_acle.h>

#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>
#include <xnnpack/unaligned.h>
#include <xnnpack/vlrelu.h>

void xnn_qu8_vlrelu_ukernel__armsimd32_x8(
    size_t n,
    const uint8_t* x,
    uint8_t* y,
    const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  const uint16x2_t vinput_zero_point = (uint16x2_t) params->armsimd32.input_zero_point;
  const int16x2_t vpositive_multiplier = (int16x2_t) params->armsimd32.positive_multiplier;
  const int16x2_t vnegative_multiplier = (int16x2_t) params->armsimd32.negative_multiplier;
  const int32_t vbias = params->armsimd32.bias;
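  // Main loop: 8 bytes per iteration. Each byte is widened to 16 bits with
  // UXTB16, re-centered as (zero_point - x), scaled by the positive or
  // negative LeakyReLU multiplier chosen per lane, and requantized back to
  // unsigned 8 bits.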
  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
    const uint8x4_t vx0123 = (uint8x4_t) unaligned_indexed_load_u32(x, 0);
    const uint8x4_t vx4567 = (uint8x4_t) unaligned_indexed_load_u32(x, 1);
    x += 8;

    uint16x2_t vx02 = __uxtb16(vx0123);
    uint16x2_t vx13 = __uxtb16(__ror(vx0123, 8));
    uint16x2_t vx46 = __uxtb16(vx4567);
    uint16x2_t vx57 = __uxtb16(__ror(vx4567, 8));

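    // USUB16 computes (input_zero_point - x) per 16-bit lane and sets the
    // APSR.GE flags for lanes where no borrow occurred (x <= zero point);
    // SEL then picks the negative-slope multiplier for those lanes and the
    // positive multiplier for the rest.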
    vx02 = __usub16(vinput_zero_point, vx02);
    const int16x2_t vmultiplier02 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);
    vx13 = __usub16(vinput_zero_point, vx13);
    const int16x2_t vmultiplier13 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);
    vx46 = __usub16(vinput_zero_point, vx46);
    const int16x2_t vmultiplier46 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);
    vx57 = __usub16(vinput_zero_point, vx57);
    const int16x2_t vmultiplier57 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);

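    // SMLABB/SMLATT multiply the bottom/top 16-bit halves of the
    // (zero_point - x) difference by the selected multiplier and add the bias.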
    int32_t vacc0 = __smlabb(vmultiplier02, vx02, vbias);
    int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias);
    int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
    int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias);
    int32_t vacc4 = __smlabb(vmultiplier46, vx46, vbias);
    int32_t vacc5 = __smlabb(vmultiplier57, vx57, vbias);
    int32_t vacc6 = __smlatt(vmultiplier46, vx46, vbias);
    int32_t vacc7 = __smlatt(vmultiplier57, vx57, vbias);

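    // Requantize: arithmetic shift right by 8 bits, then saturate to the
    // unsigned 8-bit range [0, 255].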
    vacc0 = __usat(math_asr_s32(vacc0, 8), 8);
    vacc1 = __usat(math_asr_s32(vacc1, 8), 8);
    vacc2 = __usat(math_asr_s32(vacc2, 8), 8);
    vacc3 = __usat(math_asr_s32(vacc3, 8), 8);
    vacc4 = __usat(math_asr_s32(vacc4, 8), 8);
    vacc5 = __usat(math_asr_s32(vacc5, 8), 8);
    vacc6 = __usat(math_asr_s32(vacc6, 8), 8);
    vacc7 = __usat(math_asr_s32(vacc7, 8), 8);

    y[0] = (uint8_t) vacc0;
    y[1] = (uint8_t) vacc1;
    y[2] = (uint8_t) vacc2;
    y[3] = (uint8_t) vacc3;
    y[4] = (uint8_t) vacc4;
    y[5] = (uint8_t) vacc5;
    y[6] = (uint8_t) vacc6;
    y[7] = (uint8_t) vacc7;
    y += 8;
  }
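  // Tail loop: same computation on 4 elements at a time.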
  for (; n >= 4 * sizeof(uint8_t); n -= 4 * sizeof(uint8_t)) {
    const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);
    x += 4;

    uint16x2_t vx02 = __uxtb16(vx0123);
    uint16x2_t vx13 = __uxtb16(__ror(vx0123, 8));

    vx02 = __usub16(vinput_zero_point, vx02);
    const int16x2_t vmultiplier02 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);
    vx13 = __usub16(vinput_zero_point, vx13);
    const int16x2_t vmultiplier13 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);

    int32_t vacc0 = __smlabb(vmultiplier02, vx02, vbias);
    int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias);
    int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
    int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias);

    vacc0 = __usat(math_asr_s32(vacc0, 8), 8);
    vacc1 = __usat(math_asr_s32(vacc1, 8), 8);
    vacc2 = __usat(math_asr_s32(vacc2, 8), 8);
    vacc3 = __usat(math_asr_s32(vacc3, 8), 8);

    y[0] = (uint8_t) vacc0;
    y[1] = (uint8_t) vacc1;
    y[2] = (uint8_t) vacc2;
    y[3] = (uint8_t) vacc3;
    y += 4;
  }
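  // Remainder of 1 to 3 elements: the 4-byte load may read past the end of x
  // (permitted by XNN_OOB_READS); up to 3 results are computed, and the
  // (n & 2) / (n & 1) branches store only the valid ones.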
  if XNN_UNLIKELY(n != 0) {
    const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);

    uint16x2_t vx02 = __uxtb16(vx0123);
    uint16x2_t vx13 = __uxtb16(__ror(vx0123, 8));

    vx02 = __usub16(vinput_zero_point, vx02);
    const int16x2_t vmultiplier02 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);
    vx13 = __usub16(vinput_zero_point, vx13);
    const int16x2_t vmultiplier13 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);

    int32_t vacc0 = __smlabb(vmultiplier02, vx02, vbias);
    int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias);
    const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);

    vacc0 = __usat(math_asr_s32(vacc0, 8), 8);
    vacc1 = __usat(math_asr_s32(vacc1, 8), 8);

    if (n & (2 * sizeof(uint8_t))) {
      y[0] = (uint8_t) vacc0;
      y[1] = (uint8_t) vacc1;
      vacc0 = __usat(math_asr_s32(vacc2, 8), 8);
      y += 2;
    }
    if (n & (1 * sizeof(uint8_t))) {
      y[0] = (uint8_t) vacc0;
    }
  }
}