// Auto-generated file. Do not edit!
//   Template: src/qs8-vlrelu/armsimd32.c.in
//   Generator: tools/xngen
//
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_acle.h>

#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>
#include <xnnpack/unaligned.h>
#include <xnnpack/vlrelu.h>

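// Leaky ReLU on QU8 (unsigned 8-bit quantized) inputs using Armv6 SIMD32 intrinsics,
// processing 8 elements per main-loop iteration.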
void xnn_qu8_vlrelu_ukernel__armsimd32_x8(
    size_t n,
    const uint8_t* x,
    uint8_t* y,
    const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  const uint16x2_t vinput_zero_point = (uint16x2_t) params->armsimd32.input_zero_point;
  const int16x2_t vpositive_multiplier = (int16x2_t) params->armsimd32.positive_multiplier;
  const int16x2_t vnegative_multiplier = (int16x2_t) params->armsimd32.negative_multiplier;
  const int32_t vbias = params->armsimd32.bias;
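  // Main loop: process 8 input bytes per iteration.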
  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
    const uint8x4_t vx0123 = (uint8x4_t) unaligned_indexed_load_u32(x, 0);
    const uint8x4_t vx4567 = (uint8x4_t) unaligned_indexed_load_u32(x, 1);
    x += 8;

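    // __uxtb16 zero-extends the even-indexed bytes into 16-bit halfwords;
    // rotating by 8 first (__ror) does the same for the odd-indexed bytes.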
    uint16x2_t vx02 = __uxtb16(vx0123);
    uint16x2_t vx13 = __uxtb16(__ror(vx0123, 8));
    uint16x2_t vx46 = __uxtb16(vx4567);
    uint16x2_t vx57 = __uxtb16(__ror(vx4567, 8));

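    // __usub16 computes (zero_point - x) per halfword and sets the APSR.GE flags;
    // __sel then picks the negative-slope multiplier for lanes where x <= zero_point
    // and the positive-slope multiplier otherwise - the sign test of the leaky ReLU.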
    vx02 = __usub16(vinput_zero_point, vx02);
    const int16x2_t vmultiplier02 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);
    vx13 = __usub16(vinput_zero_point, vx13);
    const int16x2_t vmultiplier13 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);
    vx46 = __usub16(vinput_zero_point, vx46);
    const int16x2_t vmultiplier46 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);
    vx57 = __usub16(vinput_zero_point, vx57);
    const int16x2_t vmultiplier57 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);

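    // Multiply each 16-bit difference by its selected multiplier and add the bias:
    // __smlabb uses the bottom halfwords of both operands, __smlatt the top halfwords.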
    int32_t vacc0 = __smlabb(vmultiplier02, vx02, vbias);
    int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias);
    int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
    int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias);
    int32_t vacc4 = __smlabb(vmultiplier46, vx46, vbias);
    int32_t vacc5 = __smlabb(vmultiplier57, vx57, vbias);
    int32_t vacc6 = __smlatt(vmultiplier46, vx46, vbias);
    int32_t vacc7 = __smlatt(vmultiplier57, vx57, vbias);

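    // Arithmetic-shift the accumulators right by 8 to drop the fixed-point fraction,
    // then saturate to the unsigned 8-bit output range with __usat.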
    vacc0 = __usat(math_asr_s32(vacc0, 8), 8);
    vacc1 = __usat(math_asr_s32(vacc1, 8), 8);
    vacc2 = __usat(math_asr_s32(vacc2, 8), 8);
    vacc3 = __usat(math_asr_s32(vacc3, 8), 8);
    vacc4 = __usat(math_asr_s32(vacc4, 8), 8);
    vacc5 = __usat(math_asr_s32(vacc5, 8), 8);
    vacc6 = __usat(math_asr_s32(vacc6, 8), 8);
    vacc7 = __usat(math_asr_s32(vacc7, 8), 8);

    y[0] = (uint8_t) vacc0;
    y[1] = (uint8_t) vacc1;
    y[2] = (uint8_t) vacc2;
    y[3] = (uint8_t) vacc3;
    y[4] = (uint8_t) vacc4;
    y[5] = (uint8_t) vacc5;
    y[6] = (uint8_t) vacc6;
    y[7] = (uint8_t) vacc7;
    y += 8;
  }
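  // Process a remaining group of 4 bytes, if any, with the same sequence of steps.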
  for (; n >= 4 * sizeof(uint8_t); n -= 4 * sizeof(uint8_t)) {
    const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);
    x += 4;

    uint16x2_t vx02 = __uxtb16(vx0123);
    uint16x2_t vx13 = __uxtb16(__ror(vx0123, 8));

    vx02 = __usub16(vinput_zero_point, vx02);
    const int16x2_t vmultiplier02 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);
    vx13 = __usub16(vinput_zero_point, vx13);
    const int16x2_t vmultiplier13 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);

    int32_t vacc0 = __smlabb(vmultiplier02, vx02, vbias);
    int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias);
    int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
    int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias);

    vacc0 = __usat(math_asr_s32(vacc0, 8), 8);
    vacc1 = __usat(math_asr_s32(vacc1, 8), 8);
    vacc2 = __usat(math_asr_s32(vacc2, 8), 8);
    vacc3 = __usat(math_asr_s32(vacc3, 8), 8);

    y[0] = (uint8_t) vacc0;
    y[1] = (uint8_t) vacc1;
    y[2] = (uint8_t) vacc2;
    y[3] = (uint8_t) vacc3;
    y += 4;
  }
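  // Handle the final 1-3 bytes. The full 4-byte load may read past the end of the
  // input buffer; this is permitted by the XNN_OOB_READS annotation on this kernel.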
  if XNN_UNLIKELY(n != 0) {
    const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);

    uint16x2_t vx02 = __uxtb16(vx0123);
    uint16x2_t vx13 = __uxtb16(__ror(vx0123, 8));

    vx02 = __usub16(vinput_zero_point, vx02);
    const int16x2_t vmultiplier02 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);
    vx13 = __usub16(vinput_zero_point, vx13);
    const int16x2_t vmultiplier13 = (int16x2_t) __sel((uint8x4_t) vnegative_multiplier, (uint8x4_t) vpositive_multiplier);

    int32_t vacc0 = __smlabb(vmultiplier02, vx02, vbias);
    int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias);
    const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);

    vacc0 = __usat(math_asr_s32(vacc0, 8), 8);
    vacc1 = __usat(math_asr_s32(vacc1, 8), 8);

    if (n & (2 * sizeof(uint8_t))) {
      y[0] = (uint8_t) vacc0;
      y[1] = (uint8_t) vacc1;
      vacc0 = __usat(math_asr_s32(vacc2, 8), 8);
      y += 2;
    }
    if (n & (1 * sizeof(uint8_t))) {
      y[0] = (uint8_t) vacc0;
    }
  }
}