1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-gavgpool/multipass-wasmsimd.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <wasm_simd128.h>
13
14 #include <xnnpack/gavgpool.h>
15 #include <xnnpack/math.h>
16
17
xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8(size_t rows,size_t channels,const uint8_t * input,size_t input_stride,const uint8_t * zero,int32_t * buffer,uint8_t * output,const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8(
19 size_t rows,
20 size_t channels,
21 const uint8_t* input,
22 size_t input_stride,
23 const uint8_t* zero,
24 int32_t* buffer,
25 uint8_t* output,
26 const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
27 {
28 assert(rows > 7);
29 assert(channels != 0);
30
31 const uint8_t* i0 = input;
32 const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
33 const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
34 const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
35 const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
36 const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
37 const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
38 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
39
40 const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
41 int32_t* b = buffer;
42 size_t c = channels;
43 for (; c != 0; c = doz(c, 8)) {
44 const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
45 i0 += 8;
46 const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
47 i1 += 8;
48
49 v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
50 const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
51 i2 += 8;
52
53 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
54 const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
55 i3 += 8;
56 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
57 const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
58 i4 += 8;
59 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
60 const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
61 i5 += 8;
62 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
63 const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
64 i6 += 8;
65
66 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
67
68 const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
69 const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
70
71 wasm_v128_store(b, vacc0123);
72 wasm_v128_store(b + 4, vacc4567);
73 b += 8;
74 }
75
76 for (rows -= 7; rows > 7; rows -= 7) {
77 i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
78 i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
79 i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
80 i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
81 i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
82 i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
83 i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
84
85 int32_t* b = buffer;
86 size_t c = channels;
87 for (; c != 0; c = doz(c, 8)) {
88 const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
89 i0 += 8;
90 const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
91 i1 += 8;
92
93 v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
94 const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
95 i2 += 8;
96
97 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
98 const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
99 i3 += 8;
100 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
101 const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
102 i4 += 8;
103 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
104 const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
105 i5 += 8;
106 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
107 const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
108 i6 += 8;
109
110 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
111
112 v128_t vacc0123 = wasm_v128_load(b);
113 v128_t vacc4567 = wasm_v128_load(b + 4);
114
115 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
116 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
117
118 wasm_v128_store(b, vacc0123);
119 wasm_v128_store(b + 4, vacc4567);
120 b += 8;
121 }
122 }
123
124 i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
125 i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
126 if XNN_UNPREDICTABLE(rows < 2) {
127 i1 = zero;
128 }
129 i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
130 if XNN_UNPREDICTABLE(rows <= 2) {
131 i2 = zero;
132 }
133 i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
134 if XNN_UNPREDICTABLE(rows < 4) {
135 i3 = zero;
136 }
137 i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
138 if XNN_UNPREDICTABLE(rows <= 4) {
139 i4 = zero;
140 }
141 i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
142 if XNN_UNPREDICTABLE(rows < 6) {
143 i5 = zero;
144 }
145 i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
146 if XNN_UNPREDICTABLE(rows <= 6) {
147 i6 = zero;
148 }
149
150 const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
151 const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
152 const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
153 const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
154 const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
155 for (; channels >= 8; channels -= 8) {
156 const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
157 i0 += 8;
158 const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
159 i1 += 8;
160
161 v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
162 const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
163 i2 += 8;
164
165 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
166 const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
167 i3 += 8;
168 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
169 const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
170 i4 += 8;
171 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
172 const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
173 i5 += 8;
174 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
175 const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
176 i6 += 8;
177
178 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
179
180 v128_t vacc0123 = wasm_v128_load(buffer);
181 v128_t vacc4567 = wasm_v128_load(buffer + 4);
182 buffer += 8;
183
184 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
185 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
186
187 vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
188 vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
189
190 vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
191 vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
192
193 vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
194 vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
195
196 vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
197 vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
198
199 vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
200 vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
201
202 v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
203
204 v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
205
206 vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
207
208 *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
209 output += 8;
210 }
211 if XNN_UNLIKELY(channels != 0) {
212 {
213 const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
214 i0 += 8;
215 const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
216 i1 += 8;
217
218 v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
219 const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
220 i2 += 8;
221
222 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
223 const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
224 i3 += 8;
225 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
226 const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
227 i4 += 8;
228 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
229 const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
230 i5 += 8;
231 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
232 const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
233 i6 += 8;
234
235 vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
236
237 v128_t vacc0123 = wasm_v128_load(buffer);
238 v128_t vacc4567 = wasm_v128_load(buffer + 4);
239 buffer += 8;
240
241 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
242 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
243
244 vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
245 vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
246
247 vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
248 vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
249
250 vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
251 vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
252
253 vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
254 vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
255
256 vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
257 vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
258
259 const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
260 v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
261 vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
262
263 if (channels & 4) {
264 *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
265 vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
266 output += 4;
267 }
268 uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
269 if (channels & 2) {
270 *((uint16_t*) output) = (uint16_t) vout0123;
271 vout0123 >>= 16;
272 output += 2;
273 }
274 if (channels & 1) {
275 *output = (uint8_t) vout0123;
276 }
277 }
278 }
279 }
280