// Auto-generated file. Do not edit!
//   Template: src/qs8-gavgpool/multipass-wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/math.h>

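// Multipass global average pooling for QU8 with fp32 requantization: rows are
// summed 7 at a time into a 32-bit accumulator buffer, 8 channels per step,
// then the totals are scaled and converted back to uint8 in the final pass.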
void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8(
    size_t rows,
    size_t channels,
    const uint8_t* input,
    size_t input_stride,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
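  // Multipass kernels require more than 7 rows: the first pass consumes 7,
  // each intermediate pass consumes 7 more, and the final pass the last 1-7.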
  assert(rows > 7);
  assert(channels != 0);

  const uint8_t* i0 = input;
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);

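  // First pass: sum rows 0-6 into the 32-bit accumulator buffer, seeded with
  // init_bias, 8 channels at a time. doz() is saturating subtraction, so the
  // last iteration may read up to 7 bytes past the channel count, which the
  // XNN_OOB_READS annotation permits.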
  const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
  int32_t* b = buffer;
  size_t c = channels;
  for (; c != 0; c = doz(c, 8)) {
    const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
    i0 += 8;
    const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
    i1 += 8;

    v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
    const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
    i2 += 8;

    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
    const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
    i3 += 8;
    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
    const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
    i4 += 8;
    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
    const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
    i5 += 8;
    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
    const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
    i6 += 8;

    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);

    const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
    const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));

    wasm_v128_store(b, vacc0123);
    wasm_v128_store(b + 4, vacc4567);
    b += 8;
  }

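  // Intermediate passes: each pass accumulates 7 more rows into the buffer in
  // place. input_increment advances 7 rows while rewinding the channel bytes
  // consumed by the inner loop.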
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
    i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
    i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
    i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
    i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
    i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
    i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; c != 0; c = doz(c, 8)) {
      const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
      i0 += 8;
      const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
      i1 += 8;

      v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
      const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
      i2 += 8;

      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
      const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
      i3 += 8;
      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
      const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
      i4 += 8;
      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
      const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
      i5 += 8;
      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
      const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
      i6 += 8;

      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);

      v128_t vacc0123 = wasm_v128_load(b);
      v128_t vacc4567 = wasm_v128_load(b + 4);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));

      wasm_v128_store(b, vacc0123);
      wasm_v128_store(b + 4, vacc4567);
      b += 8;
    }
  }

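  // Final pass: advance to the last group of rows. Only 1-7 rows remain, so
  // pointers beyond the remaining count are redirected to the zero buffer and
  // add nothing to the sums.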
  i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
  i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) {
    i1 = zero;
  }
  i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) {
    i2 = zero;
  }
  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) {
    i3 = zero;
  }
  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) {
    i4 = zero;
  }
  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) {
    i5 = zero;
  }
  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) {
    i6 = zero;
  }

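  // Load the fp32 requantization parameters used to turn the 32-bit sums into
  // clamped uint8 averages (see the magic-bias sequence below).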
  const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
  const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
  const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
  const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
  const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
  for (; channels >= 8; channels -= 8) {
    const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
    i0 += 8;
    const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
    i1 += 8;

    v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
    const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
    i2 += 8;

    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
    const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
    i3 += 8;
    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
    const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
    i4 += 8;
    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
    const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
    i5 += 8;
    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
    const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
    i6 += 8;

    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);

    v128_t vacc0123 = wasm_v128_load(buffer);
    v128_t vacc4567 = wasm_v128_load(buffer + 4);
    buffer += 8;

    vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
    vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));

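    // Magic-bias requantization: convert the sums to float, scale, and add a
    // large bias so the rounded result lands in the low mantissa bits. The
    // integer max against magic_min applies the output-min clamp, and
    // subtracting magic_bias_less_output_zero_point recovers the quantized
    // value with the output zero point folded in.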
    vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
    vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);

    vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
    vacc4567 = wasm_f32x4_mul(vacc4567, vscale);

    vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
    vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);

    vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
    vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);

    vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
    vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);

    v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);

    v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);

    vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);

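    // Store all 8 output bytes at once through the low 64-bit lane.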
    *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
    output += 8;
  }
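  // Remainder: 1-7 trailing channels. A full 8-channel group is computed
  // (reading up to 7 bytes past the end, permitted by XNN_OOB_READS) and only
  // the valid output bytes are stored.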
  if XNN_UNLIKELY(channels != 0) {
    {
      const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
      i0 += 8;
      const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
      i1 += 8;

      v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
      const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
      i2 += 8;

      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
      const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
      i3 += 8;
      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
      const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
      i4 += 8;
      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
      const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
      i5 += 8;
      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
      const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
      i6 += 8;

      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);

      v128_t vacc0123 = wasm_v128_load(buffer);
      v128_t vacc4567 = wasm_v128_load(buffer + 4);
      buffer += 8;

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));

      vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
      vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);

      vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
      vacc4567 = wasm_f32x4_mul(vacc4567, vscale);

      vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
      vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);

      vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
      vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);

      vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
      vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);

      const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
      v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
      vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);

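      // Write out the final channels 4, 2, then 1 byte at a time, shifting the
      // consumed bytes out between stores.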
      if (channels & 4) {
        *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
        vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
        output += 4;
      }
      uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
      if (channels & 2) {
        *((uint16_t*) output) = (uint16_t) vout0123;
        vout0123 >>= 16;
        output += 2;
      }
      if (channels & 1) {
        *output = (uint8_t) vout0123;
      }
    }
  }
}