// Auto-generated file. Do not edit!
//   Template: src/f16-gavgpool/multipass-neonfp16arith.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/math.h>

void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24(
    size_t rows,
    size_t channels,
    const void* input,
    size_t input_stride,
    const void* zero,
    void* buffer,
    void* output,
    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

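  // Set up pointers to the first 7 input rows; consecutive rows are input_stride bytes apart.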
  const __fp16* i0 = input;
  const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
  const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
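  // Each pass reads round_up_po2(channels, 8) half-floats per row, so adding input_increment
  // backs out that advance and moves every row pointer 7 rows forward for the next pass.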
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(__fp16);

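  // First pass: sum the first 7 rows into the scratch buffer, 24 channels per iteration.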
  __fp16* b = buffer;
  size_t c = channels;
  for (; c >= 24; c -= 24) {
    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
    const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
    const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
    const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
    const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;

    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
    float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
    const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
    float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
    const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
    float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN);

    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
    const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
    const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
    const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
    const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
    const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
    const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
    const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
    const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);

    vst1q_f16(b, vacc01234567); b += 8;
    vst1q_f16(b, vacc89ABCDEF); b += 8;
    vst1q_f16(b, vaccGHIJKLMN); b += 8;
  }
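  // Remainder channels of the first pass, 8 at a time (the kernel is declared XNN_OOB_READS,
  // so the last partial group may read past the requested channel count).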
  if XNN_UNLIKELY(c != 0) {
    do {
      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
      float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);

      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);

      vst1q_f16(b, vacc01234567); b += 8;

      c = doz(c, 8);
    } while (c != 0);
  }

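  // Intermediate passes: while more than 7 rows remain, accumulate another 7 rows
  // on top of the partial sums already stored in the scratch buffer.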
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
    i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
    i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
    i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
    i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
    i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
    i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);

    __fp16* b = buffer;
    size_t c = channels;
    for (; c >= 24; c -= 24) {
      float16x8_t vacc01234567 = vld1q_f16(b);
      float16x8_t vacc89ABCDEF = vld1q_f16(b + 8);
      float16x8_t vaccGHIJKLMN = vld1q_f16(b + 16);

      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
      const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;

      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
      const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN);
      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
      const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN);
      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
      const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
      const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
      const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
      const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
      const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
      const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
      const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);

      vst1q_f16(b, vacc01234567); b += 8;
      vst1q_f16(b, vacc89ABCDEF); b += 8;
      vst1q_f16(b, vaccGHIJKLMN); b += 8;
    }
    if XNN_UNLIKELY(c != 0) {
      do {
        float16x8_t vacc01234567 = vld1q_f16(b);
        const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;

        const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
        vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
        const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
        vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
        const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
        vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
        const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
        vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
        const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
        vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
        const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
        vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
        vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);

        vst1q_f16(b, vacc01234567); b += 8;

        c = doz(c, 8);
      } while (c != 0);
    }
  }

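  // Final pass: 1-7 rows remain. Advance the row pointers one last time and redirect any
  // pointer past the remaining rows to the zero vector so it contributes nothing to the sum.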
  i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
  i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) {
    i1 = (const __fp16*) zero;
  }
  i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) {
    i2 = (const __fp16*) zero;
  }
  i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) {
    i3 = (const __fp16*) zero;
  }
  i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) {
    i4 = (const __fp16*) zero;
  }
  i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) {
    i5 = (const __fp16*) zero;
  }
  i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) {
    i6 = (const __fp16*) zero;
  }

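  // Broadcast the scale factor and the output min/max clamping bounds from params.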
  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
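  // Final-pass main loop: add the remaining rows to the buffered sums, then scale, clamp,
  // and write 24 output channels per iteration.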
  for (; channels >= 24; channels -= 24) {
    float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
    float16x8_t vacc89ABCDEF = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
    float16x8_t vaccGHIJKLMN = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;

    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
    const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
    const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;

    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
    vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
    const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
    const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN);
    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
    vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
    const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
    const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN);
    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
    const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
    const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
    const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
    const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
    const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
    const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
    const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
    const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);

    vacc01234567 = vmulq_f16(vacc01234567, vscale);
    vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
    vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale);

    vacc01234567 = vmaxq_f16(vacc01234567, vmin);
    vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
    vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin);

    vacc01234567 = vminq_f16(vacc01234567, vmax);
    vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
    vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax);

    vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
    vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
    vst1q_f16(output, vaccGHIJKLMN); output = (__fp16*) output + 8;
  }
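  // Remainder channels of the final pass: process 8 at a time, and for the last partial
  // group store 4/2/1 lanes so that exactly `channels` outputs are written.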
  if XNN_UNLIKELY(channels != 0) {
    do {
      float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;

      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);

      vacc01234567 = vmulq_f16(vacc01234567, vscale);
      vacc01234567 = vmaxq_f16(vacc01234567, vmin);
      vacc01234567 = vminq_f16(vacc01234567, vmax);

      if XNN_LIKELY(channels >= 8) {
        vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
        channels -= 8;
      } else {
        float16x4_t vacc0123 = vget_low_f16(vacc01234567);
        if (channels & 4) {
          vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
          vacc0123 = vget_high_f16(vacc01234567);
        }
        if (channels & 2) {
          vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
          vacc0123 = vext_f16(vacc0123, vacc0123, 2);
        }
        if (channels & 1) {
          vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
        }
        channels = 0;
      }
    } while (channels != 0);
  }
}