1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "pooling.hpp"
26 #include <cstdint>
27 #include <cstddef>
28 #include <cstring>
29 #include <cmath>
30 
31 
32 #if defined(ARM_COMPUTE_ENABLE_SVE)
33 
34 namespace arm_conv {
35 namespace pooling {
36 
namespace {
  // Fixed-point encoding of a reciprocal 1/N used to turn a sum into an
  // average: the represented value is (multiplier / 2^31) * 2^shift, with
  // shift <= 0.
  struct RescaleParams
  {
    int32_t multiplier;  // Mantissa scaled by 2^31; every entry below lies in [0.5, 1).
    int32_t shift;       // Power-of-two exponent (zero or negative).
  };

  // Precomputed reciprocals for window sizes 2..9; the entry for a window of
  // N cells lives at index N - 2.
  constexpr RescaleParams rescale_params[8] = {
    {0x40000000, 0},   // 1/2
    {0x55555556, -1},  // 1/3
    {0x40000000, -1},  // 1/4
    {0x66666666, -2},  // 1/5
    {0x55555556, -2},  // 1/6
    {0x49249249, -2},  // 1/7
    {0x40000000, -2},  // 1/8
    {0x71c71c72, -3},  // 1/9
  };

  static_assert(sizeof(rescale_params) / sizeof(rescale_params[0]) == 8,
                "rescale_params must cover window sizes 2 through 9");
}
54 
sve_s8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells,const uint64_t n_valid_cells,uint64_t n_channels,const int8_t * const * const inptrs,int8_t * outptr,const Requantize32 & qp)55 void sve_s8q_nhwc_avg_generic_depthfirst_impl(
56   const uint64_t window_cells,
57   const uint64_t n_valid_cells,
58   uint64_t n_channels,
59   const int8_t *const *const inptrs,
60   int8_t *outptr,
61   const Requantize32 &qp
62 )
63 {
64   if (n_valid_cells == 1 && window_cells == 1)
65   {
66     // In this case, simply copy from the input to the output
67     std::memcpy(outptr, *inptrs, n_channels);
68     return;
69   }
70 
71   // Compute (or look up) the rescale values
72   int32_t shift_value = 0, rescale_value = 0;
73   if (2 <= window_cells && window_cells <= 9)
74   {
75     auto &params = rescale_params[window_cells - 2];
76     rescale_value = params.multiplier;
77     shift_value = params.shift;
78   }
79   else
80   {
81     auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
82 
83     shift_value = 0;
84     while (f_rescale_value < 0.5f)
85     {
86       shift_value--;
87       f_rescale_value *= 2.0f;
88     }
89 
90     int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
91     if (long_rescale_value == (1ll << 31))
92     {
93       shift_value++;
94       long_rescale_value >>= 1;
95     }
96     rescale_value = static_cast<int32_t>(long_rescale_value);
97   }
98 
99   // Combine together the rescale value for the requantization and the scaling
100   // factor for the average pool.
101   const int32_t shift = qp.per_layer_left_shift - qp.per_layer_right_shift + shift_value;
102   const int32_t left_shift = shift > 0 ? shift : 0;
103   const int32_t right_shift = shift <= 0 ? shift : 0;
104 
105   int32_t combined_rescale_value = 0;
106   __asm__ __volatile__ (
107       "mov v16.s[0], %w[per_layer_mul]\n"
108       "mov v17.s[0], %w[rescale_value]\n"
109       "sqrdmulh s18, s16, s17\n"
110       "mov %w[combined_rescale_value], v18.s[0]\n"
111     : [combined_rescale_value] "=r" (combined_rescale_value)
112     : [per_layer_mul] "r" (qp.per_layer_mul), [rescale_value] "r" (rescale_value)
113     : "v16", "v17", "v18"
114   );
115 
116   __asm__ __volatile__(
117     "mov x26, #0x0\n"
118     "cntb x25\n"
119     "cntb x24, ALL, MUL #2\n"
120     "cntb x23, ALL, MUL #3\n"
121     "whilelt p4.b, x26, %x[n_channels]\n"
122     "whilelt p3.b, x25, %x[n_channels]\n"
123     "whilelt p2.b, x24, %x[n_channels]\n"
124     "whilelt p1.b, x23, %x[n_channels]\n"
125     "ptrue p0.b\n"
126     "b.none 7f\n"
127     "1:"  // 4-vectors of channels
128     "lsr x22, %x[n_valid_cells], #0x1\n"
129     "mov z15.s, #0x0\n"
130     "mov z14.s, #0x0\n"
131     "mov x19, %x[inptrs]\n"
132     "mov z13.s, #0x0\n"
133     "mov z12.s, #0x0\n"
134     "mov z11.s, #0x0\n"
135     "mov z10.s, #0x0\n"
136     "mov z9.s, #0x0\n"
137     "mov z8.s, #0x0\n"
138     "mov z7.s, #0x0\n"
139     "mov z6.s, #0x0\n"
140     "mov z5.s, #0x0\n"
141     "mov z4.s, #0x0\n"
142     "mov z3.s, #0x0\n"
143     "mov z2.s, #0x0\n"
144     "mov z1.s, #0x0\n"
145     "mov z0.s, #0x0\n"
146     "cbz x22, 4f\n"
147     "ldp x21, x20, [x19, #0x0]\n"
148     "subs x22, x22, #0x1\n"
149     "add x19, x19, #0x10\n"
150     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
151     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
152     "ld1b { z29.b }, p3/Z, [x21, x25]\n"
153     "ld1b { z28.b }, p3/Z, [x20, x25]\n"
154     "ld1b { z27.b }, p2/Z, [x21, x24]\n"
155     "ld1b { z26.b }, p2/Z, [x20, x24]\n"
156     "ld1b { z25.b }, p1/Z, [x21, x23]\n"
157     "ld1b { z24.b }, p1/Z, [x20, x23]\n"
158     "beq 3f\n"
159     "2:"  // 4-vectors of channels: 2 inputs loop
160     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
161     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
162     "ldp x21, x20, [x19, #0x0]\n"
163     "subs x22, x22, #0x1\n"
164     ".inst 0x455c03b5  // saddlb z21.h, z29.b, z28.b\n"
165     ".inst 0x455c07b4  // saddlt z20.h, z29.b, z28.b\n"
166     "add x19, x19, #0x10\n"
167     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
168     ".inst 0x455a0373  // saddlb z19.h, z27.b, z26.b\n"
169     ".inst 0x455a0772  // saddlt z18.h, z27.b, z26.b\n"
170     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
171     "ld1b { z29.b }, p3/Z, [x21, x25]\n"
172     ".inst 0x45580331  // saddlb z17.h, z25.b, z24.b\n"
173     ".inst 0x45580730  // saddlt z16.h, z25.b, z24.b\n"
174     "ld1b { z28.b }, p3/Z, [x20, x25]\n"
175     "ld1b { z27.b }, p2/Z, [x21, x24]\n"
176     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
177     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
178     "ld1b { z26.b }, p2/Z, [x20, x24]\n"
179     "ld1b { z25.b }, p1/Z, [x21, x23]\n"
180     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
181     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
182     "ld1b { z24.b }, p1/Z, [x20, x23]\n"
183     ".inst 0x4595416b  // saddwb z11.s, z11.s, z21.h\n"
184     ".inst 0x4595454a  // saddwt z10.s, z10.s, z21.h\n"
185     ".inst 0x45944129  // saddwb z9.s, z9.s, z20.h\n"
186     ".inst 0x45944508  // saddwt z8.s, z8.s, z20.h\n"
187     ".inst 0x459340e7  // saddwb z7.s, z7.s, z19.h\n"
188     ".inst 0x459344c6  // saddwt z6.s, z6.s, z19.h\n"
189     ".inst 0x459240a5  // saddwb z5.s, z5.s, z18.h\n"
190     ".inst 0x45924484  // saddwt z4.s, z4.s, z18.h\n"
191     ".inst 0x45914063  // saddwb z3.s, z3.s, z17.h\n"
192     ".inst 0x45914442  // saddwt z2.s, z2.s, z17.h\n"
193     ".inst 0x45904021  // saddwb z1.s, z1.s, z16.h\n"
194     ".inst 0x45904400  // saddwt z0.s, z0.s, z16.h\n"
195     "bgt 2b\n"
196     "3:"  // 4-vectors of channels: 2 inputs tail
197     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
198     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
199     ".inst 0x455c03b5  // saddlb z21.h, z29.b, z28.b\n"
200     ".inst 0x455c07b4  // saddlt z20.h, z29.b, z28.b\n"
201     ".inst 0x455a0373  // saddlb z19.h, z27.b, z26.b\n"
202     ".inst 0x455a0772  // saddlt z18.h, z27.b, z26.b\n"
203     ".inst 0x45580331  // saddlb z17.h, z25.b, z24.b\n"
204     ".inst 0x45580730  // saddlt z16.h, z25.b, z24.b\n"
205     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
206     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
207     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
208     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
209     ".inst 0x4595416b  // saddwb z11.s, z11.s, z21.h\n"
210     ".inst 0x4595454a  // saddwt z10.s, z10.s, z21.h\n"
211     ".inst 0x45944129  // saddwb z9.s, z9.s, z20.h\n"
212     ".inst 0x45944508  // saddwt z8.s, z8.s, z20.h\n"
213     ".inst 0x459340e7  // saddwb z7.s, z7.s, z19.h\n"
214     ".inst 0x459344c6  // saddwt z6.s, z6.s, z19.h\n"
215     ".inst 0x459240a5  // saddwb z5.s, z5.s, z18.h\n"
216     ".inst 0x45924484  // saddwt z4.s, z4.s, z18.h\n"
217     ".inst 0x45914063  // saddwb z3.s, z3.s, z17.h\n"
218     ".inst 0x45914442  // saddwt z2.s, z2.s, z17.h\n"
219     ".inst 0x45904021  // saddwb z1.s, z1.s, z16.h\n"
220     ".inst 0x45904400  // saddwt z0.s, z0.s, z16.h\n"
221     "4:"  // 4-vectors of channels: After loop
222     "ands x20, %x[n_valid_cells], #0x1\n"
223     "beq 6f\n"
224     "5:"  // 4-vectors of channels: Single input loop
225     "ldr x21, [x19], #0x8\n"
226     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
227     ".inst 0x4508a3f7  // sshllb z23.h, z31.b, #0x0\n"
228     ".inst 0x4508a7f6  // sshllt z22.h, z31.b, #0x0\n"
229     "ld1b { z29.b }, p3/Z, [x21, x25]\n"
230     "ld1b { z27.b }, p2/Z, [x21, x24]\n"
231     ".inst 0x4508a3b5  // sshllb z21.h, z29.b, #0x0\n"
232     ".inst 0x4508a7b4  // sshllt z20.h, z29.b, #0x0\n"
233     "ld1b { z25.b }, p1/Z, [x21, x23]\n"
234     ".inst 0x4508a373  // sshllb z19.h, z27.b, #0x0\n"
235     ".inst 0x4508a772  // sshllt z18.h, z27.b, #0x0\n"
236     "subs x20, x20, #0x1\n"
237     ".inst 0x4508a331  // sshllb z17.h, z25.b, #0x0\n"
238     ".inst 0x4508a730  // sshllt z16.h, z25.b, #0x0\n"
239     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
240     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
241     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
242     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
243     ".inst 0x4595416b  // saddwb z11.s, z11.s, z21.h\n"
244     ".inst 0x4595454a  // saddwt z10.s, z10.s, z21.h\n"
245     ".inst 0x45944129  // saddwb z9.s, z9.s, z20.h\n"
246     ".inst 0x45944508  // saddwt z8.s, z8.s, z20.h\n"
247     ".inst 0x459340e7  // saddwb z7.s, z7.s, z19.h\n"
248     ".inst 0x459344c6  // saddwt z6.s, z6.s, z19.h\n"
249     ".inst 0x459240a5  // saddwb z5.s, z5.s, z18.h\n"
250     ".inst 0x45924484  // saddwt z4.s, z4.s, z18.h\n"
251     ".inst 0x45914063  // saddwb z3.s, z3.s, z17.h\n"
252     ".inst 0x45914442  // saddwt z2.s, z2.s, z17.h\n"
253     ".inst 0x45904021  // saddwb z1.s, z1.s, z16.h\n"
254     ".inst 0x45904400  // saddwt z0.s, z0.s, z16.h\n"
255     "bgt 5b\n"
256     "6:"  // 4-vectors of channels: Single input loop: End
257     "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
258     "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
259     ".inst 0x4482824f  // srshl z15.s, p0/M, z15.s, z18.s\n"
260     ".inst 0x4482824e  // srshl z14.s, p0/M, z14.s, z18.s\n"
261     ".inst 0x4482824d  // srshl z13.s, p0/M, z13.s, z18.s\n"
262     ".inst 0x4482824c  // srshl z12.s, p0/M, z12.s, z18.s\n"
263     "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
264     ".inst 0x04b175ef  // sqrdmulh z15.s, z15.s, z17.s\n"
265     ".inst 0x4482824b  // srshl z11.s, p0/M, z11.s, z18.s\n"
266     ".inst 0x4482824a  // srshl z10.s, p0/M, z10.s, z18.s\n"
267     ".inst 0x04b175ce  // sqrdmulh z14.s, z14.s, z17.s\n"
268     ".inst 0x04b175ad  // sqrdmulh z13.s, z13.s, z17.s\n"
269     ".inst 0x44828249  // srshl z9.s, p0/M, z9.s, z18.s\n"
270     ".inst 0x44828248  // srshl z8.s, p0/M, z8.s, z18.s\n"
271     ".inst 0x04b1758c  // sqrdmulh z12.s, z12.s, z17.s\n"
272     ".inst 0x04b1756b  // sqrdmulh z11.s, z11.s, z17.s\n"
273     ".inst 0x44828247  // srshl z7.s, p0/M, z7.s, z18.s\n"
274     ".inst 0x44828246  // srshl z6.s, p0/M, z6.s, z18.s\n"
275     ".inst 0x04b1754a  // sqrdmulh z10.s, z10.s, z17.s\n"
276     ".inst 0x04b17529  // sqrdmulh z9.s, z9.s, z17.s\n"
277     ".inst 0x44828245  // srshl z5.s, p0/M, z5.s, z18.s\n"
278     ".inst 0x44828244  // srshl z4.s, p0/M, z4.s, z18.s\n"
279     ".inst 0x04b17508  // sqrdmulh z8.s, z8.s, z17.s\n"
280     ".inst 0x04b174e7  // sqrdmulh z7.s, z7.s, z17.s\n"
281     ".inst 0x44828243  // srshl z3.s, p0/M, z3.s, z18.s\n"
282     ".inst 0x44828242  // srshl z2.s, p0/M, z2.s, z18.s\n"
283     ".inst 0x04b174c6  // sqrdmulh z6.s, z6.s, z17.s\n"
284     ".inst 0x04b174a5  // sqrdmulh z5.s, z5.s, z17.s\n"
285     ".inst 0x44828241  // srshl z1.s, p0/M, z1.s, z18.s\n"
286     ".inst 0x44828240  // srshl z0.s, p0/M, z0.s, z18.s\n"
287     ".inst 0x04b17484  // sqrdmulh z4.s, z4.s, z17.s\n"
288     ".inst 0x04b17463  // sqrdmulh z3.s, z3.s, z17.s\n"
289     ".inst 0x04b17442  // sqrdmulh z2.s, z2.s, z17.s\n"
290     ".inst 0x04b17421  // sqrdmulh z1.s, z1.s, z17.s\n"
291     ".inst 0x4482820f  // srshl z15.s, p0/M, z15.s, z16.s\n"
292     ".inst 0x4482820e  // srshl z14.s, p0/M, z14.s, z16.s\n"
293     ".inst 0x04b17400  // sqrdmulh z0.s, z0.s, z17.s\n"
294     "mov z18.s, #0x7f\n"
295     ".inst 0x4482820d  // srshl z13.s, p0/M, z13.s, z16.s\n"
296     ".inst 0x4482820c  // srshl z12.s, p0/M, z12.s, z16.s\n"
297     ".inst 0x4482820b  // srshl z11.s, p0/M, z11.s, z16.s\n"
298     ".inst 0x4482820a  // srshl z10.s, p0/M, z10.s, z16.s\n"
299     ".inst 0x44828209  // srshl z9.s, p0/M, z9.s, z16.s\n"
300     ".inst 0x44828208  // srshl z8.s, p0/M, z8.s, z16.s\n"
301     ".inst 0x44828207  // srshl z7.s, p0/M, z7.s, z16.s\n"
302     ".inst 0x44828206  // srshl z6.s, p0/M, z6.s, z16.s\n"
303     ".inst 0x44828205  // srshl z5.s, p0/M, z5.s, z16.s\n"
304     ".inst 0x44828204  // srshl z4.s, p0/M, z4.s, z16.s\n"
305     ".inst 0x44828203  // srshl z3.s, p0/M, z3.s, z16.s\n"
306     ".inst 0x44828202  // srshl z2.s, p0/M, z2.s, z16.s\n"
307     ".inst 0x44828201  // srshl z1.s, p0/M, z1.s, z16.s\n"
308     ".inst 0x44828200  // srshl z0.s, p0/M, z0.s, z16.s\n"
309     "not z16.s, p0/M, z18.s\n"
310     "smax z15.s, p0/M, z15.s, z16.s\n"
311     "smax z14.s, p0/M, z14.s, z16.s\n"
312     "smax z13.s, p0/M, z13.s, z16.s\n"
313     "smax z12.s, p0/M, z12.s, z16.s\n"
314     "smax z11.s, p0/M, z11.s, z16.s\n"
315     "smax z10.s, p0/M, z10.s, z16.s\n"
316     "smax z9.s, p0/M, z9.s, z16.s\n"
317     "smax z8.s, p0/M, z8.s, z16.s\n"
318     "smax z7.s, p0/M, z7.s, z16.s\n"
319     "smax z6.s, p0/M, z6.s, z16.s\n"
320     "smax z5.s, p0/M, z5.s, z16.s\n"
321     "smax z4.s, p0/M, z4.s, z16.s\n"
322     "smax z3.s, p0/M, z3.s, z16.s\n"
323     "smax z2.s, p0/M, z2.s, z16.s\n"
324     "smax z1.s, p0/M, z1.s, z16.s\n"
325     "smax z0.s, p0/M, z0.s, z16.s\n"
326     "smin z15.s, p0/M, z15.s, z18.s\n"
327     "smin z14.s, p0/M, z14.s, z18.s\n"
328     "smin z13.s, p0/M, z13.s, z18.s\n"
329     "trn1 z17.h, z15.h, z14.h\n"
330     "smin z12.s, p0/M, z12.s, z18.s\n"
331     "smin z11.s, p0/M, z11.s, z18.s\n"
332     "trn1 z16.h, z13.h, z12.h\n"
333     "trn1 z16.b, z17.b, z16.b\n"
334     "smin z10.s, p0/M, z10.s, z18.s\n"
335     "smin z9.s, p0/M, z9.s, z18.s\n"
336     "trn1 z17.h, z11.h, z10.h\n"
337     "st1b { z16.b }, p4, [%x[outptr], x26]\n"
338     "smin z8.s, p0/M, z8.s, z18.s\n"
339     "smin z7.s, p0/M, z7.s, z18.s\n"
340     "trn1 z16.h, z9.h, z8.h\n"
341     "trn1 z16.b, z17.b, z16.b\n"
342     "smin z6.s, p0/M, z6.s, z18.s\n"
343     "smin z5.s, p0/M, z5.s, z18.s\n"
344     "trn1 z17.h, z7.h, z6.h\n"
345     "st1b { z16.b }, p3, [%x[outptr], x25]\n"
346     "smin z4.s, p0/M, z4.s, z18.s\n"
347     "smin z3.s, p0/M, z3.s, z18.s\n"
348     "trn1 z16.h, z5.h, z4.h\n"
349     "trn1 z16.b, z17.b, z16.b\n"
350     "smin z2.s, p0/M, z2.s, z18.s\n"
351     "smin z1.s, p0/M, z1.s, z18.s\n"
352     "trn1 z17.h, z3.h, z2.h\n"
353     "st1b { z16.b }, p2, [%x[outptr], x24]\n"
354     "smin z0.s, p0/M, z0.s, z18.s\n"
355     "trn1 z16.h, z1.h, z0.h\n"
356     "trn1 z16.b, z17.b, z16.b\n"
357     "st1b { z16.b }, p1, [%x[outptr], x23]\n"
358     "incb x23, ALL, MUL #4\n"
359     "whilelt p1.b, x23, %x[n_channels]\n"
360     "incb x26, ALL, MUL #4\n"
361     "incb x25, ALL, MUL #4\n"
362     "incb x24, ALL, MUL #4\n"
363     "b.any 1b\n"
364     "7:"  // Single vector of channels
365     "whilelt p4.b, x26, %x[n_channels]\n"
366     "b.none 14f\n"
367     "8:"  // Single vector of channels: Loop
368     "lsr x22, %x[n_valid_cells], #0x1\n"
369     "mov z15.s, #0x0\n"
370     "mov z14.s, #0x0\n"
371     "mov x19, %x[inptrs]\n"
372     "mov z13.s, #0x0\n"
373     "mov z12.s, #0x0\n"
374     "cbz x22, 11f\n"
375     "ldp x21, x20, [x19, #0x0]\n"
376     "subs x22, x22, #0x1\n"
377     "add x19, x19, #0x10\n"
378     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
379     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
380     "beq 10f\n"
381     "9:"  // Single vector of channels: Loop: 2 inputs loop
382     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
383     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
384     "ldp x21, x20, [x19, #0x0]\n"
385     "subs x22, x22, #0x1\n"
386     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
387     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
388     "add x19, x19, #0x10\n"
389     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
390     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
391     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
392     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
393     "bgt 9b\n"
394     "10:"  // Single vector of channels: Loop: 2 inputs tail
395     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
396     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
397     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
398     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
399     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
400     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
401     "11:"  // Single vector of channels: Loop: After loop
402     "ands x20, %x[n_valid_cells], #0x1\n"
403     "beq 13f\n"
404     "12:"  // Single vector of channels: Loop: Single input loop
405     "ldr x21, [x19], #0x8\n"
406     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
407     ".inst 0x4508a3f7  // sshllb z23.h, z31.b, #0x0\n"
408     ".inst 0x4508a7f6  // sshllt z22.h, z31.b, #0x0\n"
409     "subs x20, x20, #0x1\n"
410     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
411     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
412     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
413     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
414     "bgt 12b\n"
415     "13:"  // Single vector of channels: Loop: Single input loop: End
416     "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
417     "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
418     ".inst 0x4482824f  // srshl z15.s, p0/M, z15.s, z18.s\n"
419     ".inst 0x4482824e  // srshl z14.s, p0/M, z14.s, z18.s\n"
420     ".inst 0x4482824d  // srshl z13.s, p0/M, z13.s, z18.s\n"
421     ".inst 0x4482824c  // srshl z12.s, p0/M, z12.s, z18.s\n"
422     "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
423     ".inst 0x04b175ef  // sqrdmulh z15.s, z15.s, z17.s\n"
424     ".inst 0x04b175ce  // sqrdmulh z14.s, z14.s, z17.s\n"
425     ".inst 0x04b175ad  // sqrdmulh z13.s, z13.s, z17.s\n"
426     ".inst 0x4482820f  // srshl z15.s, p0/M, z15.s, z16.s\n"
427     ".inst 0x4482820e  // srshl z14.s, p0/M, z14.s, z16.s\n"
428     ".inst 0x04b1758c  // sqrdmulh z12.s, z12.s, z17.s\n"
429     "mov z18.s, #0x7f\n"
430     ".inst 0x4482820d  // srshl z13.s, p0/M, z13.s, z16.s\n"
431     ".inst 0x4482820c  // srshl z12.s, p0/M, z12.s, z16.s\n"
432     "not z16.s, p0/M, z18.s\n"
433     "smax z15.s, p0/M, z15.s, z16.s\n"
434     "smax z14.s, p0/M, z14.s, z16.s\n"
435     "smax z13.s, p0/M, z13.s, z16.s\n"
436     "smax z12.s, p0/M, z12.s, z16.s\n"
437     "smin z15.s, p0/M, z15.s, z18.s\n"
438     "smin z14.s, p0/M, z14.s, z18.s\n"
439     "smin z13.s, p0/M, z13.s, z18.s\n"
440     "trn1 z17.h, z15.h, z14.h\n"
441     "smin z12.s, p0/M, z12.s, z18.s\n"
442     "trn1 z16.h, z13.h, z12.h\n"
443     "trn1 z16.b, z17.b, z16.b\n"
444     "st1b { z16.b }, p4, [%x[outptr], x26]\n"
445     "incb x26\n"
446     "whilelt p4.b, x26, %x[n_channels]\n"
447     "b.any 8b\n"
448     "14:"  // End
449     :
450     : [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [right_shift] "r" (&right_shift)
451     : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
452   );
453 }
454 
455 }  // namespace pooling
456 }  // namespace arm_conv
457 
458 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
459