1 /*
2  * Copyright (c) 2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #if defined(ARM_COMPUTE_ENABLE_SME)
26 
27 #include "src/core/NEON/kernels/assembly/pooling.hpp"
28 #include <cstdint>
29 #include <cstring>
30 #include <cmath>
31 
32 
33 namespace arm_conv {
34 namespace pooling {
35 
namespace {
  // Fixed-point encoding of a reciprocal: the represented factor is
  // (multiplier / 2^31) * 2^shift, where shift is zero or negative.
  struct RescaleParams
  {
    int32_t multiplier;
    int32_t shift;
  };

  // Precomputed reciprocals for pooling windows of 2 to 9 cells; the entry at
  // index i encodes 1 / (i + 2).
  constexpr RescaleParams rescale_params[8] = {
    { 0x40000000,  0 },  // 1/2 = (1/2) * 2^0
    { 0x55555556, -1 },  // 1/3 = (2/3) * 2^-1
    { 0x40000000, -1 },  // 1/4 = (1/2) * 2^-1
    { 0x66666666, -2 },  // 1/5 = (4/5) * 2^-2
    { 0x55555556, -2 },  // 1/6 = (2/3) * 2^-2
    { 0x49249249, -2 },  // 1/7 = (4/7) * 2^-2
    { 0x40000000, -2 },  // 1/8 = (1/2) * 2^-2
    { 0x71c71c72, -3 },  // 1/9 = (8/9) * 2^-3
  };
}
53 
sme_s8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells,const uint64_t n_valid_cells,uint64_t n_channels,const int8_t * const * const inptrs,int8_t * outptr,const Requantize32 & qp)54 void sme_s8q_nhwc_avg_generic_depthfirst_impl(
55   const uint64_t window_cells,
56   const uint64_t n_valid_cells,
57   uint64_t n_channels,
58   const int8_t *const *const inptrs,
59   int8_t *outptr,
60   const Requantize32 &qp
61 )
62 {
63   if (n_valid_cells == 1 && window_cells == 1)
64   {
65     // In this case, simply copy from the input to the output
66     std::memcpy(outptr, *inptrs, n_channels);
67     return;
68   }
69 
70   // Compute (or look up) the rescale values
71   int32_t shift_value = 0, rescale_value = 0;
72   if (2 <= window_cells && window_cells <= 9)
73   {
74     auto &params = rescale_params[window_cells - 2];
75     rescale_value = params.multiplier;
76     shift_value = params.shift;
77   }
78   else
79   {
80     auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
81 
82     shift_value = 0;
83     while (f_rescale_value < 0.5f)
84     {
85       shift_value--;
86       f_rescale_value *= 2.0f;
87     }
88 
89     rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
90     if (static_cast<int64_t>(rescale_value) == (1ll << 31))
91     {
92       shift_value++;
93       rescale_value >>= 1;
94     }
95   }
96 
97   // Combine together the rescale value for the requantization and the scaling
98   // factor for the average pool.
99   const int32_t shift = qp.per_layer_left_shift - qp.per_layer_right_shift + shift_value;
100   const int32_t left_shift = shift > 0 ? shift : 0;
101   const int32_t right_shift = shift <= 0 ? shift : 0;
102 
103   int32_t combined_rescale_value = 0;
104   __asm__ __volatile__ (
105       "mov v16.s[0], %w[per_layer_mul]\n"
106       "mov v17.s[0], %w[rescale_value]\n"
107       "sqrdmulh s18, s16, s17\n"
108       "mov %w[combined_rescale_value], v18.s[0]\n"
109     : [combined_rescale_value] "=r" (combined_rescale_value)
110     : [per_layer_mul] "r" (qp.per_layer_mul), [rescale_value] "r" (rescale_value)
111     : "v16", "v17", "v18"
112   );
113 
114   __asm__ __volatile__(
115     ".inst 0xd503477f  // SMSTART ZA\n"
116     "mov x26, #0x0\n"
117     "cntb x25\n"
118     "cntb x24, ALL, MUL #2\n"
119     "cntb x23, ALL, MUL #3\n"
120     "whilelt p4.b, x26, %x[n_channels]\n"
121     "whilelt p3.b, x25, %x[n_channels]\n"
122     "whilelt p2.b, x24, %x[n_channels]\n"
123     "whilelt p1.b, x23, %x[n_channels]\n"
124     "ptrue p0.b\n"
125     "b.none 7f\n"
126     "1:"  // 4-vectors of channels
127     "lsr x22, %x[n_valid_cells], #0x1\n"
128     "mov z15.s, #0x0\n"
129     "mov z14.s, #0x0\n"
130     "mov x19, %x[inptrs]\n"
131     "mov z13.s, #0x0\n"
132     "mov z12.s, #0x0\n"
133     "mov z11.s, #0x0\n"
134     "mov z10.s, #0x0\n"
135     "mov z9.s, #0x0\n"
136     "mov z8.s, #0x0\n"
137     "mov z7.s, #0x0\n"
138     "mov z6.s, #0x0\n"
139     "mov z5.s, #0x0\n"
140     "mov z4.s, #0x0\n"
141     "mov z3.s, #0x0\n"
142     "mov z2.s, #0x0\n"
143     "mov z1.s, #0x0\n"
144     "mov z0.s, #0x0\n"
145     "cbz x22, 4f\n"
146     "ldp x21, x20, [x19, #0x0]\n"
147     "subs x22, x22, #0x1\n"
148     "add x19, x19, #0x10\n"
149     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
150     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
151     "ld1b { z29.b }, p3/Z, [x21, x25]\n"
152     "ld1b { z28.b }, p3/Z, [x20, x25]\n"
153     "ld1b { z27.b }, p2/Z, [x21, x24]\n"
154     "ld1b { z26.b }, p2/Z, [x20, x24]\n"
155     "ld1b { z25.b }, p1/Z, [x21, x23]\n"
156     "ld1b { z24.b }, p1/Z, [x20, x23]\n"
157     "beq 3f\n"
158     "2:"  // 4-vectors of channels: 2 inputs loop
159     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
160     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
161     "ldp x21, x20, [x19, #0x0]\n"
162     "subs x22, x22, #0x1\n"
163     ".inst 0x455c03b5  // saddlb z21.h, z29.b, z28.b\n"
164     ".inst 0x455c07b4  // saddlt z20.h, z29.b, z28.b\n"
165     "add x19, x19, #0x10\n"
166     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
167     ".inst 0x455a0373  // saddlb z19.h, z27.b, z26.b\n"
168     ".inst 0x455a0772  // saddlt z18.h, z27.b, z26.b\n"
169     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
170     ".inst 0x45580331  // saddlb z17.h, z25.b, z24.b\n"
171     ".inst 0x45580730  // saddlt z16.h, z25.b, z24.b\n"
172     "ld1b { z29.b }, p3/Z, [x21, x25]\n"
173     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
174     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
175     "ld1b { z28.b }, p3/Z, [x20, x25]\n"
176     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
177     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
178     "ld1b { z27.b }, p2/Z, [x21, x24]\n"
179     ".inst 0x4595416b  // saddwb z11.s, z11.s, z21.h\n"
180     ".inst 0x4595454a  // saddwt z10.s, z10.s, z21.h\n"
181     "ld1b { z26.b }, p2/Z, [x20, x24]\n"
182     ".inst 0x45944129  // saddwb z9.s, z9.s, z20.h\n"
183     ".inst 0x45944508  // saddwt z8.s, z8.s, z20.h\n"
184     "ld1b { z25.b }, p1/Z, [x21, x23]\n"
185     ".inst 0x459340e7  // saddwb z7.s, z7.s, z19.h\n"
186     ".inst 0x459344c6  // saddwt z6.s, z6.s, z19.h\n"
187     "ld1b { z24.b }, p1/Z, [x20, x23]\n"
188     ".inst 0x459240a5  // saddwb z5.s, z5.s, z18.h\n"
189     ".inst 0x45924484  // saddwt z4.s, z4.s, z18.h\n"
190     ".inst 0x45914063  // saddwb z3.s, z3.s, z17.h\n"
191     ".inst 0x45914442  // saddwt z2.s, z2.s, z17.h\n"
192     ".inst 0x45904021  // saddwb z1.s, z1.s, z16.h\n"
193     ".inst 0x45904400  // saddwt z0.s, z0.s, z16.h\n"
194     "bgt 2b\n"
195     "3:"  // 4-vectors of channels: 2 inputs tail
196     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
197     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
198     ".inst 0x455c03b5  // saddlb z21.h, z29.b, z28.b\n"
199     ".inst 0x455c07b4  // saddlt z20.h, z29.b, z28.b\n"
200     ".inst 0x455a0373  // saddlb z19.h, z27.b, z26.b\n"
201     ".inst 0x455a0772  // saddlt z18.h, z27.b, z26.b\n"
202     ".inst 0x45580331  // saddlb z17.h, z25.b, z24.b\n"
203     ".inst 0x45580730  // saddlt z16.h, z25.b, z24.b\n"
204     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
205     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
206     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
207     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
208     ".inst 0x4595416b  // saddwb z11.s, z11.s, z21.h\n"
209     ".inst 0x4595454a  // saddwt z10.s, z10.s, z21.h\n"
210     ".inst 0x45944129  // saddwb z9.s, z9.s, z20.h\n"
211     ".inst 0x45944508  // saddwt z8.s, z8.s, z20.h\n"
212     ".inst 0x459340e7  // saddwb z7.s, z7.s, z19.h\n"
213     ".inst 0x459344c6  // saddwt z6.s, z6.s, z19.h\n"
214     ".inst 0x459240a5  // saddwb z5.s, z5.s, z18.h\n"
215     ".inst 0x45924484  // saddwt z4.s, z4.s, z18.h\n"
216     ".inst 0x45914063  // saddwb z3.s, z3.s, z17.h\n"
217     ".inst 0x45914442  // saddwt z2.s, z2.s, z17.h\n"
218     ".inst 0x45904021  // saddwb z1.s, z1.s, z16.h\n"
219     ".inst 0x45904400  // saddwt z0.s, z0.s, z16.h\n"
220     "4:"  // 4-vectors of channels: After loop
221     "ands x20, %x[n_valid_cells], #0x1\n"
222     "beq 6f\n"
223     "5:"  // 4-vectors of channels: Single input loop
224     "ldr x21, [x19], #0x8\n"
225     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
226     ".inst 0x4508a3f7  // sshllb z23.h, z31.b, #0x0\n"
227     ".inst 0x4508a7f6  // sshllt z22.h, z31.b, #0x0\n"
228     "ld1b { z29.b }, p3/Z, [x21, x25]\n"
229     ".inst 0x4508a3b5  // sshllb z21.h, z29.b, #0x0\n"
230     ".inst 0x4508a7b4  // sshllt z20.h, z29.b, #0x0\n"
231     "subs x20, x20, #0x1\n"
232     "ld1b { z27.b }, p2/Z, [x21, x24]\n"
233     ".inst 0x4508a373  // sshllb z19.h, z27.b, #0x0\n"
234     ".inst 0x4508a772  // sshllt z18.h, z27.b, #0x0\n"
235     "ld1b { z25.b }, p1/Z, [x21, x23]\n"
236     ".inst 0x4508a331  // sshllb z17.h, z25.b, #0x0\n"
237     ".inst 0x4508a730  // sshllt z16.h, z25.b, #0x0\n"
238     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
239     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
240     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
241     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
242     ".inst 0x4595416b  // saddwb z11.s, z11.s, z21.h\n"
243     ".inst 0x4595454a  // saddwt z10.s, z10.s, z21.h\n"
244     ".inst 0x45944129  // saddwb z9.s, z9.s, z20.h\n"
245     ".inst 0x45944508  // saddwt z8.s, z8.s, z20.h\n"
246     ".inst 0x459340e7  // saddwb z7.s, z7.s, z19.h\n"
247     ".inst 0x459344c6  // saddwt z6.s, z6.s, z19.h\n"
248     ".inst 0x459240a5  // saddwb z5.s, z5.s, z18.h\n"
249     ".inst 0x45924484  // saddwt z4.s, z4.s, z18.h\n"
250     ".inst 0x45914063  // saddwb z3.s, z3.s, z17.h\n"
251     ".inst 0x45914442  // saddwt z2.s, z2.s, z17.h\n"
252     ".inst 0x45904021  // saddwb z1.s, z1.s, z16.h\n"
253     ".inst 0x45904400  // saddwt z0.s, z0.s, z16.h\n"
254     "bgt 5b\n"
255     "6:"  // 4-vectors of channels: Single input loop: End
256     "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
257     ".inst 0x4482824f  // srshl z15.s, p0/M, z15.s, z18.s\n"
258     ".inst 0x4482824e  // srshl z14.s, p0/M, z14.s, z18.s\n"
259     ".inst 0x4482824d  // srshl z13.s, p0/M, z13.s, z18.s\n"
260     ".inst 0x4482824c  // srshl z12.s, p0/M, z12.s, z18.s\n"
261     "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
262     ".inst 0x4482824b  // srshl z11.s, p0/M, z11.s, z18.s\n"
263     ".inst 0x4482824a  // srshl z10.s, p0/M, z10.s, z18.s\n"
264     "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
265     ".inst 0x44828249  // srshl z9.s, p0/M, z9.s, z18.s\n"
266     ".inst 0x44828248  // srshl z8.s, p0/M, z8.s, z18.s\n"
267     ".inst 0x44828247  // srshl z7.s, p0/M, z7.s, z18.s\n"
268     ".inst 0x44828246  // srshl z6.s, p0/M, z6.s, z18.s\n"
269     ".inst 0x44828245  // srshl z5.s, p0/M, z5.s, z18.s\n"
270     ".inst 0x44828244  // srshl z4.s, p0/M, z4.s, z18.s\n"
271     ".inst 0x44828243  // srshl z3.s, p0/M, z3.s, z18.s\n"
272     ".inst 0x44828242  // srshl z2.s, p0/M, z2.s, z18.s\n"
273     ".inst 0x44828241  // srshl z1.s, p0/M, z1.s, z18.s\n"
274     ".inst 0x44828240  // srshl z0.s, p0/M, z0.s, z18.s\n"
275     ".inst 0x04b175ef  // sqrdmulh z15.s, z15.s, z17.s\n"
276     ".inst 0x04b175ce  // sqrdmulh z14.s, z14.s, z17.s\n"
277     ".inst 0x04b175ad  // sqrdmulh z13.s, z13.s, z17.s\n"
278     ".inst 0x04b1758c  // sqrdmulh z12.s, z12.s, z17.s\n"
279     ".inst 0x04b1756b  // sqrdmulh z11.s, z11.s, z17.s\n"
280     ".inst 0x04b1754a  // sqrdmulh z10.s, z10.s, z17.s\n"
281     ".inst 0x04b17529  // sqrdmulh z9.s, z9.s, z17.s\n"
282     ".inst 0x04b17508  // sqrdmulh z8.s, z8.s, z17.s\n"
283     ".inst 0x04b174e7  // sqrdmulh z7.s, z7.s, z17.s\n"
284     ".inst 0x04b174c6  // sqrdmulh z6.s, z6.s, z17.s\n"
285     ".inst 0x04b174a5  // sqrdmulh z5.s, z5.s, z17.s\n"
286     ".inst 0x04b17484  // sqrdmulh z4.s, z4.s, z17.s\n"
287     ".inst 0x04b17463  // sqrdmulh z3.s, z3.s, z17.s\n"
288     ".inst 0x04b17442  // sqrdmulh z2.s, z2.s, z17.s\n"
289     ".inst 0x04b17421  // sqrdmulh z1.s, z1.s, z17.s\n"
290     ".inst 0x04b17400  // sqrdmulh z0.s, z0.s, z17.s\n"
291     "mov z19.s, #0x7f\n"
292     ".inst 0x4482820f  // srshl z15.s, p0/M, z15.s, z16.s\n"
293     ".inst 0x4482820e  // srshl z14.s, p0/M, z14.s, z16.s\n"
294     ".inst 0x4482820d  // srshl z13.s, p0/M, z13.s, z16.s\n"
295     ".inst 0x4482820c  // srshl z12.s, p0/M, z12.s, z16.s\n"
296     ".inst 0x4482820b  // srshl z11.s, p0/M, z11.s, z16.s\n"
297     ".inst 0x4482820a  // srshl z10.s, p0/M, z10.s, z16.s\n"
298     ".inst 0x44828209  // srshl z9.s, p0/M, z9.s, z16.s\n"
299     ".inst 0x44828208  // srshl z8.s, p0/M, z8.s, z16.s\n"
300     ".inst 0x44828207  // srshl z7.s, p0/M, z7.s, z16.s\n"
301     ".inst 0x44828206  // srshl z6.s, p0/M, z6.s, z16.s\n"
302     ".inst 0x44828205  // srshl z5.s, p0/M, z5.s, z16.s\n"
303     ".inst 0x44828204  // srshl z4.s, p0/M, z4.s, z16.s\n"
304     ".inst 0x44828203  // srshl z3.s, p0/M, z3.s, z16.s\n"
305     ".inst 0x44828202  // srshl z2.s, p0/M, z2.s, z16.s\n"
306     ".inst 0x44828201  // srshl z1.s, p0/M, z1.s, z16.s\n"
307     ".inst 0x44828200  // srshl z0.s, p0/M, z0.s, z16.s\n"
308     "not z16.s, p0/M, z19.s\n"
309     "smax z15.s, p0/M, z15.s, z16.s\n"
310     "smax z14.s, p0/M, z14.s, z16.s\n"
311     "smax z13.s, p0/M, z13.s, z16.s\n"
312     "smax z12.s, p0/M, z12.s, z16.s\n"
313     "smax z11.s, p0/M, z11.s, z16.s\n"
314     "smax z10.s, p0/M, z10.s, z16.s\n"
315     "smax z9.s, p0/M, z9.s, z16.s\n"
316     "smax z8.s, p0/M, z8.s, z16.s\n"
317     "smax z7.s, p0/M, z7.s, z16.s\n"
318     "smax z6.s, p0/M, z6.s, z16.s\n"
319     "smax z5.s, p0/M, z5.s, z16.s\n"
320     "smax z4.s, p0/M, z4.s, z16.s\n"
321     "smax z3.s, p0/M, z3.s, z16.s\n"
322     "smax z2.s, p0/M, z2.s, z16.s\n"
323     "smax z1.s, p0/M, z1.s, z16.s\n"
324     "smax z0.s, p0/M, z0.s, z16.s\n"
325     "smin z15.s, p0/M, z15.s, z19.s\n"
326     "smin z14.s, p0/M, z14.s, z19.s\n"
327     "trn1 z23.h, z15.h, z14.h\n"
328     "smin z13.s, p0/M, z13.s, z19.s\n"
329     "smin z12.s, p0/M, z12.s, z19.s\n"
330     "trn1 z16.h, z13.h, z12.h\n"
331     "smin z11.s, p0/M, z11.s, z19.s\n"
332     "smin z10.s, p0/M, z10.s, z19.s\n"
333     "trn1 z22.h, z11.h, z10.h\n"
334     "smin z9.s, p0/M, z9.s, z19.s\n"
335     "smin z8.s, p0/M, z8.s, z19.s\n"
336     "trn1 z18.h, z9.h, z8.h\n"
337     "smin z7.s, p0/M, z7.s, z19.s\n"
338     "smin z6.s, p0/M, z6.s, z19.s\n"
339     "trn1 z21.h, z7.h, z6.h\n"
340     "smin z5.s, p0/M, z5.s, z19.s\n"
341     "smin z4.s, p0/M, z4.s, z19.s\n"
342     "trn1 z17.h, z5.h, z4.h\n"
343     "smin z3.s, p0/M, z3.s, z19.s\n"
344     "smin z2.s, p0/M, z2.s, z19.s\n"
345     "trn1 z20.h, z3.h, z2.h\n"
346     "smin z1.s, p0/M, z1.s, z19.s\n"
347     "smin z0.s, p0/M, z0.s, z19.s\n"
348     "trn1 z19.h, z1.h, z0.h\n"
349     "trn1 z16.b, z23.b, z16.b\n"
350     "trn1 z18.b, z22.b, z18.b\n"
351     "st1b { z16.b }, p4, [%x[outptr], x26]\n"
352     "incb x26, ALL, MUL #4\n"
353     "trn1 z17.b, z21.b, z17.b\n"
354     "trn1 z16.b, z20.b, z19.b\n"
355     "st1b { z18.b }, p3, [%x[outptr], x25]\n"
356     "incb x25, ALL, MUL #4\n"
357     "st1b { z17.b }, p2, [%x[outptr], x24]\n"
358     "incb x24, ALL, MUL #4\n"
359     "st1b { z16.b }, p1, [%x[outptr], x23]\n"
360     "incb x23, ALL, MUL #4\n"
361     "whilelt p1.b, x23, %x[n_channels]\n"
362     "b.any 1b\n"
363     "7:"  // Single vector of channels
364     "whilelt p4.b, x26, %x[n_channels]\n"
365     "b.none 14f\n"
366     "8:"  // Single vector of channels: Loop
367     "lsr x22, %x[n_valid_cells], #0x1\n"
368     "mov z15.s, #0x0\n"
369     "mov z14.s, #0x0\n"
370     "mov x19, %x[inptrs]\n"
371     "mov z13.s, #0x0\n"
372     "mov z12.s, #0x0\n"
373     "cbz x22, 11f\n"
374     "ldp x21, x20, [x19, #0x0]\n"
375     "subs x22, x22, #0x1\n"
376     "add x19, x19, #0x10\n"
377     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
378     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
379     "beq 10f\n"
380     "9:"  // Single vector of channels: Loop: 2 inputs loop
381     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
382     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
383     "ldp x21, x20, [x19, #0x0]\n"
384     "subs x22, x22, #0x1\n"
385     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
386     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
387     "add x19, x19, #0x10\n"
388     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
389     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
390     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
391     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
392     "bgt 9b\n"
393     "10:"  // Single vector of channels: Loop: 2 inputs tail
394     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
395     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
396     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
397     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
398     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
399     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
400     "11:"  // Single vector of channels: Loop: After loop
401     "ands x20, %x[n_valid_cells], #0x1\n"
402     "beq 13f\n"
403     "12:"  // Single vector of channels: Loop: Single input loop
404     "ldr x21, [x19], #0x8\n"
405     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
406     ".inst 0x4508a3f7  // sshllb z23.h, z31.b, #0x0\n"
407     ".inst 0x4508a7f6  // sshllt z22.h, z31.b, #0x0\n"
408     "subs x20, x20, #0x1\n"
409     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
410     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
411     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
412     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
413     "bgt 12b\n"
414     "13:"  // Single vector of channels: Loop: Single input loop: End
415     "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
416     ".inst 0x4482824f  // srshl z15.s, p0/M, z15.s, z18.s\n"
417     ".inst 0x4482824e  // srshl z14.s, p0/M, z14.s, z18.s\n"
418     ".inst 0x4482824d  // srshl z13.s, p0/M, z13.s, z18.s\n"
419     ".inst 0x4482824c  // srshl z12.s, p0/M, z12.s, z18.s\n"
420     "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
421     ".inst 0x04b175ef  // sqrdmulh z15.s, z15.s, z17.s\n"
422     ".inst 0x04b175ce  // sqrdmulh z14.s, z14.s, z17.s\n"
423     "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
424     ".inst 0x04b175ad  // sqrdmulh z13.s, z13.s, z17.s\n"
425     ".inst 0x04b1758c  // sqrdmulh z12.s, z12.s, z17.s\n"
426     "mov z19.s, #0x7f\n"
427     ".inst 0x4482820f  // srshl z15.s, p0/M, z15.s, z16.s\n"
428     ".inst 0x4482820e  // srshl z14.s, p0/M, z14.s, z16.s\n"
429     ".inst 0x4482820d  // srshl z13.s, p0/M, z13.s, z16.s\n"
430     ".inst 0x4482820c  // srshl z12.s, p0/M, z12.s, z16.s\n"
431     "not z16.s, p0/M, z19.s\n"
432     "smax z15.s, p0/M, z15.s, z16.s\n"
433     "smax z14.s, p0/M, z14.s, z16.s\n"
434     "smax z13.s, p0/M, z13.s, z16.s\n"
435     "smax z12.s, p0/M, z12.s, z16.s\n"
436     "smin z15.s, p0/M, z15.s, z19.s\n"
437     "smin z14.s, p0/M, z14.s, z19.s\n"
438     "trn1 z23.h, z15.h, z14.h\n"
439     "smin z13.s, p0/M, z13.s, z19.s\n"
440     "smin z12.s, p0/M, z12.s, z19.s\n"
441     "trn1 z16.h, z13.h, z12.h\n"
442     "trn1 z16.b, z23.b, z16.b\n"
443     "st1b { z16.b }, p4, [%x[outptr], x26]\n"
444     "incb x26\n"
445     "whilelt p4.b, x26, %x[n_channels]\n"
446     "b.any 8b\n"
447     "14:"  // End
448     ".inst 0xd503467f  // SMSTOP\n"
449     :
450     : [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [right_shift] "r" (&right_shift)
451     : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
452   );
453 }
454 
455 }  // namespace pooling
456 }  // namespace arm_conv
457 
458 #endif  // defined(ARM_COMPUTE_ENABLE_SME)
459