1 /*
2  * Copyright (c) 2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #if defined(ARM_COMPUTE_ENABLE_SME)
26 
27 #include <cstdint>
28 #include <cstring>
29 #include <cmath>
30 
31 
32 namespace arm_conv {
33 namespace pooling {
34 
namespace {
  // Fixed-point description of a reciprocal: the factor encoded by an entry is
  // (multiplier / 2^31) * 2^shift, so a negative shift scales the result down.
  struct RescaleParams
  {
    int32_t multiplier;
    int32_t shift;
  };

  // Precomputed reciprocals for divisors 2..9; entry [d - 2] encodes 1/d.
  constexpr RescaleParams rescale_params[8] = {
    /* 1/2 */ {0x40000000,  0},
    /* 1/3 */ {0x55555556, -1},
    /* 1/4 */ {0x40000000, -1},
    /* 1/5 */ {0x66666666, -2},
    /* 1/6 */ {0x55555556, -2},
    /* 1/7 */ {0x49249249, -2},
    /* 1/8 */ {0x40000000, -2},
    /* 1/9 */ {0x71c71c72, -3},
  };
}
52 
sme_s8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells,const uint64_t n_valid_cells,uint64_t n_channels,const int8_t * const * const inptrs,int8_t * outptr)53 void sme_s8_nhwc_avg_generic_depthfirst_impl(
54   const uint64_t window_cells,
55   const uint64_t n_valid_cells,
56   uint64_t n_channels,
57   const int8_t *const *const inptrs,
58   int8_t *outptr
59 )
60 {
61   if (n_valid_cells == 1 && window_cells == 1)
62   {
63     // In this case, simply copy from the input to the output
64     std::memcpy(outptr, *inptrs, n_channels);
65     return;
66   }
67 
68   // Compute (or look up) the rescale values
69   int32_t shift_value = 0, rescale_value = 0;
70   if (2 <= window_cells && window_cells <= 9)
71   {
72     auto &params = rescale_params[window_cells - 2];
73     rescale_value = params.multiplier;
74     shift_value = params.shift;
75   }
76   else
77   {
78     auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
79 
80     shift_value = 0;
81     while (f_rescale_value < 0.5f)
82     {
83       shift_value--;
84       f_rescale_value *= 2.0f;
85     }
86 
87     rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
88     if (static_cast<int64_t>(rescale_value) == (1ll << 31))
89     {
90       shift_value++;
91       rescale_value >>= 1;
92     }
93   }
94 
95   __asm__ __volatile__(
96     ".inst 0xd503477f  // SMSTART ZA\n"
97     "mov x26, #0x0\n"
98     "cntb x25\n"
99     "cntb x24, ALL, MUL #2\n"
100     "cntb x23, ALL, MUL #3\n"
101     "whilelt p4.b, x26, %x[n_channels]\n"
102     "whilelt p3.b, x25, %x[n_channels]\n"
103     "whilelt p2.b, x24, %x[n_channels]\n"
104     "whilelt p1.b, x23, %x[n_channels]\n"
105     "ptrue p0.b\n"
106     "b.none 7f\n"
107     "1:"  // 4-vectors of channels
108     "lsr x22, %x[n_valid_cells], #0x1\n"
109     "mov z15.s, #0x0\n"
110     "mov z14.s, #0x0\n"
111     "mov x19, %x[inptrs]\n"
112     "mov z13.s, #0x0\n"
113     "mov z12.s, #0x0\n"
114     "mov z11.s, #0x0\n"
115     "mov z10.s, #0x0\n"
116     "mov z9.s, #0x0\n"
117     "mov z8.s, #0x0\n"
118     "mov z7.s, #0x0\n"
119     "mov z6.s, #0x0\n"
120     "mov z5.s, #0x0\n"
121     "mov z4.s, #0x0\n"
122     "mov z3.s, #0x0\n"
123     "mov z2.s, #0x0\n"
124     "mov z1.s, #0x0\n"
125     "mov z0.s, #0x0\n"
126     "cbz x22, 4f\n"
127     "ldp x21, x20, [x19, #0x0]\n"
128     "subs x22, x22, #0x1\n"
129     "add x19, x19, #0x10\n"
130     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
131     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
132     "ld1b { z29.b }, p3/Z, [x21, x25]\n"
133     "ld1b { z28.b }, p3/Z, [x20, x25]\n"
134     "ld1b { z27.b }, p2/Z, [x21, x24]\n"
135     "ld1b { z26.b }, p2/Z, [x20, x24]\n"
136     "ld1b { z25.b }, p1/Z, [x21, x23]\n"
137     "ld1b { z24.b }, p1/Z, [x20, x23]\n"
138     "beq 3f\n"
139     "2:"  // 4-vectors of channels: 2 inputs loop
140     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
141     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
142     "ldp x21, x20, [x19, #0x0]\n"
143     "subs x22, x22, #0x1\n"
144     ".inst 0x455c03b5  // saddlb z21.h, z29.b, z28.b\n"
145     ".inst 0x455c07b4  // saddlt z20.h, z29.b, z28.b\n"
146     "add x19, x19, #0x10\n"
147     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
148     ".inst 0x455a0373  // saddlb z19.h, z27.b, z26.b\n"
149     ".inst 0x455a0772  // saddlt z18.h, z27.b, z26.b\n"
150     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
151     ".inst 0x45580331  // saddlb z17.h, z25.b, z24.b\n"
152     ".inst 0x45580730  // saddlt z16.h, z25.b, z24.b\n"
153     "ld1b { z29.b }, p3/Z, [x21, x25]\n"
154     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
155     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
156     "ld1b { z28.b }, p3/Z, [x20, x25]\n"
157     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
158     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
159     "ld1b { z27.b }, p2/Z, [x21, x24]\n"
160     ".inst 0x4595416b  // saddwb z11.s, z11.s, z21.h\n"
161     ".inst 0x4595454a  // saddwt z10.s, z10.s, z21.h\n"
162     "ld1b { z26.b }, p2/Z, [x20, x24]\n"
163     ".inst 0x45944129  // saddwb z9.s, z9.s, z20.h\n"
164     ".inst 0x45944508  // saddwt z8.s, z8.s, z20.h\n"
165     "ld1b { z25.b }, p1/Z, [x21, x23]\n"
166     ".inst 0x459340e7  // saddwb z7.s, z7.s, z19.h\n"
167     ".inst 0x459344c6  // saddwt z6.s, z6.s, z19.h\n"
168     "ld1b { z24.b }, p1/Z, [x20, x23]\n"
169     ".inst 0x459240a5  // saddwb z5.s, z5.s, z18.h\n"
170     ".inst 0x45924484  // saddwt z4.s, z4.s, z18.h\n"
171     ".inst 0x45914063  // saddwb z3.s, z3.s, z17.h\n"
172     ".inst 0x45914442  // saddwt z2.s, z2.s, z17.h\n"
173     ".inst 0x45904021  // saddwb z1.s, z1.s, z16.h\n"
174     ".inst 0x45904400  // saddwt z0.s, z0.s, z16.h\n"
175     "bgt 2b\n"
176     "3:"  // 4-vectors of channels: 2 inputs tail
177     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
178     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
179     ".inst 0x455c03b5  // saddlb z21.h, z29.b, z28.b\n"
180     ".inst 0x455c07b4  // saddlt z20.h, z29.b, z28.b\n"
181     ".inst 0x455a0373  // saddlb z19.h, z27.b, z26.b\n"
182     ".inst 0x455a0772  // saddlt z18.h, z27.b, z26.b\n"
183     ".inst 0x45580331  // saddlb z17.h, z25.b, z24.b\n"
184     ".inst 0x45580730  // saddlt z16.h, z25.b, z24.b\n"
185     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
186     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
187     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
188     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
189     ".inst 0x4595416b  // saddwb z11.s, z11.s, z21.h\n"
190     ".inst 0x4595454a  // saddwt z10.s, z10.s, z21.h\n"
191     ".inst 0x45944129  // saddwb z9.s, z9.s, z20.h\n"
192     ".inst 0x45944508  // saddwt z8.s, z8.s, z20.h\n"
193     ".inst 0x459340e7  // saddwb z7.s, z7.s, z19.h\n"
194     ".inst 0x459344c6  // saddwt z6.s, z6.s, z19.h\n"
195     ".inst 0x459240a5  // saddwb z5.s, z5.s, z18.h\n"
196     ".inst 0x45924484  // saddwt z4.s, z4.s, z18.h\n"
197     ".inst 0x45914063  // saddwb z3.s, z3.s, z17.h\n"
198     ".inst 0x45914442  // saddwt z2.s, z2.s, z17.h\n"
199     ".inst 0x45904021  // saddwb z1.s, z1.s, z16.h\n"
200     ".inst 0x45904400  // saddwt z0.s, z0.s, z16.h\n"
201     "4:"  // 4-vectors of channels: After loop
202     "ands x20, %x[n_valid_cells], #0x1\n"
203     "beq 6f\n"
204     "5:"  // 4-vectors of channels: Single input loop
205     "ldr x21, [x19], #0x8\n"
206     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
207     ".inst 0x4508a3f7  // sshllb z23.h, z31.b, #0x0\n"
208     ".inst 0x4508a7f6  // sshllt z22.h, z31.b, #0x0\n"
209     "ld1b { z29.b }, p3/Z, [x21, x25]\n"
210     ".inst 0x4508a3b5  // sshllb z21.h, z29.b, #0x0\n"
211     ".inst 0x4508a7b4  // sshllt z20.h, z29.b, #0x0\n"
212     "subs x20, x20, #0x1\n"
213     "ld1b { z27.b }, p2/Z, [x21, x24]\n"
214     ".inst 0x4508a373  // sshllb z19.h, z27.b, #0x0\n"
215     ".inst 0x4508a772  // sshllt z18.h, z27.b, #0x0\n"
216     "ld1b { z25.b }, p1/Z, [x21, x23]\n"
217     ".inst 0x4508a331  // sshllb z17.h, z25.b, #0x0\n"
218     ".inst 0x4508a730  // sshllt z16.h, z25.b, #0x0\n"
219     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
220     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
221     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
222     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
223     ".inst 0x4595416b  // saddwb z11.s, z11.s, z21.h\n"
224     ".inst 0x4595454a  // saddwt z10.s, z10.s, z21.h\n"
225     ".inst 0x45944129  // saddwb z9.s, z9.s, z20.h\n"
226     ".inst 0x45944508  // saddwt z8.s, z8.s, z20.h\n"
227     ".inst 0x459340e7  // saddwb z7.s, z7.s, z19.h\n"
228     ".inst 0x459344c6  // saddwt z6.s, z6.s, z19.h\n"
229     ".inst 0x459240a5  // saddwb z5.s, z5.s, z18.h\n"
230     ".inst 0x45924484  // saddwt z4.s, z4.s, z18.h\n"
231     ".inst 0x45914063  // saddwb z3.s, z3.s, z17.h\n"
232     ".inst 0x45914442  // saddwt z2.s, z2.s, z17.h\n"
233     ".inst 0x45904021  // saddwb z1.s, z1.s, z16.h\n"
234     ".inst 0x45904400  // saddwt z0.s, z0.s, z16.h\n"
235     "bgt 5b\n"
236     "6:"  // 4-vectors of channels: Single input loop: End
237     "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
238     ".inst 0x04b175ef  // sqdmulh z15.s, z15.s, z17.s\n"
239     ".inst 0x04b175ce  // sqdmulh z14.s, z14.s, z17.s\n"
240     ".inst 0x04b175ad  // sqdmulh z13.s, z13.s, z17.s\n"
241     ".inst 0x04b1758c  // sqdmulh z12.s, z12.s, z17.s\n"
242     "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
243     ".inst 0x04b1756b  // sqdmulh z11.s, z11.s, z17.s\n"
244     ".inst 0x04b1754a  // sqdmulh z10.s, z10.s, z17.s\n"
245     ".inst 0x04b17529  // sqdmulh z9.s, z9.s, z17.s\n"
246     ".inst 0x04b17508  // sqdmulh z8.s, z8.s, z17.s\n"
247     ".inst 0x04b174e7  // sqdmulh z7.s, z7.s, z17.s\n"
248     ".inst 0x04b174c6  // sqdmulh z6.s, z6.s, z17.s\n"
249     ".inst 0x04b174a5  // sqdmulh z5.s, z5.s, z17.s\n"
250     ".inst 0x04b17484  // sqdmulh z4.s, z4.s, z17.s\n"
251     ".inst 0x04b17463  // sqdmulh z3.s, z3.s, z17.s\n"
252     ".inst 0x04b17442  // sqdmulh z2.s, z2.s, z17.s\n"
253     ".inst 0x04b17421  // sqdmulh z1.s, z1.s, z17.s\n"
254     ".inst 0x04b17400  // sqdmulh z0.s, z0.s, z17.s\n"
255     "mov z19.s, #0x7f\n"
256     ".inst 0x4482820f  // srshl z15.s, p0/M, z15.s, z16.s\n"
257     ".inst 0x4482820e  // srshl z14.s, p0/M, z14.s, z16.s\n"
258     ".inst 0x4482820d  // srshl z13.s, p0/M, z13.s, z16.s\n"
259     ".inst 0x4482820c  // srshl z12.s, p0/M, z12.s, z16.s\n"
260     ".inst 0x4482820b  // srshl z11.s, p0/M, z11.s, z16.s\n"
261     ".inst 0x4482820a  // srshl z10.s, p0/M, z10.s, z16.s\n"
262     ".inst 0x44828209  // srshl z9.s, p0/M, z9.s, z16.s\n"
263     ".inst 0x44828208  // srshl z8.s, p0/M, z8.s, z16.s\n"
264     ".inst 0x44828207  // srshl z7.s, p0/M, z7.s, z16.s\n"
265     ".inst 0x44828206  // srshl z6.s, p0/M, z6.s, z16.s\n"
266     ".inst 0x44828205  // srshl z5.s, p0/M, z5.s, z16.s\n"
267     ".inst 0x44828204  // srshl z4.s, p0/M, z4.s, z16.s\n"
268     ".inst 0x44828203  // srshl z3.s, p0/M, z3.s, z16.s\n"
269     ".inst 0x44828202  // srshl z2.s, p0/M, z2.s, z16.s\n"
270     ".inst 0x44828201  // srshl z1.s, p0/M, z1.s, z16.s\n"
271     ".inst 0x44828200  // srshl z0.s, p0/M, z0.s, z16.s\n"
272     "not z16.s, p0/M, z19.s\n"
273     "smax z15.s, p0/M, z15.s, z16.s\n"
274     "smax z14.s, p0/M, z14.s, z16.s\n"
275     "smax z13.s, p0/M, z13.s, z16.s\n"
276     "smax z12.s, p0/M, z12.s, z16.s\n"
277     "smax z11.s, p0/M, z11.s, z16.s\n"
278     "smax z10.s, p0/M, z10.s, z16.s\n"
279     "smax z9.s, p0/M, z9.s, z16.s\n"
280     "smax z8.s, p0/M, z8.s, z16.s\n"
281     "smax z7.s, p0/M, z7.s, z16.s\n"
282     "smax z6.s, p0/M, z6.s, z16.s\n"
283     "smax z5.s, p0/M, z5.s, z16.s\n"
284     "smax z4.s, p0/M, z4.s, z16.s\n"
285     "smax z3.s, p0/M, z3.s, z16.s\n"
286     "smax z2.s, p0/M, z2.s, z16.s\n"
287     "smax z1.s, p0/M, z1.s, z16.s\n"
288     "smax z0.s, p0/M, z0.s, z16.s\n"
289     "smin z15.s, p0/M, z15.s, z19.s\n"
290     "smin z14.s, p0/M, z14.s, z19.s\n"
291     "trn1 z23.h, z15.h, z14.h\n"
292     "smin z13.s, p0/M, z13.s, z19.s\n"
293     "smin z12.s, p0/M, z12.s, z19.s\n"
294     "trn1 z16.h, z13.h, z12.h\n"
295     "smin z11.s, p0/M, z11.s, z19.s\n"
296     "smin z10.s, p0/M, z10.s, z19.s\n"
297     "trn1 z22.h, z11.h, z10.h\n"
298     "smin z9.s, p0/M, z9.s, z19.s\n"
299     "smin z8.s, p0/M, z8.s, z19.s\n"
300     "trn1 z18.h, z9.h, z8.h\n"
301     "smin z7.s, p0/M, z7.s, z19.s\n"
302     "smin z6.s, p0/M, z6.s, z19.s\n"
303     "trn1 z21.h, z7.h, z6.h\n"
304     "smin z5.s, p0/M, z5.s, z19.s\n"
305     "smin z4.s, p0/M, z4.s, z19.s\n"
306     "trn1 z17.h, z5.h, z4.h\n"
307     "smin z3.s, p0/M, z3.s, z19.s\n"
308     "smin z2.s, p0/M, z2.s, z19.s\n"
309     "trn1 z20.h, z3.h, z2.h\n"
310     "smin z1.s, p0/M, z1.s, z19.s\n"
311     "smin z0.s, p0/M, z0.s, z19.s\n"
312     "trn1 z19.h, z1.h, z0.h\n"
313     "trn1 z16.b, z23.b, z16.b\n"
314     "trn1 z18.b, z22.b, z18.b\n"
315     "st1b { z16.b }, p4, [%x[outptr], x26]\n"
316     "incb x26, ALL, MUL #4\n"
317     "trn1 z17.b, z21.b, z17.b\n"
318     "trn1 z16.b, z20.b, z19.b\n"
319     "st1b { z18.b }, p3, [%x[outptr], x25]\n"
320     "incb x25, ALL, MUL #4\n"
321     "st1b { z17.b }, p2, [%x[outptr], x24]\n"
322     "incb x24, ALL, MUL #4\n"
323     "st1b { z16.b }, p1, [%x[outptr], x23]\n"
324     "incb x23, ALL, MUL #4\n"
325     "whilelt p1.b, x23, %x[n_channels]\n"
326     "b.any 1b\n"
327     "7:"  // Single vector of channels
328     "whilelt p4.b, x26, %x[n_channels]\n"
329     "b.none 14f\n"
330     "8:"  // Single vector of channels: Loop
331     "lsr x22, %x[n_valid_cells], #0x1\n"
332     "mov z15.s, #0x0\n"
333     "mov z14.s, #0x0\n"
334     "mov x19, %x[inptrs]\n"
335     "mov z13.s, #0x0\n"
336     "mov z12.s, #0x0\n"
337     "cbz x22, 11f\n"
338     "ldp x21, x20, [x19, #0x0]\n"
339     "subs x22, x22, #0x1\n"
340     "add x19, x19, #0x10\n"
341     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
342     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
343     "beq 10f\n"
344     "9:"  // Single vector of channels: Loop: 2 inputs loop
345     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
346     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
347     "ldp x21, x20, [x19, #0x0]\n"
348     "subs x22, x22, #0x1\n"
349     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
350     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
351     "add x19, x19, #0x10\n"
352     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
353     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
354     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
355     "ld1b { z30.b }, p4/Z, [x20, x26]\n"
356     "bgt 9b\n"
357     "10:"  // Single vector of channels: Loop: 2 inputs tail
358     ".inst 0x455e03f7  // saddlb z23.h, z31.b, z30.b\n"
359     ".inst 0x455e07f6  // saddlt z22.h, z31.b, z30.b\n"
360     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
361     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
362     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
363     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
364     "11:"  // Single vector of channels: Loop: After loop
365     "ands x20, %x[n_valid_cells], #0x1\n"
366     "beq 13f\n"
367     "12:"  // Single vector of channels: Loop: Single input loop
368     "ldr x21, [x19], #0x8\n"
369     "ld1b { z31.b }, p4/Z, [x21, x26]\n"
370     ".inst 0x4508a3f7  // sshllb z23.h, z31.b, #0x0\n"
371     ".inst 0x4508a7f6  // sshllt z22.h, z31.b, #0x0\n"
372     "subs x20, x20, #0x1\n"
373     ".inst 0x459741ef  // saddwb z15.s, z15.s, z23.h\n"
374     ".inst 0x459745ce  // saddwt z14.s, z14.s, z23.h\n"
375     ".inst 0x459641ad  // saddwb z13.s, z13.s, z22.h\n"
376     ".inst 0x4596458c  // saddwt z12.s, z12.s, z22.h\n"
377     "bgt 12b\n"
378     "13:"  // Single vector of channels: Loop: Single input loop: End
379     "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
380     ".inst 0x04b175ef  // sqdmulh z15.s, z15.s, z17.s\n"
381     ".inst 0x04b175ce  // sqdmulh z14.s, z14.s, z17.s\n"
382     ".inst 0x04b175ad  // sqdmulh z13.s, z13.s, z17.s\n"
383     ".inst 0x04b1758c  // sqdmulh z12.s, z12.s, z17.s\n"
384     "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
385     "mov z19.s, #0x7f\n"
386     ".inst 0x4482820f  // srshl z15.s, p0/M, z15.s, z16.s\n"
387     ".inst 0x4482820e  // srshl z14.s, p0/M, z14.s, z16.s\n"
388     ".inst 0x4482820d  // srshl z13.s, p0/M, z13.s, z16.s\n"
389     ".inst 0x4482820c  // srshl z12.s, p0/M, z12.s, z16.s\n"
390     "not z16.s, p0/M, z19.s\n"
391     "smax z15.s, p0/M, z15.s, z16.s\n"
392     "smax z14.s, p0/M, z14.s, z16.s\n"
393     "smax z13.s, p0/M, z13.s, z16.s\n"
394     "smax z12.s, p0/M, z12.s, z16.s\n"
395     "smin z15.s, p0/M, z15.s, z19.s\n"
396     "smin z14.s, p0/M, z14.s, z19.s\n"
397     "trn1 z23.h, z15.h, z14.h\n"
398     "smin z13.s, p0/M, z13.s, z19.s\n"
399     "smin z12.s, p0/M, z12.s, z19.s\n"
400     "trn1 z16.h, z13.h, z12.h\n"
401     "trn1 z16.b, z23.b, z16.b\n"
402     "st1b { z16.b }, p4, [%x[outptr], x26]\n"
403     "incb x26\n"
404     "whilelt p4.b, x26, %x[n_channels]\n"
405     "b.any 8b\n"
406     "14:"  // End
407     ".inst 0xd503467f  // SMSTOP\n"
408     :
409     : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
410     : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
411   );
412 }
413 
414 }  // namespace pooling
415 }  // namespace arm_conv
416 
417 #endif  // defined(ARM_COMPUTE_ENABLE_SME)
418