1 /*
2 * Copyright (c) 2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #if defined(ARM_COMPUTE_ENABLE_SME)
26
27 #include "src/core/NEON/kernels/assembly/pooling.hpp"
28 #include <cstdint>
29
30 namespace arm_conv {
31 namespace pooling {
32
33
// Max-pooling kernel for signed 8-bit data with per-layer requantisation,
// written as a single inline-assembly block.
//
// For every channel c in [0, n_channels) the kernel takes the signed maximum
// of inptrs[i][c] over i in [0, n_valid_cells), requantises that maximum
// (shift by per_layer_left_shift, saturating rounding doubling multiply-high
// by per_layer_mul, shift by per_layer_right_shift), clamps the result to
// [-128, 127] and stores it as one byte at outptr[c].
//
// Parameters:
//   (unnamed uint64_t)  not referenced by the body.
//   n_valid_cells       number of pointers consumed from inptrs.
//   n_channels          number of int8 channels read per input and written to
//                       outptr; all loads/stores are predicated against it.
//   inptrs              array of input row pointers; n_valid_cells entries are
//                       read.
//   outptr              destination; n_channels bytes are written.
//   qp                  requantisation parameters; only the three per_layer_*
//                       fields named in the operand list are read, each as a
//                       32-bit word.
//
// Register roles inside the asm:
//   x28/x27/x26/x25  byte offsets of the four channel vectors being processed
//                    (0, 1, 2 and 3 vector lengths initially).
//   p4/p3/p2/p1      "channel < n_channels" predicates for those offsets.
//   p0               all-true predicate used for the arithmetic.
//   x19              cursor walking the inptrs array.
//   x24              remaining groups of four inputs; x20 leftover inputs.
//   z4/z3/z2/z1      running maxima for the four channel vectors.
//
// The accumulators start at 0x80 (-128, the identity for signed int8 max), so
// with n_valid_cells == 0 the requantised value of -128 is what gets stored.
//
// NOTE(review): the in-string annotation on the first .inst reads "SMSTART ZA",
// but 0xd503477f appears to be the combined SM+ZA form of SMSTART (matched by
// the combined SMSTOP 0xd503467f at the end) - confirm against the SME spec.
sme_s8q_nhwc_max_generic_depthfirst_impl(const uint64_t,const uint64_t n_valid_cells,uint64_t n_channels,const int8_t * const * const inptrs,int8_t * outptr,const Requantize32 & qp)34 void sme_s8q_nhwc_max_generic_depthfirst_impl(
35 const uint64_t,
36 const uint64_t n_valid_cells,
37 uint64_t n_channels,
38 const int8_t *const *const inptrs,
39 int8_t *outptr,
40 const Requantize32 &qp
41 )
42 {
43 __asm__ __volatile__(
44 ".inst 0xd503477f // SMSTART ZA\n"
// Set up the four channel offsets (multiples of the vector length in bytes)
// and one predicate per offset.
45 "mov x28, #0x0\n"
46 "cntb x27\n"
47 "cntb x26, ALL, MUL #2\n"
48 "cntb x25, ALL, MUL #3\n"
49 "whilelt p4.b, x28, %x[n_channels]\n"
50 "whilelt p3.b, x27, %x[n_channels]\n"
51 "whilelt p2.b, x26, %x[n_channels]\n"
52 "whilelt p1.b, x25, %x[n_channels]\n"
53 "ptrue p0.b\n"
// Flags here still come from the whilelt into p1: if the fourth vector has no
// active channel, skip the 4-vector path and go to the single-vector path.
54 "b.none 7f\n"
55 "1:" // 4-vectors of channels
// x24 = number of complete groups of four inputs; reset the four running
// maxima to -128 and rewind the input-pointer cursor.
56 "lsr x24, %x[n_valid_cells], #0x2\n"
57 "mov z4.b, #0x80\n"
58 "mov z3.b, #0x80\n"
59 "mov x19, %x[inptrs]\n"
60 "mov z2.b, #0x80\n"
61 "mov z1.b, #0x80\n"
62 "cbz x24, 4f\n"
// Prologue: load the first group of four inputs (4 inputs x 4 channel vectors)
// so the loop below can overlap the next group's loads with the reduction.
63 "ldp x23, x22, [x19, #0x0]\n"
64 "subs x24, x24, #0x1\n"
65 "ld1b { z0.b }, p4/Z, [x23, x28]\n"
66 "ldp x21, x20, [x19, #0x10]\n"
67 "add x19, x19, #0x20\n"
68 "ld1b { z31.b }, p4/Z, [x22, x28]\n"
69 "ld1b { z23.b }, p4/Z, [x21, x28]\n"
70 "ld1b { z30.b }, p4/Z, [x20, x28]\n"
71 "ld1b { z18.b }, p3/Z, [x23, x27]\n"
72 "ld1b { z29.b }, p3/Z, [x22, x27]\n"
73 "ld1b { z22.b }, p3/Z, [x21, x27]\n"
74 "ld1b { z28.b }, p3/Z, [x20, x27]\n"
75 "ld1b { z17.b }, p2/Z, [x23, x26]\n"
76 "ld1b { z27.b }, p2/Z, [x22, x26]\n"
77 "ld1b { z21.b }, p2/Z, [x21, x26]\n"
78 "ld1b { z26.b }, p2/Z, [x20, x26]\n"
79 "ld1b { z16.b }, p1/Z, [x23, x25]\n"
80 "ld1b { z25.b }, p1/Z, [x22, x25]\n"
81 "ld1b { z20.b }, p1/Z, [x21, x25]\n"
82 "ld1b { z24.b }, p1/Z, [x20, x25]\n"
// Only one group of four: go straight to the tail reduction.
83 "beq 3f\n"
84 "2:" // 4-vectors of channels: 4 inputs loop
// Pairwise max of the four loaded inputs, then fold into the running maxima,
// interleaved with the loads for the next group of four inputs.
85 "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
86 "smax z23.b, p0/M, z23.b, z30.b\n"
87 "ldp x23, x22, [x19, #0x0]\n"
88 "subs x24, x24, #0x1\n"
89 "smax z18.b, p0/M, z18.b, z29.b\n"
90 "smax z22.b, p0/M, z22.b, z28.b\n"
91 "ldp x21, x20, [x19, #0x10]\n"
92 "add x19, x19, #0x20\n"
93 "smax z17.b, p0/M, z17.b, z27.b\n"
94 "smax z21.b, p0/M, z21.b, z26.b\n"
95 "ld1b { z0.b }, p4/Z, [x23, x28]\n"
96 "smax z16.b, p0/M, z16.b, z25.b\n"
97 "smax z20.b, p0/M, z20.b, z24.b\n"
98 "ld1b { z31.b }, p4/Z, [x22, x28]\n"
99 "smax z19.b, p0/M, z19.b, z23.b\n"
100 "smax z18.b, p0/M, z18.b, z22.b\n"
101 "ld1b { z23.b }, p4/Z, [x21, x28]\n"
102 "smax z17.b, p0/M, z17.b, z21.b\n"
103 "smax z16.b, p0/M, z16.b, z20.b\n"
104 "ld1b { z30.b }, p4/Z, [x20, x28]\n"
105 "smax z4.b, p0/M, z4.b, z19.b\n"
106 "smax z3.b, p0/M, z3.b, z18.b\n"
107 "ld1b { z18.b }, p3/Z, [x23, x27]\n"
108 "smax z2.b, p0/M, z2.b, z17.b\n"
109 "smax z1.b, p0/M, z1.b, z16.b\n"
110 "ld1b { z29.b }, p3/Z, [x22, x27]\n"
111 "ld1b { z22.b }, p3/Z, [x21, x27]\n"
112 "ld1b { z28.b }, p3/Z, [x20, x27]\n"
113 "ld1b { z17.b }, p2/Z, [x23, x26]\n"
114 "ld1b { z27.b }, p2/Z, [x22, x26]\n"
115 "ld1b { z21.b }, p2/Z, [x21, x26]\n"
116 "ld1b { z26.b }, p2/Z, [x20, x26]\n"
117 "ld1b { z16.b }, p1/Z, [x23, x25]\n"
118 "ld1b { z25.b }, p1/Z, [x22, x25]\n"
119 "ld1b { z20.b }, p1/Z, [x21, x25]\n"
120 "ld1b { z24.b }, p1/Z, [x20, x25]\n"
// Flags are from the subs on x24 above (the loads do not touch them).
121 "bgt 2b\n"
122 "3:" // 4-vectors of channels: 4 inputs tail
// Reduce the last loaded group of four; no further loads are issued.
123 "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
124 "smax z23.b, p0/M, z23.b, z30.b\n"
125 "smax z18.b, p0/M, z18.b, z29.b\n"
126 "smax z22.b, p0/M, z22.b, z28.b\n"
127 "smax z17.b, p0/M, z17.b, z27.b\n"
128 "smax z21.b, p0/M, z21.b, z26.b\n"
129 "smax z16.b, p0/M, z16.b, z25.b\n"
130 "smax z20.b, p0/M, z20.b, z24.b\n"
131 "smax z19.b, p0/M, z19.b, z23.b\n"
132 "smax z18.b, p0/M, z18.b, z22.b\n"
133 "smax z17.b, p0/M, z17.b, z21.b\n"
134 "smax z16.b, p0/M, z16.b, z20.b\n"
135 "smax z4.b, p0/M, z4.b, z19.b\n"
136 "smax z3.b, p0/M, z3.b, z18.b\n"
137 "smax z2.b, p0/M, z2.b, z17.b\n"
138 "smax z1.b, p0/M, z1.b, z16.b\n"
139 "4:" // 4-vectors of channels: After loop
// x20 = n_valid_cells % 4: inputs left over after the groups of four.
140 "ands x20, %x[n_valid_cells], #0x3\n"
141 "beq 6f\n"
142 "5:" // 4-vectors of channels: Single input loop
// One input per iteration; x19 is post-incremented to the next pointer.
143 "ldr x23, [x19], #0x8\n"
144 "ld1b { z0.b }, p4/Z, [x23, x28]\n"
145 "subs x20, x20, #0x1\n"
146 "smax z4.b, p0/M, z4.b, z0.b\n"
147 "ld1b { z18.b }, p3/Z, [x23, x27]\n"
148 "smax z3.b, p0/M, z3.b, z18.b\n"
149 "ld1b { z17.b }, p2/Z, [x23, x26]\n"
150 "smax z2.b, p0/M, z2.b, z17.b\n"
151 "ld1b { z16.b }, p1/Z, [x23, x25]\n"
152 "smax z1.b, p0/M, z1.b, z16.b\n"
153 "bgt 5b\n"
154 "6:" // 4-vectors of channels: Single input loop: End
// Requantise. First widen each int8 maximum to 16 bits (bottom/top = even/odd
// elements), and broadcast the three per-layer parameters from qp into
// z4 (left shift), z3 (multiplier) and z2 (right shift). z4..z1 are free to be
// reused once their byte contents have been widened.
155 ".inst 0x4508a097 // sshllb z23.h, z4.b, #0x0\n"
156 ".inst 0x4508a496 // sshllt z22.h, z4.b, #0x0\n"
157 "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
158 "ld1rw { z4.s }, p0/Z, [x19]\n"
159 ".inst 0x4508a075 // sshllb z21.h, z3.b, #0x0\n"
160 ".inst 0x4508a472 // sshllt z18.h, z3.b, #0x0\n"
161 "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
162 "ld1rw { z3.s }, p0/Z, [x19]\n"
163 ".inst 0x4508a054 // sshllb z20.h, z2.b, #0x0\n"
164 ".inst 0x4508a451 // sshllt z17.h, z2.b, #0x0\n"
165 "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
166 "ld1rw { z2.s }, p0/Z, [x19]\n"
167 ".inst 0x4508a033 // sshllb z19.h, z1.b, #0x0\n"
168 ".inst 0x4508a430 // sshllt z16.h, z1.b, #0x0\n"
// Widen again from 16 to 32 bits: 8 halfword vectors become 16 word vectors.
169 ".inst 0x4510a2e1 // sshllb z1.s, z23.h, #0x0\n"
170 ".inst 0x4510a6f7 // sshllt z23.s, z23.h, #0x0\n"
171 ".inst 0x4510a2c0 // sshllb z0.s, z22.h, #0x0\n"
172 ".inst 0x4510a6df // sshllt z31.s, z22.h, #0x0\n"
173 ".inst 0x4510a2be // sshllb z30.s, z21.h, #0x0\n"
174 ".inst 0x4510a6b6 // sshllt z22.s, z21.h, #0x0\n"
175 ".inst 0x4510a25d // sshllb z29.s, z18.h, #0x0\n"
176 ".inst 0x4510a652 // sshllt z18.s, z18.h, #0x0\n"
177 ".inst 0x4510a29c // sshllb z28.s, z20.h, #0x0\n"
178 ".inst 0x4510a695 // sshllt z21.s, z20.h, #0x0\n"
179 ".inst 0x4510a23b // sshllb z27.s, z17.h, #0x0\n"
180 ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
181 ".inst 0x4510a27a // sshllb z26.s, z19.h, #0x0\n"
182 ".inst 0x4510a674 // sshllt z20.s, z19.h, #0x0\n"
183 ".inst 0x4510a219 // sshllb z25.s, z16.h, #0x0\n"
184 ".inst 0x4510a618 // sshllt z24.s, z16.h, #0x0\n"
// Step 1: rounding shift of every word by per_layer_left_shift (z4).
185 ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
186 ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
187 ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
188 ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
189 ".inst 0x4482809e // srshl z30.s, p0/M, z30.s, z4.s\n"
190 ".inst 0x44828096 // srshl z22.s, p0/M, z22.s, z4.s\n"
191 ".inst 0x4482809d // srshl z29.s, p0/M, z29.s, z4.s\n"
192 ".inst 0x44828092 // srshl z18.s, p0/M, z18.s, z4.s\n"
193 ".inst 0x4482809c // srshl z28.s, p0/M, z28.s, z4.s\n"
194 ".inst 0x44828095 // srshl z21.s, p0/M, z21.s, z4.s\n"
195 ".inst 0x4482809b // srshl z27.s, p0/M, z27.s, z4.s\n"
196 ".inst 0x44828091 // srshl z17.s, p0/M, z17.s, z4.s\n"
197 ".inst 0x4482809a // srshl z26.s, p0/M, z26.s, z4.s\n"
198 ".inst 0x44828094 // srshl z20.s, p0/M, z20.s, z4.s\n"
199 ".inst 0x44828099 // srshl z25.s, p0/M, z25.s, z4.s\n"
200 ".inst 0x44828098 // srshl z24.s, p0/M, z24.s, z4.s\n"
// Step 2: saturating rounding doubling multiply-high by per_layer_mul (z3).
201 ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
202 ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
203 ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
204 ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
205 ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
206 ".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
207 ".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
208 ".inst 0x04a37652 // sqrdmulh z18.s, z18.s, z3.s\n"
209 ".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
210 ".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
211 ".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
212 ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
213 ".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
214 ".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
215 ".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
216 ".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
// z19 = 127, the upper clamp bound (the lower bound is derived from it below).
217 "mov z19.s, #0x7f\n"
// Step 3: rounding shift by per_layer_right_shift (z2). srshl shifts left for
// positive amounts, so presumably this field holds a non-positive value -
// TODO confirm against how Requantize32 is populated.
218 ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
219 ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
220 ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
221 ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
222 ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
223 ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
224 ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
225 ".inst 0x44828052 // srshl z18.s, p0/M, z18.s, z2.s\n"
226 ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
227 ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
228 ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
229 ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
230 ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
231 ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
232 ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
233 ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
// z16 = ~127 = -128, the lower clamp bound. Clamp below first, then above.
234 "not z16.s, p0/M, z19.s\n"
235 "smax z1.s, p0/M, z1.s, z16.s\n"
236 "smax z23.s, p0/M, z23.s, z16.s\n"
237 "smax z0.s, p0/M, z0.s, z16.s\n"
238 "smax z31.s, p0/M, z31.s, z16.s\n"
239 "smax z30.s, p0/M, z30.s, z16.s\n"
240 "smax z22.s, p0/M, z22.s, z16.s\n"
241 "smax z29.s, p0/M, z29.s, z16.s\n"
242 "smax z18.s, p0/M, z18.s, z16.s\n"
243 "smax z28.s, p0/M, z28.s, z16.s\n"
244 "smax z21.s, p0/M, z21.s, z16.s\n"
245 "smax z27.s, p0/M, z27.s, z16.s\n"
246 "smax z17.s, p0/M, z17.s, z16.s\n"
247 "smax z26.s, p0/M, z26.s, z16.s\n"
248 "smax z20.s, p0/M, z20.s, z16.s\n"
249 "smax z25.s, p0/M, z25.s, z16.s\n"
250 "smax z24.s, p0/M, z24.s, z16.s\n"
// Upper clamp to 127, and repack: trn1 interleaves the even-numbered elements
// of its two sources, which undoes the bottom/top split made when widening
// (words -> halfwords here, halfwords -> bytes further down).
251 "smin z1.s, p0/M, z1.s, z19.s\n"
252 "smin z23.s, p0/M, z23.s, z19.s\n"
253 "trn1 z23.h, z1.h, z23.h\n"
254 "smin z0.s, p0/M, z0.s, z19.s\n"
255 "smin z31.s, p0/M, z31.s, z19.s\n"
256 "trn1 z16.h, z0.h, z31.h\n"
257 "smin z30.s, p0/M, z30.s, z19.s\n"
258 "smin z22.s, p0/M, z22.s, z19.s\n"
259 "trn1 z22.h, z30.h, z22.h\n"
260 "smin z29.s, p0/M, z29.s, z19.s\n"
261 "smin z18.s, p0/M, z18.s, z19.s\n"
262 "trn1 z18.h, z29.h, z18.h\n"
263 "smin z28.s, p0/M, z28.s, z19.s\n"
264 "smin z21.s, p0/M, z21.s, z19.s\n"
265 "trn1 z21.h, z28.h, z21.h\n"
266 "smin z27.s, p0/M, z27.s, z19.s\n"
267 "smin z17.s, p0/M, z17.s, z19.s\n"
268 "trn1 z17.h, z27.h, z17.h\n"
269 "smin z26.s, p0/M, z26.s, z19.s\n"
270 "smin z20.s, p0/M, z20.s, z19.s\n"
271 "trn1 z20.h, z26.h, z20.h\n"
272 "smin z25.s, p0/M, z25.s, z19.s\n"
273 "smin z24.s, p0/M, z24.s, z19.s\n"
// z19 (the 127 bound) is overwritten here; it is no longer needed.
274 "trn1 z19.h, z25.h, z24.h\n"
// Final halfword -> byte repack, store the four result vectors and advance
// each channel offset by four vector lengths.
275 "trn1 z16.b, z23.b, z16.b\n"
276 "trn1 z18.b, z22.b, z18.b\n"
277 "st1b { z16.b }, p4, [%x[outptr], x28]\n"
278 "incb x28, ALL, MUL #4\n"
279 "trn1 z17.b, z21.b, z17.b\n"
280 "trn1 z16.b, z20.b, z19.b\n"
281 "st1b { z18.b }, p3, [%x[outptr], x27]\n"
282 "incb x27, ALL, MUL #4\n"
283 "st1b { z17.b }, p2, [%x[outptr], x26]\n"
284 "incb x26, ALL, MUL #4\n"
285 "st1b { z16.b }, p1, [%x[outptr], x25]\n"
286 "incb x25, ALL, MUL #4\n"
// Only p1 is refreshed: the 4-vector loop repeats only while the fourth
// (highest-offset) vector still has an active channel, in which case the
// three lower-offset vectors are necessarily fully active and p4..p2 (still
// all-true from entry) remain valid.
287 "whilelt p1.b, x25, %x[n_channels]\n"
288 "b.any 1b\n"
289 "7:" // Single vector of channels
// Remaining channels are handled one vector at a time, starting at x28.
290 "whilelt p4.b, x28, %x[n_channels]\n"
291 "b.none 14f\n"
292 "8:" // Single vector of channels: Loop
// Same structure as the 4-vector path, with a single accumulator z4.
293 "lsr x24, %x[n_valid_cells], #0x2\n"
294 "mov z4.b, #0x80\n"
295 "mov x19, %x[inptrs]\n"
296 "cbz x24, 11f\n"
297 "ldp x23, x22, [x19, #0x0]\n"
298 "subs x24, x24, #0x1\n"
299 "ld1b { z0.b }, p4/Z, [x23, x28]\n"
300 "ldp x21, x20, [x19, #0x10]\n"
301 "add x19, x19, #0x20\n"
302 "ld1b { z31.b }, p4/Z, [x22, x28]\n"
303 "ld1b { z23.b }, p4/Z, [x21, x28]\n"
304 "ld1b { z30.b }, p4/Z, [x20, x28]\n"
305 "beq 10f\n"
306 "9:" // Single vector of channels: Loop: 4 inputs loop
// Reduce the current four inputs while loading the next four.
307 "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
308 "smax z23.b, p0/M, z23.b, z30.b\n"
309 "ldp x23, x22, [x19, #0x0]\n"
310 "subs x24, x24, #0x1\n"
311 "smax z19.b, p0/M, z19.b, z23.b\n"
312 "ldp x21, x20, [x19, #0x10]\n"
313 "smax z4.b, p0/M, z4.b, z19.b\n"
314 "add x19, x19, #0x20\n"
315 "ld1b { z0.b }, p4/Z, [x23, x28]\n"
316 "ld1b { z31.b }, p4/Z, [x22, x28]\n"
317 "ld1b { z23.b }, p4/Z, [x21, x28]\n"
318 "ld1b { z30.b }, p4/Z, [x20, x28]\n"
319 "bgt 9b\n"
320 "10:" // Single vector of channels: Loop: 4 inputs tail
321 "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
322 "smax z23.b, p0/M, z23.b, z30.b\n"
323 "smax z19.b, p0/M, z19.b, z23.b\n"
324 "smax z4.b, p0/M, z4.b, z19.b\n"
325 "11:" // Single vector of channels: Loop: After loop
// Leftover inputs (n_valid_cells % 4), one per iteration.
326 "ands x20, %x[n_valid_cells], #0x3\n"
327 "beq 13f\n"
328 "12:" // Single vector of channels: Loop: Single input loop
329 "ldr x23, [x19], #0x8\n"
330 "ld1b { z0.b }, p4/Z, [x23, x28]\n"
331 "subs x20, x20, #0x1\n"
332 "smax z4.b, p0/M, z4.b, z0.b\n"
333 "bgt 12b\n"
334 "13:" // Single vector of channels: Loop: Single input loop: End
// Requantise the single vector: widen 8 -> 16 -> 32 bits, reload the three
// per-layer parameters, then shift / multiply / shift / clamp / repack exactly
// as in the 4-vector path.
335 ".inst 0x4508a097 // sshllb z23.h, z4.b, #0x0\n"
336 ".inst 0x4508a496 // sshllt z22.h, z4.b, #0x0\n"
337 "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
338 "ld1rw { z4.s }, p0/Z, [x19]\n"
339 ".inst 0x4510a2e1 // sshllb z1.s, z23.h, #0x0\n"
340 ".inst 0x4510a6f7 // sshllt z23.s, z23.h, #0x0\n"
341 "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
342 "ld1rw { z3.s }, p0/Z, [x19]\n"
343 ".inst 0x4510a2c0 // sshllb z0.s, z22.h, #0x0\n"
344 ".inst 0x4510a6df // sshllt z31.s, z22.h, #0x0\n"
345 "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
346 "ld1rw { z2.s }, p0/Z, [x19]\n"
347 ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
348 ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
349 ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
350 ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
351 ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
352 ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
353 ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
354 ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
355 "mov z19.s, #0x7f\n"
356 ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
357 ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
358 ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
359 ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
360 "not z16.s, p0/M, z19.s\n"
361 "smax z1.s, p0/M, z1.s, z16.s\n"
362 "smax z23.s, p0/M, z23.s, z16.s\n"
363 "smax z0.s, p0/M, z0.s, z16.s\n"
364 "smax z31.s, p0/M, z31.s, z16.s\n"
365 "smin z1.s, p0/M, z1.s, z19.s\n"
366 "smin z23.s, p0/M, z23.s, z19.s\n"
367 "trn1 z23.h, z1.h, z23.h\n"
368 "smin z0.s, p0/M, z0.s, z19.s\n"
369 "smin z31.s, p0/M, z31.s, z19.s\n"
370 "trn1 z16.h, z0.h, z31.h\n"
371 "trn1 z16.b, z23.b, z16.b\n"
372 "st1b { z16.b }, p4, [%x[outptr], x28]\n"
// Advance by one vector length and repeat while any channel remains.
373 "incb x28\n"
374 "whilelt p4.b, x28, %x[n_channels]\n"
375 "b.any 8b\n"
376 "14:" // End
377 ".inst 0xd503467f // SMSTOP\n"
// No output operands: results are written through outptr, which is why
// "memory" is in the clobber list. The offsetof operands use the "I"
// immediate constraint because they are added directly in an `add`.
378 :
379 : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
// Clobbers: flags, memory, every predicate and vector register, and the
// scratch general-purpose registers x19-x28 used above.
380 : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
381 );
382 }
383
384 } // namespace pooling
385 } // namespace arm_conv
386
387 #endif // defined(ARM_COMPUTE_ENABLE_SME)
388