1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const uint8_t** restrict a, x4
19#     const uint8_t* restrict w,  x5
20#     uint8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const uint8_t* zero,                [sp + 16] -> x12
25#     const xnn_qu8_conv_minmax_params params [sp + 24] -> (x11)
26
27# params structure is 20 bytes
28#  struct {
29#    uint8_t kernel_zero_point[4];
30#    int32_t right_pre_shift;
31#    int32_t multiplier;
32#    int32_t right_post_shift;
33#    int16_t output_zero_point;
34#    uint8_t output_min;
35#    uint8_t output_max;
36#  } rndnu_neon;
37#
38# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
39
40# Register usage
41# A0  x13  v0
42# A1  x14  v1
43# A2  x15  v2
44# A3  x20  v3
45# B    x5  v4  v5  v6
46# C0   x6 v16 v20 v24 v28
47# C1  x16 v17 v21 v25 v29
48# C2  x17 v18 v22 v26 v30
49# C3   x7 v19 v23 v27 v31
50# zero_point v7
51# unused  v8 v9 v10 v11 v12 v13 v14 v15
52# x11, x21 temp for Cortex-A53 loads (x11 also holds the params pointer outside the GEMM loops)
53
54BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
        // Entry sequence: fetch the four stack-passed arguments, derive the
        // C row pointers c1..c3 from c0 (rows at or beyond mr alias the
        // previous row), save x20-x21, and broadcast the first 4 bytes of
        // params (kernel_zero_point) into v7.
55
56        # Clamp C pointers
57        CMP     x0, 2                   // if mr < 2
58        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
59        ADD     x16, x6, x7             // c1 = c0 + cm_stride
60        CSEL    x16, x6,  x16, LO       //   c1 = c0
61
62        ADD     x17, x16, x7            // c2 = c1 + cm_stride
63        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
64                                        // if mr <= 2 (flags still from CMP x0, 2)
65        CSEL    x17, x16, x17, LS       //   c2 = c1
66
67        CMP     x0, 4                   // if mr < 4
68        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack; sp drops by 16, so params is now at [sp, 40]
69        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
70        CSEL    x7,  x17, x7, LO        //   c3 = c2
71
72        LD1R    {v7.4s}, [x11]          // kernel_zero_point (x11 is not advanced here)
73
74        .p2align 3
750:
        // nc loop entry: one pass per 16-column tile of C.
        // The 16 int32 bias values at w (4 per q register) initialise row 0
        // (v16, v20, v24, v28) and are copied to rows 1-3.
76        # Load initial bias from w into accumulators
77        LDP     q16, q20, [x5], 32
78        MOV     v17.16b, v16.16b
79        MOV     v18.16b, v16.16b
80        LDP     q24, q28, [x5], 32
81        MOV     v19.16b, v16.16b
82        MOV     v21.16b, v20.16b
83        ADD     x11, x11, 4              // adjust params pointer: skip the 4-byte kernel_zero_point
84        MOV     v22.16b, v20.16b
85        MOV     v23.16b, v20.16b
86        MOV     v25.16b, v24.16b
87        MOV     v26.16b, v24.16b
88        MOV     v27.16b, v24.16b
89        MOV     v29.16b, v28.16b
90        MOV     v30.16b, v28.16b
91        MOV     v31.16b, v28.16b
92        MOV     x9, x3                  // p = ks
93
94        .p2align 3
951:
        // ks loop entry: consume the next four A row pointers from the
        // indirection buffer. A pointer equal to `zero` is used as is;
        // any other pointer gets a_offset added.
96        # Load next 4 A pointers
97        LDP     x13, x14, [x4], 16
98        LDP     x15, x20, [x4], 16
99
100        CMP     x13, x12                // if a0 == zero
101        ADD     x13, x13, x8            // a0 += a_offset
102        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
103        CMP     x14, x12                // if a1 == zero
104        ADD     x14, x14, x8            // a1 += a_offset
105        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
106        CMP     x15, x12                // if a2 == zero
107        ADD     x15, x15, x8            // a2 += a_offset
108        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
109        CMP     x20, x12                // if a3 == zero
110        ADD     x20, x20, x8            // a3 += a_offset
111        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset
112
113        # Is there at least 8 bytes for epilogue?
114        SUBS    x0, x2, 8               // k = kc - 8
115        B.LO    5f                      // kc < 8: only the remainder code runs (x11 still holds params + 4)
116
117        # Prologue
        // Preload the first 8 bytes of each A row and the first 16 bytes of B
        // (d4 = bytes 0-7, d6 = bytes 8-15), widening to 16 bits. B has the
        // kernel zero point in v7 subtracted while widening. The following
        // 8 bytes of B are fetched into x11, which overwrites the params
        // pointer; it is reloaded from the stack in the epilogue.
118        LDR     d0, [x13], 8
119        LDP     d4, d6, [x5]
120        LDR     d1, [x14], 8
121        LDR     d2, [x15], 8
122        LDR     d3, [x20], 8
123        UXTL    v0.8h, v0.8b
124        LDR     x11, [x5, 16]
125        USUBL   v4.8h, v4.8b, v7.8b
126        UXTL    v1.8h, v1.8b
127        UXTL    v2.8h, v2.8b
128        UXTL    v3.8h, v3.8b
129        USUBL   v6.8h, v6.8b, v7.8b
130
131        SUBS    x0, x0, 8               // k = k - 8
132        # Is there at least 8 bytes for main loop?
133        B.LO    3f                      // fewer than 16 bytes in total: skip the main loop
134
135        # Main loop - 8 bytes of A
        // Each iteration consumes 8 bytes from each of the four A rows
        // (lanes h[0]..h[7] of v0-v3) and 128 bytes of B (16 bytes per k).
        // For every k, columns 0-7 accumulate into v16-v23 and columns 8-15
        // into v24-v31. B vectors rotate through v4/v5/v6: alternate 8-byte
        // groups are loaded directly with LDR d, the others via LDR x11
        // followed by INS. Each group is widened with USUBL against the zero
        // point in v7 before use. The tail of the loop loads the next 8 bytes
        // of A (a0 and a2 through x21/x11 + INS) and the first 24 bytes of
        // the next B group, so the loop-carried state matches the prologue.
136        .p2align 3
1372:
138        SMLAL   v16.4s, v4.4h, v0.h[0]
139        SMLAL2  v20.4s, v4.8h, v0.h[0]
140        PRFM    PLDL1KEEP, [x13, 128]
141        SMLAL   v17.4s, v4.4h, v1.h[0]
142        SMLAL2  v21.4s, v4.8h, v1.h[0]
143        PRFM    PLDL1KEEP, [x14, 128]
144        SMLAL   v18.4s, v4.4h, v2.h[0]
145        SMLAL2  v22.4s, v4.8h, v2.h[0]
146        PRFM    PLDL1KEEP, [x15, 128]
147        SMLAL   v19.4s, v4.4h, v3.h[0]
148        SMLAL2  v23.4s, v4.8h, v3.h[0]
149        PRFM    PLDL1KEEP, [x20, 128]
150        LDR     d4, [x5, 24]
151        INS     v5.d[0], x11
152        SMLAL   v24.4s, v6.4h, v0.h[0]
153        SMLAL2  v28.4s, v6.8h, v0.h[0]
154        PRFM    PLDL1KEEP, [x5, 448]
155        SMLAL   v25.4s, v6.4h, v1.h[0]
156        SMLAL2  v29.4s, v6.8h, v1.h[0]
157        PRFM    PLDL1KEEP, [x5, 512]
158        USUBL   v5.8h, v5.8b, v7.8b
159        SMLAL   v26.4s, v6.4h, v2.h[0]
160        SMLAL2  v30.4s, v6.8h, v2.h[0]
161        SMLAL   v27.4s, v6.4h, v3.h[0]
162        SMLAL2  v31.4s, v6.8h, v3.h[0]
163        LDR     x11, [x5, 32]
164        SMLAL   v16.4s, v5.4h, v0.h[1]
165        SMLAL2  v20.4s, v5.8h, v0.h[1]
166        SMLAL   v17.4s, v5.4h, v1.h[1]
167        SMLAL2  v21.4s, v5.8h, v1.h[1]
168        USUBL   v4.8h, v4.8b, v7.8b
169        SMLAL   v18.4s, v5.4h, v2.h[1]
170        SMLAL2  v22.4s, v5.8h, v2.h[1]
171        SMLAL   v19.4s, v5.4h, v3.h[1]
172        SMLAL2  v23.4s, v5.8h, v3.h[1]
173        LDR     d5, [x5, 40]
174        INS     v6.d[0], x11
175        SMLAL   v24.4s, v4.4h, v0.h[1]
176        SMLAL2  v28.4s, v4.8h, v0.h[1]
177        SMLAL   v25.4s, v4.4h, v1.h[1]
178        SMLAL2  v29.4s, v4.8h, v1.h[1]
179        USUBL   v6.8h, v6.8b, v7.8b
180        SMLAL   v26.4s, v4.4h, v2.h[1]
181        SMLAL2  v30.4s, v4.8h, v2.h[1]
182        SMLAL   v27.4s, v4.4h, v3.h[1]
183        SMLAL2  v31.4s, v4.8h, v3.h[1]
184        LDR     x11, [x5, 48]
185        SMLAL   v16.4s, v6.4h, v0.h[2]
186        SMLAL2  v20.4s, v6.8h, v0.h[2]
187        SMLAL   v17.4s, v6.4h, v1.h[2]
188        USUBL   v5.8h, v5.8b, v7.8b
189        SMLAL2  v21.4s, v6.8h, v1.h[2]
190        SMLAL   v18.4s, v6.4h, v2.h[2]
191        SMLAL2  v22.4s, v6.8h, v2.h[2]
192        SMLAL   v19.4s, v6.4h, v3.h[2]
193        SMLAL2  v23.4s, v6.8h, v3.h[2]
194        LDR     d6, [x5, 56]
195        INS     v4.d[0], x11
196        SMLAL   v24.4s, v5.4h, v0.h[2]
197        SMLAL2  v28.4s, v5.8h, v0.h[2]
198        SMLAL   v25.4s, v5.4h, v1.h[2]
199        SMLAL2  v29.4s, v5.8h, v1.h[2]
200        USUBL   v4.8h, v4.8b, v7.8b
201        SMLAL   v26.4s, v5.4h, v2.h[2]
202        SMLAL2  v30.4s, v5.8h, v2.h[2]
203        SMLAL   v27.4s, v5.4h, v3.h[2]
204        SMLAL2  v31.4s, v5.8h, v3.h[2]
205        LDR     x11, [x5, 64]
206        SMLAL   v16.4s, v4.4h, v0.h[3]
207        SMLAL2  v20.4s, v4.8h, v0.h[3]
208        SMLAL   v17.4s, v4.4h, v1.h[3]
209        SMLAL2  v21.4s, v4.8h, v1.h[3]
210        USUBL   v6.8h, v6.8b, v7.8b
211        SMLAL   v18.4s, v4.4h, v2.h[3]
212        SMLAL2  v22.4s, v4.8h, v2.h[3]
213        SMLAL   v19.4s, v4.4h, v3.h[3]
214        SMLAL2  v23.4s, v4.8h, v3.h[3]
215        LDR     d4, [x5, 72]
216        INS     v5.d[0], x11
217        SMLAL   v24.4s, v6.4h, v0.h[3]
218        SMLAL2  v28.4s, v6.8h, v0.h[3]
219        USUBL   v5.8h, v5.8b, v7.8b
220        SMLAL   v25.4s, v6.4h, v1.h[3]
221        SMLAL2  v29.4s, v6.8h, v1.h[3]
222        SMLAL   v26.4s, v6.4h, v2.h[3]
223        SMLAL2  v30.4s, v6.8h, v2.h[3]
224        SMLAL   v27.4s, v6.4h, v3.h[3]
225        SMLAL2  v31.4s, v6.8h, v3.h[3]
226        LDR     x11, [x5, 80]
227        SMLAL   v16.4s, v5.4h, v0.h[4]
228        SMLAL2  v20.4s, v5.8h, v0.h[4]
229        SMLAL   v17.4s, v5.4h, v1.h[4]
230        SMLAL2  v21.4s, v5.8h, v1.h[4]
231        USUBL   v4.8h, v4.8b, v7.8b
232        SMLAL   v18.4s, v5.4h, v2.h[4]
233        SMLAL2  v22.4s, v5.8h, v2.h[4]
234        SMLAL   v19.4s, v5.4h, v3.h[4]
235        SMLAL2  v23.4s, v5.8h, v3.h[4]
236        LDR     d5, [x5, 88]
237        INS     v6.d[0], x11
238        SMLAL   v24.4s, v4.4h, v0.h[4]
239        SMLAL2  v28.4s, v4.8h, v0.h[4]
240        SMLAL   v25.4s, v4.4h, v1.h[4]
241        SMLAL2  v29.4s, v4.8h, v1.h[4]
242        USUBL   v6.8h, v6.8b, v7.8b
243        SMLAL   v26.4s, v4.4h, v2.h[4]
244        SMLAL2  v30.4s, v4.8h, v2.h[4]
245        SMLAL   v27.4s, v4.4h, v3.h[4]
246        SMLAL2  v31.4s, v4.8h, v3.h[4]
247        LDR     x11, [x5, 96]
248        SMLAL   v16.4s, v6.4h, v0.h[5]
249        SMLAL2  v20.4s, v6.8h, v0.h[5]
250        SMLAL   v17.4s, v6.4h, v1.h[5]
251        SMLAL2  v21.4s, v6.8h, v1.h[5]
252        USUBL   v5.8h, v5.8b, v7.8b
253        SMLAL   v18.4s, v6.4h, v2.h[5]
254        SMLAL2  v22.4s, v6.8h, v2.h[5]
255        SMLAL   v19.4s, v6.4h, v3.h[5]
256        SMLAL2  v23.4s, v6.8h, v3.h[5]
257        LDR     d6, [x5, 104]
258        INS     v4.d[0], x11
259        SMLAL   v24.4s, v5.4h, v0.h[5]
260        SMLAL2  v28.4s, v5.8h, v0.h[5]
261        SMLAL   v25.4s, v5.4h, v1.h[5]
262        SMLAL2  v29.4s, v5.8h, v1.h[5]
263        USUBL   v4.8h, v4.8b, v7.8b
264        SMLAL   v26.4s, v5.4h, v2.h[5]
265        SMLAL2  v30.4s, v5.8h, v2.h[5]
266        SMLAL   v27.4s, v5.4h, v3.h[5]
267        SMLAL2  v31.4s, v5.8h, v3.h[5]
268        USUBL   v6.8h, v6.8b, v7.8b
269        LDR     x11, [x5, 112]
270        SMLAL   v16.4s, v4.4h, v0.h[6]
271        SMLAL2  v20.4s, v4.8h, v0.h[6]
272        SMLAL   v17.4s, v4.4h, v1.h[6]
273        SMLAL2  v21.4s, v4.8h, v1.h[6]
274        SMLAL   v18.4s, v4.4h, v2.h[6]
275        SMLAL2  v22.4s, v4.8h, v2.h[6]
276        SMLAL   v19.4s, v4.4h, v3.h[6]
277        SMLAL2  v23.4s, v4.8h, v3.h[6]
278        LDR     d5, [x5, 120]
279        INS     v4.d[0], x11
280        SMLAL   v24.4s, v6.4h, v0.h[6]
281        SMLAL2  v28.4s, v6.8h, v0.h[6]
282        SMLAL   v25.4s, v6.4h, v1.h[6]
283        SMLAL2  v29.4s, v6.8h, v1.h[6]
284        USUBL   v4.8h, v4.8b, v7.8b
285        ADD     x5, x5, 128             // advance w past the 128 bytes of B used by this iteration
286
287        SMLAL   v26.4s, v6.4h, v2.h[6]
288        SMLAL2  v30.4s, v6.8h, v2.h[6]
289        LDR     x11, [x5]               // first 8 bytes of the next B group
290        SMLAL   v27.4s, v6.4h, v3.h[6]
291        SMLAL2  v31.4s, v6.8h, v3.h[6]
292        USUBL   v5.8h, v5.8b, v7.8b
293        LDR     x21, [x13], 8           // next 8 bytes of a0, moved into v0 below
294
295        SMLAL   v16.4s, v4.4h, v0.h[7]
296        SMLAL2  v20.4s, v4.8h, v0.h[7]
297        SMLAL   v17.4s, v4.4h, v1.h[7]
298        SMLAL2  v21.4s, v4.8h, v1.h[7]
299        SMLAL   v18.4s, v4.4h, v2.h[7]
300        SMLAL2  v22.4s, v4.8h, v2.h[7]
301        SMLAL   v19.4s, v4.4h, v3.h[7]
302        SMLAL2  v23.4s, v4.8h, v3.h[7]
303        LDR     d6, [x5, 8]
304        INS     v4.d[0], x11
305        SMLAL   v24.4s, v5.4h, v0.h[7]
306        SMLAL2  v28.4s, v5.8h, v0.h[7]
307        LDR     x11, [x15], 8           // next 8 bytes of a2, moved into v2 below
308        SMLAL   v25.4s, v5.4h, v1.h[7]
309        SMLAL2  v29.4s, v5.8h, v1.h[7]
310        LDR     d1, [x14], 8
311        INS     v0.d[0], x21
312        SMLAL   v26.4s, v5.4h, v2.h[7]
313        SMLAL2  v30.4s, v5.8h, v2.h[7]
314        SMLAL   v27.4s, v5.4h, v3.h[7]
315        SMLAL2  v31.4s, v5.8h, v3.h[7]
316        LDR     d3, [x20], 8
317        INS     v2.d[0], x11
318
319        UXTL    v0.8h, v0.8b
320        UXTL    v1.8h, v1.8b
321        LDR     x11, [x5, 16]
322        USUBL   v4.8h, v4.8b, v7.8b
323        UXTL    v2.8h, v2.8b
324        SUBS    x0, x0, 8               // k -= 8
325        UXTL    v3.8h, v3.8b
326        USUBL   v6.8h, v6.8b, v7.8b
327        B.HS    2b                      // repeat while at least 8 more bytes follow the preloaded group
328
329        # Epilogue.  Same as main loop but no preloads in final group
        // Processes the 8 bytes of A and the first 24 bytes of B already
        // loaded by the prologue or by the previous main-loop iteration,
        // loading the remaining 104 bytes of this B group as it goes.
        // Nothing is loaded for a following group. x11 was used as a load
        // temporary, so the params pointer is reloaded from the stack and
        // advanced past kernel_zero_point. x0 receives kc % 8.
330
331        .p2align 3
3323:
333        SMLAL   v16.4s, v4.4h, v0.h[0]
334        SMLAL2  v20.4s, v4.8h, v0.h[0]
335        SMLAL   v17.4s, v4.4h, v1.h[0]
336        SMLAL2  v21.4s, v4.8h, v1.h[0]
337        SMLAL   v18.4s, v4.4h, v2.h[0]
338        SMLAL2  v22.4s, v4.8h, v2.h[0]
339        SMLAL   v19.4s, v4.4h, v3.h[0]
340        SMLAL2  v23.4s, v4.8h, v3.h[0]
341        LDR     d4, [x5, 24]
342        INS     v5.d[0], x11
343        SMLAL   v24.4s, v6.4h, v0.h[0]
344        SMLAL2  v28.4s, v6.8h, v0.h[0]
345        SMLAL   v25.4s, v6.4h, v1.h[0]
346        SMLAL2  v29.4s, v6.8h, v1.h[0]
347        USUBL   v5.8h, v5.8b, v7.8b
348        SMLAL   v26.4s, v6.4h, v2.h[0]
349        SMLAL2  v30.4s, v6.8h, v2.h[0]
350        SMLAL   v27.4s, v6.4h, v3.h[0]
351        SMLAL2  v31.4s, v6.8h, v3.h[0]
352        LDR     x11, [x5, 32]
353        SMLAL   v16.4s, v5.4h, v0.h[1]
354        SMLAL2  v20.4s, v5.8h, v0.h[1]
355        SMLAL   v17.4s, v5.4h, v1.h[1]
356        SMLAL2  v21.4s, v5.8h, v1.h[1]
357        USUBL   v4.8h, v4.8b, v7.8b
358        SMLAL   v18.4s, v5.4h, v2.h[1]
359        SMLAL2  v22.4s, v5.8h, v2.h[1]
360        SMLAL   v19.4s, v5.4h, v3.h[1]
361        SMLAL2  v23.4s, v5.8h, v3.h[1]
362        LDR     d5, [x5, 40]
363        INS     v6.d[0], x11
364        SMLAL   v24.4s, v4.4h, v0.h[1]
365        SMLAL2  v28.4s, v4.8h, v0.h[1]
366        SMLAL   v25.4s, v4.4h, v1.h[1]
367        SMLAL2  v29.4s, v4.8h, v1.h[1]
368        USUBL   v6.8h, v6.8b, v7.8b
369        SMLAL   v26.4s, v4.4h, v2.h[1]
370        SMLAL2  v30.4s, v4.8h, v2.h[1]
371        SMLAL   v27.4s, v4.4h, v3.h[1]
372        SMLAL2  v31.4s, v4.8h, v3.h[1]
373        LDR     x11, [x5, 48]
374        SMLAL   v16.4s, v6.4h, v0.h[2]
375        SMLAL2  v20.4s, v6.8h, v0.h[2]
376        SMLAL   v17.4s, v6.4h, v1.h[2]
377        USUBL   v5.8h, v5.8b, v7.8b
378        SMLAL2  v21.4s, v6.8h, v1.h[2]
379        SMLAL   v18.4s, v6.4h, v2.h[2]
380        SMLAL2  v22.4s, v6.8h, v2.h[2]
381        SMLAL   v19.4s, v6.4h, v3.h[2]
382        SMLAL2  v23.4s, v6.8h, v3.h[2]
383        LDR     d6, [x5, 56]
384        INS     v4.d[0], x11
385        SMLAL   v24.4s, v5.4h, v0.h[2]
386        SMLAL2  v28.4s, v5.8h, v0.h[2]
387        SMLAL   v25.4s, v5.4h, v1.h[2]
388        SMLAL2  v29.4s, v5.8h, v1.h[2]
389        USUBL   v4.8h, v4.8b, v7.8b
390        SMLAL   v26.4s, v5.4h, v2.h[2]
391        SMLAL2  v30.4s, v5.8h, v2.h[2]
392        SMLAL   v27.4s, v5.4h, v3.h[2]
393        SMLAL2  v31.4s, v5.8h, v3.h[2]
394        LDR     x11, [x5, 64]
395        SMLAL   v16.4s, v4.4h, v0.h[3]
396        SMLAL2  v20.4s, v4.8h, v0.h[3]
397        SMLAL   v17.4s, v4.4h, v1.h[3]
398        SMLAL2  v21.4s, v4.8h, v1.h[3]
399        USUBL   v6.8h, v6.8b, v7.8b
400        SMLAL   v18.4s, v4.4h, v2.h[3]
401        SMLAL2  v22.4s, v4.8h, v2.h[3]
402        SMLAL   v19.4s, v4.4h, v3.h[3]
403        SMLAL2  v23.4s, v4.8h, v3.h[3]
404        LDR     d4, [x5, 72]
405        INS     v5.d[0], x11
406        SMLAL   v24.4s, v6.4h, v0.h[3]
407        SMLAL2  v28.4s, v6.8h, v0.h[3]
408        USUBL   v5.8h, v5.8b, v7.8b
409        SMLAL   v25.4s, v6.4h, v1.h[3]
410        SMLAL2  v29.4s, v6.8h, v1.h[3]
411        SMLAL   v26.4s, v6.4h, v2.h[3]
412        SMLAL2  v30.4s, v6.8h, v2.h[3]
413        SMLAL   v27.4s, v6.4h, v3.h[3]
414        SMLAL2  v31.4s, v6.8h, v3.h[3]
415        LDR     x11, [x5, 80]
416        SMLAL   v16.4s, v5.4h, v0.h[4]
417        SMLAL2  v20.4s, v5.8h, v0.h[4]
418        SMLAL   v17.4s, v5.4h, v1.h[4]
419        SMLAL2  v21.4s, v5.8h, v1.h[4]
420        USUBL   v4.8h, v4.8b, v7.8b
421        SMLAL   v18.4s, v5.4h, v2.h[4]
422        SMLAL2  v22.4s, v5.8h, v2.h[4]
423        SMLAL   v19.4s, v5.4h, v3.h[4]
424        SMLAL2  v23.4s, v5.8h, v3.h[4]
425        LDR     d5, [x5, 88]
426        INS     v6.d[0], x11
427        SMLAL   v24.4s, v4.4h, v0.h[4]
428        SMLAL2  v28.4s, v4.8h, v0.h[4]
429        SMLAL   v25.4s, v4.4h, v1.h[4]
430        SMLAL2  v29.4s, v4.8h, v1.h[4]
431        USUBL   v6.8h, v6.8b, v7.8b
432        SMLAL   v26.4s, v4.4h, v2.h[4]
433        SMLAL2  v30.4s, v4.8h, v2.h[4]
434        SMLAL   v27.4s, v4.4h, v3.h[4]
435        SMLAL2  v31.4s, v4.8h, v3.h[4]
436        LDR     x11, [x5, 96]
437        SMLAL   v16.4s, v6.4h, v0.h[5]
438        SMLAL2  v20.4s, v6.8h, v0.h[5]
439        SMLAL   v17.4s, v6.4h, v1.h[5]
440        SMLAL2  v21.4s, v6.8h, v1.h[5]
441        USUBL   v5.8h, v5.8b, v7.8b
442        SMLAL   v18.4s, v6.4h, v2.h[5]
443        SMLAL2  v22.4s, v6.8h, v2.h[5]
444        SMLAL   v19.4s, v6.4h, v3.h[5]
445        SMLAL2  v23.4s, v6.8h, v3.h[5]
446        LDR     d6, [x5, 104]
447        INS     v4.d[0], x11
448        SMLAL   v24.4s, v5.4h, v0.h[5]
449        SMLAL2  v28.4s, v5.8h, v0.h[5]
450        SMLAL   v25.4s, v5.4h, v1.h[5]
451        SMLAL2  v29.4s, v5.8h, v1.h[5]
452        USUBL   v4.8h, v4.8b, v7.8b
453        SMLAL   v26.4s, v5.4h, v2.h[5]
454        SMLAL2  v30.4s, v5.8h, v2.h[5]
455        SMLAL   v27.4s, v5.4h, v3.h[5]
456        SMLAL2  v31.4s, v5.8h, v3.h[5]
457        USUBL   v6.8h, v6.8b, v7.8b
458        SMLAL   v16.4s, v4.4h, v0.h[6]
459        SMLAL2  v20.4s, v4.8h, v0.h[6]
460        SMLAL   v17.4s, v4.4h, v1.h[6]
461        SMLAL2  v21.4s, v4.8h, v1.h[6]
462        SMLAL   v18.4s, v4.4h, v2.h[6]
463        SMLAL2  v22.4s, v4.8h, v2.h[6]
464        SMLAL   v19.4s, v4.4h, v3.h[6]
465        SMLAL2  v23.4s, v4.8h, v3.h[6]
466        LDR     x11, [x5, 112]
467        SMLAL   v24.4s, v6.4h, v0.h[6]
468        SMLAL2  v28.4s, v6.8h, v0.h[6]
469        SMLAL   v25.4s, v6.4h, v1.h[6]
470        SMLAL2  v29.4s, v6.8h, v1.h[6]
471        LDR     d5, [x5, 120]
472        INS     v4.d[0], x11
473        USUBL   v4.8h, v4.8b, v7.8b
474        SMLAL   v26.4s, v6.4h, v2.h[6]
475        SMLAL2  v30.4s, v6.8h, v2.h[6]
476        SMLAL   v27.4s, v6.4h, v3.h[6]
477        SMLAL2  v31.4s, v6.8h, v3.h[6]
478        SMLAL   v16.4s, v4.4h, v0.h[7]
479        SMLAL2  v20.4s, v4.8h, v0.h[7]
480        SMLAL   v17.4s, v4.4h, v1.h[7]
481        SMLAL2  v21.4s, v4.8h, v1.h[7]
482        USUBL   v5.8h, v5.8b, v7.8b
483        SMLAL   v18.4s, v4.4h, v2.h[7]
484        SMLAL2  v22.4s, v4.8h, v2.h[7]
485        SMLAL   v19.4s, v4.4h, v3.h[7]
486        SMLAL2  v23.4s, v4.8h, v3.h[7]
487        ADD     x5, x5, 128             // advance w past this 128-byte B group
488        SMLAL   v24.4s, v5.4h, v0.h[7]
489        SMLAL2  v28.4s, v5.8h, v0.h[7]
490        SMLAL   v25.4s, v5.4h, v1.h[7]
491        SMLAL2  v29.4s, v5.8h, v1.h[7]
492        AND     x0, x2, 7               // kc remainder 0 to 7
493        SMLAL   v26.4s, v5.4h, v2.h[7]
494        SMLAL2  v30.4s, v5.8h, v2.h[7]
495        LDR     x11, [sp, 40]            // reload params pointer ([sp + 24] at entry, +16 for the x20-x21 save)
496        SMLAL   v27.4s, v5.4h, v3.h[7]
497        SMLAL2  v31.4s, v5.8h, v3.h[7]
498        ADD     x11, x11, 4             // x11 = params + 4, the same value label 0 leaves in x11
499
500        # Is there a remainder?- 1 to 7 bytes of A
501        CBNZ    x0, 5f
502
5034:
        // End of one ks step. Loops back to label 1 while indirection
        // pointers remain; otherwise requantizes the 4x16 int32 accumulators
        // to uint8 and stores them. On entry x11 = params + 4, so the
        // post-indexed LD1R loads below walk the params fields that follow
        // kernel_zero_point, in the order of the struct in the file header.
504        # ks loop
505        SUBS    x9, x9, 32              // ks -= MR * sizeof(uint8_t*)
506        B.HI    1b
507
508        # Apply params - preshift, scale, postshift, bias and clamp
509        LD1R    {v4.4s}, [x11], 4       // right_pre_shift
510        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
511        SQSHL   v17.4s, v17.4s, v4.4s
512        SQSHL   v18.4s, v18.4s, v4.4s
513        SQSHL   v19.4s, v19.4s, v4.4s
514        SQSHL   v20.4s, v20.4s, v4.4s
515        SQSHL   v21.4s, v21.4s, v4.4s
516        SQSHL   v22.4s, v22.4s, v4.4s
517        SQSHL   v23.4s, v23.4s, v4.4s
518        LD1R    {v5.4s}, [x11], 4       // multiplier
519        SQSHL   v24.4s, v24.4s, v4.4s
520        SQSHL   v25.4s, v25.4s, v4.4s
521        SQSHL   v26.4s, v26.4s, v4.4s
522        SQSHL   v27.4s, v27.4s, v4.4s
523        SQSHL   v28.4s, v28.4s, v4.4s
524        SQSHL   v29.4s, v29.4s, v4.4s
525        SQSHL   v30.4s, v30.4s, v4.4s
526        SQSHL   v31.4s, v31.4s, v4.4s
527        LD1R    {v6.4s}, [x11], 4       // right_post_shift
528        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
529        SQDMULH v17.4s, v17.4s, v5.4s
530        SQDMULH v18.4s, v18.4s, v5.4s
531        SQDMULH v19.4s, v19.4s, v5.4s
532        SQDMULH v20.4s, v20.4s, v5.4s
533        SQDMULH v21.4s, v21.4s, v5.4s
534        SQDMULH v22.4s, v22.4s, v5.4s
535        SQDMULH v23.4s, v23.4s, v5.4s
536        SQDMULH v24.4s, v24.4s, v5.4s
537        SQDMULH v25.4s, v25.4s, v5.4s
538        SQDMULH v26.4s, v26.4s, v5.4s
539        SQDMULH v27.4s, v27.4s, v5.4s
540        SQDMULH v28.4s, v28.4s, v5.4s
541        SQDMULH v29.4s, v29.4s, v5.4s
542        SQDMULH v30.4s, v30.4s, v5.4s
543        SQDMULH v31.4s, v31.4s, v5.4s
544        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
545        SRSHL   v17.4s, v17.4s, v6.4s
546        SRSHL   v18.4s, v18.4s, v6.4s
547        SRSHL   v19.4s, v19.4s, v6.4s
548        SRSHL   v20.4s, v20.4s, v6.4s
549        SRSHL   v21.4s, v21.4s, v6.4s
550        SRSHL   v22.4s, v22.4s, v6.4s
551        SRSHL   v23.4s, v23.4s, v6.4s
552        SRSHL   v24.4s, v24.4s, v6.4s
553        SRSHL   v25.4s, v25.4s, v6.4s
554        SRSHL   v26.4s, v26.4s, v6.4s
555        SRSHL   v27.4s, v27.4s, v6.4s
556        SRSHL   v28.4s, v28.4s, v6.4s
557        SRSHL   v29.4s, v29.4s, v6.4s
558        SRSHL   v30.4s, v30.4s, v6.4s
559        SRSHL   v31.4s, v31.4s, v6.4s
560
        // Saturating narrow int32 -> int16: columns 0-7 of each row end up in
        // v16-v19, columns 8-15 in v24-v27.
561        SQXTN   v16.4h, v16.4s
562        SQXTN   v17.4h, v17.4s
563        SQXTN   v18.4h, v18.4s
564        SQXTN   v19.4h, v19.4s
565        SQXTN   v24.4h, v24.4s
566        SQXTN   v25.4h, v25.4s
567        SQXTN   v26.4h, v26.4s
568        SQXTN   v27.4h, v27.4s
569        LD1R    {v6.8h}, [x11], 2        // output_zero_point, added as bias below
570
571        SQXTN2  v16.8h, v20.4s
572        SQXTN2  v17.8h, v21.4s
573        SQXTN2  v18.8h, v22.4s
574        SQXTN2  v19.8h, v23.4s
575        SQXTN2  v24.8h, v28.4s
576        SQXTN2  v25.8h, v29.4s
577        SQXTN2  v26.8h, v30.4s
578        SQXTN2  v27.8h, v31.4s
579
580        SQADD   v16.8h, v16.8h, v6.8h
581        SQADD   v17.8h, v17.8h, v6.8h
582        SQADD   v18.8h, v18.8h, v6.8h
583        SQADD   v19.8h, v19.8h, v6.8h
584        SQADD   v24.8h, v24.8h, v6.8h
585        SQADD   v25.8h, v25.8h, v6.8h
586        SQADD   v26.8h, v26.8h, v6.8h
587        SQADD   v27.8h, v27.8h, v6.8h
588        LD1R    {v4.16b}, [x11], 1       // clamp min value
589
        // Saturating narrow int16 -> uint8: v0-v3 hold the 16 output bytes of
        // rows 0-3.
590        SQXTUN   v0.8b, v16.8h
591        SQXTUN   v1.8b, v17.8h
592        SQXTUN   v2.8b, v18.8h
593        SQXTUN   v3.8b, v19.8h
594        LD1R    {v5.16b}, [x11]          // clamp max value
595        SQXTUN2  v0.16b, v24.8h
596        SQXTUN2  v1.16b, v25.8h
597        SQXTUN2  v2.16b, v26.8h
598        SQXTUN2  v3.16b, v27.8h
599        SUB     x11, x11, 19             // rewind params pointer to params + 0 (label 0 adds the 4 back)
600
601        UMAX    v0.16b, v0.16b, v4.16b
602        UMAX    v1.16b, v1.16b, v4.16b
603        UMAX    v2.16b, v2.16b, v4.16b
604        UMAX    v3.16b, v3.16b, v4.16b
605        SUBS    x1, x1, 16              // nc -= 16; flags feed both B.LO below and B.HI at the nc loop
606        UMIN    v0.16b, v0.16b, v5.16b
607        UMIN    v1.16b, v1.16b, v5.16b
608        UMIN    v2.16b, v2.16b, v5.16b
609        UMIN    v3.16b, v3.16b, v5.16b
610        B.LO    6f                      // fewer than 16 columns left: partial store
611
612        # Store full 4 x 16
613        ST1     {v3.16b},  [x7], x10
614        ST1     {v2.16b}, [x17], x10
615        ST1     {v1.16b}, [x16], x10
616        ST1     {v0.16b},  [x6], x10
617
618        SUB     x4, x4, x3              // a -= ks
619
620        # nc loop
621        B.HI    0b                      // columns remain after this tile
622
623        # Restore x20-x21 from stack
624        LDP     x20, x21, [sp], 16
625        RET
626
627        # Remainder- 1 to 7 bytes of A
        // Handles the last kc % 8 bytes of each A row. Each LD1 reads a full
        // 8 bytes per row but post-increments the pointer by x0 only; lanes
        // h[0]..h[x0-1] are the ones consumed. Every k step loads 16 bytes of
        // B (columns 0-7 in d4, 8-15 in d5) and subtracts the zero point in
        // v7 while widening. One CMP serves two exits: B.LO tests
        // "x0 < n", and after the next step B.EQ tests "x0 == n" on the same
        // flags, since none of the instructions in between write the flags.
628        .p2align 3
6295:
630        AND     x0, x2, 7               // kc remainder 1 to 7
631
632        LD1     {v0.8b}, [x13], x0
633        LDP     d4, d5, [x5], 16
634        LD1     {v1.8b}, [x14], x0
635        LD1     {v2.8b}, [x15], x0
636        LD1     {v3.8b}, [x20], x0
637        UXTL    v0.8h, v0.8b
638        USUBL   v4.8h, v4.8b, v7.8b
639        USUBL   v5.8h, v5.8b, v7.8b
640        UXTL    v1.8h, v1.8b
641        UXTL    v2.8h, v2.8b
642        UXTL    v3.8h, v3.8b
643        SMLAL   v16.4s, v4.4h, v0.h[0]
644        SMLAL2  v20.4s, v4.8h, v0.h[0]
645        SMLAL   v24.4s, v5.4h, v0.h[0]
646        SMLAL2  v28.4s, v5.8h, v0.h[0]
647        SMLAL   v17.4s, v4.4h, v1.h[0]
648        SMLAL2  v21.4s, v4.8h, v1.h[0]
649        SMLAL   v25.4s, v5.4h, v1.h[0]
650        SMLAL2  v29.4s, v5.8h, v1.h[0]
651        SMLAL   v18.4s, v4.4h, v2.h[0]
652        SMLAL2  v22.4s, v4.8h, v2.h[0]
653        SMLAL   v26.4s, v5.4h, v2.h[0]
654        SMLAL2  v30.4s, v5.8h, v2.h[0]
655        SMLAL   v19.4s, v4.4h, v3.h[0]
656        SMLAL2  v23.4s, v4.8h, v3.h[0]
657        SMLAL   v27.4s, v5.4h, v3.h[0]
658        SMLAL2  v31.4s, v5.8h, v3.h[0]
659        CMP     x0, 2
660        B.LO    4b                      // remainder == 1
661
662        LDP     d4, d5, [x5], 16
663        USUBL   v4.8h, v4.8b, v7.8b
664        USUBL   v5.8h, v5.8b, v7.8b
665        SMLAL   v16.4s, v4.4h, v0.h[1]
666        SMLAL2  v20.4s, v4.8h, v0.h[1]
667        SMLAL   v24.4s, v5.4h, v0.h[1]
668        SMLAL2  v28.4s, v5.8h, v0.h[1]
669        SMLAL   v17.4s, v4.4h, v1.h[1]
670        SMLAL2  v21.4s, v4.8h, v1.h[1]
671        SMLAL   v25.4s, v5.4h, v1.h[1]
672        SMLAL2  v29.4s, v5.8h, v1.h[1]
673        SMLAL   v18.4s, v4.4h, v2.h[1]
674        SMLAL2  v22.4s, v4.8h, v2.h[1]
675        SMLAL   v26.4s, v5.4h, v2.h[1]
676        SMLAL2  v30.4s, v5.8h, v2.h[1]
677        SMLAL   v19.4s, v4.4h, v3.h[1]
678        SMLAL2  v23.4s, v4.8h, v3.h[1]
679        SMLAL   v27.4s, v5.4h, v3.h[1]
680        SMLAL2  v31.4s, v5.8h, v3.h[1]
681        B.EQ    4b                      // remainder == 2
682
683        LDP     d4, d5, [x5], 16
684        USUBL   v4.8h, v4.8b, v7.8b
685        USUBL   v5.8h, v5.8b, v7.8b
686        SMLAL   v16.4s, v4.4h, v0.h[2]
687        SMLAL2  v20.4s, v4.8h, v0.h[2]
688        SMLAL   v24.4s, v5.4h, v0.h[2]
689        SMLAL2  v28.4s, v5.8h, v0.h[2]
690        SMLAL   v17.4s, v4.4h, v1.h[2]
691        SMLAL2  v21.4s, v4.8h, v1.h[2]
692        SMLAL   v25.4s, v5.4h, v1.h[2]
693        SMLAL2  v29.4s, v5.8h, v1.h[2]
694        SMLAL   v18.4s, v4.4h, v2.h[2]
695        SMLAL2  v22.4s, v4.8h, v2.h[2]
696        SMLAL   v26.4s, v5.4h, v2.h[2]
697        SMLAL2  v30.4s, v5.8h, v2.h[2]
698        SMLAL   v19.4s, v4.4h, v3.h[2]
699        SMLAL2  v23.4s, v4.8h, v3.h[2]
700        SMLAL   v27.4s, v5.4h, v3.h[2]
701        SMLAL2  v31.4s, v5.8h, v3.h[2]
702        CMP     x0, 4
703        B.LO    4b                      // remainder == 3
704
705        LDP     d4, d5, [x5], 16
706        USUBL   v4.8h, v4.8b, v7.8b
707        USUBL   v5.8h, v5.8b, v7.8b
708        SMLAL   v16.4s, v4.4h, v0.h[3]
709        SMLAL2  v20.4s, v4.8h, v0.h[3]
710        SMLAL   v24.4s, v5.4h, v0.h[3]
711        SMLAL2  v28.4s, v5.8h, v0.h[3]
712        SMLAL   v17.4s, v4.4h, v1.h[3]
713        SMLAL2  v21.4s, v4.8h, v1.h[3]
714        SMLAL   v25.4s, v5.4h, v1.h[3]
715        SMLAL2  v29.4s, v5.8h, v1.h[3]
716        SMLAL   v18.4s, v4.4h, v2.h[3]
717        SMLAL2  v22.4s, v4.8h, v2.h[3]
718        SMLAL   v26.4s, v5.4h, v2.h[3]
719        SMLAL2  v30.4s, v5.8h, v2.h[3]
720        SMLAL   v19.4s, v4.4h, v3.h[3]
721        SMLAL2  v23.4s, v4.8h, v3.h[3]
722        SMLAL   v27.4s, v5.4h, v3.h[3]
723        SMLAL2  v31.4s, v5.8h, v3.h[3]
724        B.EQ    4b                      // remainder == 4
725
726        LDP     d4, d5, [x5], 16
727        USUBL   v4.8h, v4.8b, v7.8b
728        USUBL   v5.8h, v5.8b, v7.8b
729        SMLAL   v16.4s, v4.4h, v0.h[4]
730        SMLAL2  v20.4s, v4.8h, v0.h[4]
731        SMLAL   v24.4s, v5.4h, v0.h[4]
732        SMLAL2  v28.4s, v5.8h, v0.h[4]
733        SMLAL   v17.4s, v4.4h, v1.h[4]
734        SMLAL2  v21.4s, v4.8h, v1.h[4]
735        SMLAL   v25.4s, v5.4h, v1.h[4]
736        SMLAL2  v29.4s, v5.8h, v1.h[4]
737        SMLAL   v18.4s, v4.4h, v2.h[4]
738        SMLAL2  v22.4s, v4.8h, v2.h[4]
739        SMLAL   v26.4s, v5.4h, v2.h[4]
740        SMLAL2  v30.4s, v5.8h, v2.h[4]
741        SMLAL   v19.4s, v4.4h, v3.h[4]
742        SMLAL2  v23.4s, v4.8h, v3.h[4]
743        SMLAL   v27.4s, v5.4h, v3.h[4]
744        SMLAL2  v31.4s, v5.8h, v3.h[4]
745        CMP     x0, 6
746        B.LO    4b                      // remainder == 5
747
748        LDP     d4, d5, [x5], 16
749        USUBL   v4.8h, v4.8b, v7.8b
750        USUBL   v5.8h, v5.8b, v7.8b
751        SMLAL   v16.4s, v4.4h, v0.h[5]
752        SMLAL2  v20.4s, v4.8h, v0.h[5]
753        SMLAL   v24.4s, v5.4h, v0.h[5]
754        SMLAL2  v28.4s, v5.8h, v0.h[5]
755        SMLAL   v17.4s, v4.4h, v1.h[5]
756        SMLAL2  v21.4s, v4.8h, v1.h[5]
757        SMLAL   v25.4s, v5.4h, v1.h[5]
758        SMLAL2  v29.4s, v5.8h, v1.h[5]
759        SMLAL   v18.4s, v4.4h, v2.h[5]
760        SMLAL2  v22.4s, v4.8h, v2.h[5]
761        SMLAL   v26.4s, v5.4h, v2.h[5]
762        SMLAL2  v30.4s, v5.8h, v2.h[5]
763        SMLAL   v19.4s, v4.4h, v3.h[5]
764        SMLAL2  v23.4s, v4.8h, v3.h[5]
765        SMLAL   v27.4s, v5.4h, v3.h[5]
766        SMLAL2  v31.4s, v5.8h, v3.h[5]
767        B.EQ    4b                      // remainder == 6
768
769        LDP     d4, d5, [x5], 16
770        USUBL   v4.8h, v4.8b, v7.8b
771        USUBL   v5.8h, v5.8b, v7.8b
772        SMLAL   v16.4s, v4.4h, v0.h[6]
773        SMLAL2  v20.4s, v4.8h, v0.h[6]
774        SMLAL   v24.4s, v5.4h, v0.h[6]
775        SMLAL2  v28.4s, v5.8h, v0.h[6]
776        SMLAL   v17.4s, v4.4h, v1.h[6]
777        SMLAL2  v21.4s, v4.8h, v1.h[6]
778        SMLAL   v25.4s, v5.4h, v1.h[6]
779        SMLAL2  v29.4s, v5.8h, v1.h[6]
780        SMLAL   v18.4s, v4.4h, v2.h[6]
781        SMLAL2  v22.4s, v4.8h, v2.h[6]
782        SMLAL   v26.4s, v5.4h, v2.h[6]
783        SMLAL2  v30.4s, v5.8h, v2.h[6]
784        SMLAL   v19.4s, v4.4h, v3.h[6]
785        SMLAL2  v23.4s, v4.8h, v3.h[6]
786        SMLAL   v27.4s, v5.4h, v3.h[6]
787        SMLAL2  v31.4s, v5.8h, v3.h[6]
788        B       4b                      // remainder == 7
789
790        # Store odd width
        // Reached when fewer than 16 columns remain. x1 holds nc - 16, whose
        // low four bits equal those of nc, so bits 3..0 select stores of
        // 8, 4, 2 and 1 bytes per row. After each partial store the DUP
        // moves the next unwritten element of v0-v3 down to element 0.
791        .p2align 3
7926:
793        TBZ     x1, 3, 7f
794        STR     d3, [x7], 8
795        STR     d2, [x17], 8
796        DUP     d3, v3.d[1]
797        DUP     d2, v2.d[1]
798        STR     d1, [x16], 8
799        STR     d0, [x6], 8
800        DUP     d1, v1.d[1]
801        DUP     d0, v0.d[1]
8027:
803        TBZ     x1, 2, 8f
804        STR     s3, [x7], 4
805        STR     s2, [x17], 4
806        DUP     s3, v3.s[1]
807        DUP     s2, v2.s[1]
808        STR     s1, [x16], 4
809        STR     s0, [x6], 4
810        DUP     s1, v1.s[1]
811        DUP     s0, v0.s[1]
8128:
813        TBZ     x1, 1, 9f
814        STR     h3, [x7], 2
815        STR     h2, [x17], 2
816        DUP     h3, v3.h[1]
817        DUP     h2, v2.h[1]
818        STR     h1, [x16], 2
819        STR     h0, [x6], 2
820        DUP     h1, v1.h[1]
821        DUP     h0, v0.h[1]
8229:
823        TBZ     x1, 0, 10f
824        STR     b3, [x7]
825        STR     b2, [x17]
826        STR     b1, [x16]
827        STR     b0, [x6]
82810:
829        # Restore x20-x21 from stack
830        LDP     x20, x21, [sp], 16
831        RET
832
833END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
834
835#ifdef __ELF__
836.section ".note.GNU-stack","",%progbits
837#endif
838