1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t** restrict a, x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0
31# A1  x14  v1
32# A2  x15  v2
33# A3  x20  v3
34# B    x5  v4  v5  v6
35# C0   x6 v16 v20 v24 v28
36# C1  x16 v17 v21 v25 v29
37# C2  x17 v18 v22 v26 v30
38# C3   x7 v19 v23 v27 v31
39# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
40# x11, x21 temp for Cortex-A53 loads
41
42BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
        # Signed 8-bit indirect-GEMM micro-kernel producing a tile of up to
        # 4 rows x 16 columns of int8 output per pass of the nc loop.
        #
        # Control flow (numeric local labels):
        #   0:    nc loop   - start a 16-column tile: load 16 int32 bias values
        #                     from w and copy them into the accumulators of all
        #                     4 rows.
        #   1:    ks loop   - fetch the next 4 A row pointers from a and apply
        #                     the a_offset / zero-pointer substitution.
        #   2/3:  main loop / epilogue - consume 8 bytes of every A row together
        #                     with 8 x 16 bytes of w, widening int8 -> int16 and
        #                     accumulating into int32 lanes with SMLAL/SMLAL2.
        #   5:    remainder - handle the final kc % 8 (1..7) bytes of A.
        #   4:    end of the ks loop, then requantize, clamp and store the tile.
        #   6-10: store a partial tile when fewer than 16 columns remain.
        #
        # Accumulator layout for row r = 0..3:
        #   v(16+r) = columns 0-3,  v(20+r) = columns 4-7,
        #   v(24+r) = columns 8-11, v(28+r) = columns 12-15.
43
44        # Clamp C pointers
        # Rows at or beyond mr reuse the pointer of the previous row, so all
        # 4 rows can be computed and stored unconditionally.  The stores below
        # are issued in the order c3, c2, c1, c0, so when pointers alias the
        # lowest-numbered row is the one written last.
45        CMP     x0, 2                   // if mr < 2
46        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
47        ADD     x16, x6, x7             // c1 = c0 + cm_stride
48        CSEL    x16, x6,  x16, LO       //   c1 = c0
49
50        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        # x11 (params) is reused as a scratch register inside the kc loops and
        # is reloaded from the stack at the end of the epilogue.
51        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        # The LS test below still uses the flags of the CMP x0, 2 above; the
        # ADD, LDP and CSEL instructions in between do not modify flags.
52                                        // if mr <= 2
53        CSEL    x17, x16, x17, LS       //   c2 = c1
54
55        CMP     x0, 4                   // if mr < 4
        # The pre-indexed store lowers sp by 16, so from here on the stack
        # arguments sit 16 bytes further from sp (params is at [sp, 40]).
56        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
57        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
58        CSEL    x7,  x17, x7, LO        //   c3 = c2
59
60
61        .p2align 3
620:
63        # Load initial bias from w into accumulators
        # 64 bytes (16 x int32) are read from w into row 0 (v16, v20, v24, v28)
        # and then copied to the matching accumulators of rows 1-3.
64        LDP     q16, q20, [x5], 32
65        MOV     v17.16b, v16.16b
66        MOV     v18.16b, v16.16b
67        LDP     q24, q28, [x5], 32
68        MOV     v19.16b, v16.16b
69        MOV     v21.16b, v20.16b
70        MOV     v22.16b, v20.16b
71        MOV     v23.16b, v20.16b
72        MOV     v25.16b, v24.16b
73        MOV     v26.16b, v24.16b
74        MOV     v27.16b, v24.16b
75        MOV     v29.16b, v28.16b
76        MOV     v30.16b, v28.16b
77        MOV     v31.16b, v28.16b
78        MOV     x9, x3                  // p = ks
79
80        .p2align 3
811:
82        # Load next 4 A pointers
83        LDP     x13, x14, [x4], 16
84        LDP     x15, x20, [x4], 16
85
        # A pointer equal to the zero pointer is used unchanged; every other
        # pointer has a_offset added.  The ADD is done unconditionally and the
        # CSEL picks the original zero pointer when the compare matched.
86        CMP     x13, x12                // if a0 == zero
87        ADD     x13, x13, x8            // a0 += a_offset
88        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
89        CMP     x14, x12                // if a1 == zero
90        ADD     x14, x14, x8            // a1 += a_offset
91        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
92        CMP     x15, x12                // if a2 == zero
93        ADD     x15, x15, x8            // a2 += a_offset
94        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
95        CMP     x20, x12                // if a3 == zero
96        ADD     x20, x20, x8            // a3 += a_offset
97        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset
98
99        # Is there at least 8 bytes for epilogue?
        # If kc < 8 the prologue/main loop/epilogue are skipped entirely and
        # only the remainder code at 5: runs.
100        SUBS    x0, x2, 8               // k = kc - 8
101        B.LO    5f
102
103        # Prologue
        # Loads the first 8 bytes of each A row (v0-v3) and the first 16 bytes
        # of w: d4 feeds columns 0-7 and d6 feeds columns 8-15 of step k = 0.
        # x11 receives the columns 0-7 half of step k = 1; it is moved into v5
        # with INS inside the loop.  All int8 data is sign-extended to int16.
104        LDR     d0, [x13], 8
105        LDP     d4, d6, [x5]
106        LDR     d1, [x14], 8
107        LDR     d2, [x15], 8
108        LDR     d3, [x20], 8
109        SXTL    v0.8h, v0.8b
110        LDR     x11, [x5, 16]
111        SXTL    v4.8h, v4.8b
112        SXTL    v1.8h, v1.8b
113        SXTL    v2.8h, v2.8b
114        SXTL    v3.8h, v3.8b
115        SXTL    v6.8h, v6.8b
116
117        SUBS    x0, x0, 8               // k = k - 8
118        # Is there at least 8 bytes for main loop?
119        B.LO    3f
120
121        # Main loop - 8 bytes of A
        # Each iteration handles 8 k steps; step k multiplies A element h[k] of
        # every row by 16 bytes of w (8 for columns 0-7, 8 for columns 8-15).
        # v4, v5 and v6 rotate as B registers: while one step is multiplied, the
        # next B data is fetched - the columns 8-15 half directly with LDR d,
        # the columns 0-7 half through x11 followed by INS - and then widened
        # with SXTL.  NOTE(review): the split between integer and vector loads
        # is presumably chosen to suit Cortex-A53 issue rules - confirm.
122        .p2align 3
1232:
        # k = 0: v4 -> columns 0-7, v6 -> columns 8-15
        # The PRFMs prefetch each A row 128 bytes ahead and w 448/512 bytes ahead.
124        SMLAL   v16.4s, v4.4h, v0.h[0]
125        SMLAL2  v20.4s, v4.8h, v0.h[0]
126        PRFM    PLDL1KEEP, [x13, 128]
127        SMLAL   v17.4s, v4.4h, v1.h[0]
128        SMLAL2  v21.4s, v4.8h, v1.h[0]
129        PRFM    PLDL1KEEP, [x14, 128]
130        SMLAL   v18.4s, v4.4h, v2.h[0]
131        SMLAL2  v22.4s, v4.8h, v2.h[0]
132        PRFM    PLDL1KEEP, [x15, 128]
133        SMLAL   v19.4s, v4.4h, v3.h[0]
134        SMLAL2  v23.4s, v4.8h, v3.h[0]
135        PRFM    PLDL1KEEP, [x20, 128]
136        LDR     d4, [x5, 24]
137        INS     v5.d[0], x11
138        SMLAL   v24.4s, v6.4h, v0.h[0]
139        SMLAL2  v28.4s, v6.8h, v0.h[0]
140        PRFM    PLDL1KEEP, [x5, 448]
141        SMLAL   v25.4s, v6.4h, v1.h[0]
142        SMLAL2  v29.4s, v6.8h, v1.h[0]
143        PRFM    PLDL1KEEP, [x5, 512]
144        SXTL    v5.8h, v5.8b
145        SMLAL   v26.4s, v6.4h, v2.h[0]
146        SMLAL2  v30.4s, v6.8h, v2.h[0]
147        SMLAL   v27.4s, v6.4h, v3.h[0]
148        SMLAL2  v31.4s, v6.8h, v3.h[0]
149        LDR     x11, [x5, 32]
        # k = 1: v5 -> columns 0-7, v4 -> columns 8-15
150        SMLAL   v16.4s, v5.4h, v0.h[1]
151        SMLAL2  v20.4s, v5.8h, v0.h[1]
152        SMLAL   v17.4s, v5.4h, v1.h[1]
153        SMLAL2  v21.4s, v5.8h, v1.h[1]
154        SXTL    v4.8h, v4.8b
155        SMLAL   v18.4s, v5.4h, v2.h[1]
156        SMLAL2  v22.4s, v5.8h, v2.h[1]
157        SMLAL   v19.4s, v5.4h, v3.h[1]
158        SMLAL2  v23.4s, v5.8h, v3.h[1]
159        LDR     d5, [x5, 40]
160        INS     v6.d[0], x11
161        SMLAL   v24.4s, v4.4h, v0.h[1]
162        SMLAL2  v28.4s, v4.8h, v0.h[1]
163        SMLAL   v25.4s, v4.4h, v1.h[1]
164        SMLAL2  v29.4s, v4.8h, v1.h[1]
165        SXTL    v6.8h, v6.8b
166        SMLAL   v26.4s, v4.4h, v2.h[1]
167        SMLAL2  v30.4s, v4.8h, v2.h[1]
168        SMLAL   v27.4s, v4.4h, v3.h[1]
169        SMLAL2  v31.4s, v4.8h, v3.h[1]
170        LDR     x11, [x5, 48]
        # k = 2: v6 -> columns 0-7, v5 -> columns 8-15
171        SMLAL   v16.4s, v6.4h, v0.h[2]
172        SMLAL2  v20.4s, v6.8h, v0.h[2]
173        SMLAL   v17.4s, v6.4h, v1.h[2]
174        SXTL    v5.8h, v5.8b
175        SMLAL2  v21.4s, v6.8h, v1.h[2]
176        SMLAL   v18.4s, v6.4h, v2.h[2]
177        SMLAL2  v22.4s, v6.8h, v2.h[2]
178        SMLAL   v19.4s, v6.4h, v3.h[2]
179        SMLAL2  v23.4s, v6.8h, v3.h[2]
180        LDR     d6, [x5, 56]
181        INS     v4.d[0], x11
182        SMLAL   v24.4s, v5.4h, v0.h[2]
183        SMLAL2  v28.4s, v5.8h, v0.h[2]
184        SMLAL   v25.4s, v5.4h, v1.h[2]
185        SMLAL2  v29.4s, v5.8h, v1.h[2]
186        SXTL    v4.8h, v4.8b
187        SMLAL   v26.4s, v5.4h, v2.h[2]
188        SMLAL2  v30.4s, v5.8h, v2.h[2]
189        SMLAL   v27.4s, v5.4h, v3.h[2]
190        SMLAL2  v31.4s, v5.8h, v3.h[2]
191        LDR     x11, [x5, 64]
        # k = 3: v4 -> columns 0-7, v6 -> columns 8-15
192        SMLAL   v16.4s, v4.4h, v0.h[3]
193        SMLAL2  v20.4s, v4.8h, v0.h[3]
194        SMLAL   v17.4s, v4.4h, v1.h[3]
195        SMLAL2  v21.4s, v4.8h, v1.h[3]
196        SXTL    v6.8h, v6.8b
197        SMLAL   v18.4s, v4.4h, v2.h[3]
198        SMLAL2  v22.4s, v4.8h, v2.h[3]
199        SMLAL   v19.4s, v4.4h, v3.h[3]
200        SMLAL2  v23.4s, v4.8h, v3.h[3]
201        LDR     d4, [x5, 72]
202        INS     v5.d[0], x11
203        SMLAL   v24.4s, v6.4h, v0.h[3]
204        SMLAL2  v28.4s, v6.8h, v0.h[3]
205        SXTL    v5.8h, v5.8b
206        SMLAL   v25.4s, v6.4h, v1.h[3]
207        SMLAL2  v29.4s, v6.8h, v1.h[3]
208        SMLAL   v26.4s, v6.4h, v2.h[3]
209        SMLAL2  v30.4s, v6.8h, v2.h[3]
210        SMLAL   v27.4s, v6.4h, v3.h[3]
211        SMLAL2  v31.4s, v6.8h, v3.h[3]
212        LDR     x11, [x5, 80]
        # k = 4: v5 -> columns 0-7, v4 -> columns 8-15
213        SMLAL   v16.4s, v5.4h, v0.h[4]
214        SMLAL2  v20.4s, v5.8h, v0.h[4]
215        SMLAL   v17.4s, v5.4h, v1.h[4]
216        SMLAL2  v21.4s, v5.8h, v1.h[4]
217        SXTL    v4.8h, v4.8b
218        SMLAL   v18.4s, v5.4h, v2.h[4]
219        SMLAL2  v22.4s, v5.8h, v2.h[4]
220        SMLAL   v19.4s, v5.4h, v3.h[4]
221        SMLAL2  v23.4s, v5.8h, v3.h[4]
222        LDR     d5, [x5, 88]
223        INS     v6.d[0], x11
224        SMLAL   v24.4s, v4.4h, v0.h[4]
225        SMLAL2  v28.4s, v4.8h, v0.h[4]
226        SMLAL   v25.4s, v4.4h, v1.h[4]
227        SMLAL2  v29.4s, v4.8h, v1.h[4]
228        SXTL    v6.8h, v6.8b
229        SMLAL   v26.4s, v4.4h, v2.h[4]
230        SMLAL2  v30.4s, v4.8h, v2.h[4]
231        SMLAL   v27.4s, v4.4h, v3.h[4]
232        SMLAL2  v31.4s, v4.8h, v3.h[4]
233        LDR     x11, [x5, 96]
        # k = 5: v6 -> columns 0-7, v5 -> columns 8-15
234        SMLAL   v16.4s, v6.4h, v0.h[5]
235        SMLAL2  v20.4s, v6.8h, v0.h[5]
236        SMLAL   v17.4s, v6.4h, v1.h[5]
237        SMLAL2  v21.4s, v6.8h, v1.h[5]
238        SXTL    v5.8h, v5.8b
239        SMLAL   v18.4s, v6.4h, v2.h[5]
240        SMLAL2  v22.4s, v6.8h, v2.h[5]
241        SMLAL   v19.4s, v6.4h, v3.h[5]
242        SMLAL2  v23.4s, v6.8h, v3.h[5]
243        LDR     d6, [x5, 104]
244        INS     v4.d[0], x11
245        SMLAL   v24.4s, v5.4h, v0.h[5]
246        SMLAL2  v28.4s, v5.8h, v0.h[5]
247        SMLAL   v25.4s, v5.4h, v1.h[5]
248        SMLAL2  v29.4s, v5.8h, v1.h[5]
249        SXTL    v4.8h, v4.8b
250        SMLAL   v26.4s, v5.4h, v2.h[5]
251        SMLAL2  v30.4s, v5.8h, v2.h[5]
252        SMLAL   v27.4s, v5.4h, v3.h[5]
253        SMLAL2  v31.4s, v5.8h, v3.h[5]
254        SXTL    v6.8h, v6.8b
255        LDR     x11, [x5, 112]
        # k = 6: v4 -> columns 0-7, v6 -> columns 8-15
256        SMLAL   v16.4s, v4.4h, v0.h[6]
257        SMLAL2  v20.4s, v4.8h, v0.h[6]
258        SMLAL   v17.4s, v4.4h, v1.h[6]
259        SMLAL2  v21.4s, v4.8h, v1.h[6]
260        SMLAL   v18.4s, v4.4h, v2.h[6]
261        SMLAL2  v22.4s, v4.8h, v2.h[6]
262        SMLAL   v19.4s, v4.4h, v3.h[6]
263        SMLAL2  v23.4s, v4.8h, v3.h[6]
264        LDR     d5, [x5, 120]
        # v4 is refilled immediately (columns 0-7 of k = 7) because v5 now
        # holds the columns 8-15 half of k = 7 and v6 is still in use for k = 6.
265        INS     v4.d[0], x11
266        SMLAL   v24.4s, v6.4h, v0.h[6]
267        SMLAL2  v28.4s, v6.8h, v0.h[6]
268        SMLAL   v25.4s, v6.4h, v1.h[6]
269        SMLAL2  v29.4s, v6.8h, v1.h[6]
270        SXTL    v4.8h, v4.8b
        # w advances past the 8 x 16 bytes consumed by this iteration; the
        # loads below already fetch B and A data for the next iteration.
271        ADD     x5, x5, 128
272
273        SMLAL   v26.4s, v6.4h, v2.h[6]
274        SMLAL2  v30.4s, v6.8h, v2.h[6]
275        LDR     x11, [x5]
276        SMLAL   v27.4s, v6.4h, v3.h[6]
277        SMLAL2  v31.4s, v6.8h, v3.h[6]
278        SXTL    v5.8h, v5.8b
279        LDR     x21, [x13], 8
280
        # k = 7: v4 -> columns 0-7, v5 -> columns 8-15
        # Next-iteration A rows are loaded as soon as their h[7] lane has been
        # consumed: a0 via x21 and a2 via x11 (INS), a1 and a3 via LDR d.
281        SMLAL   v16.4s, v4.4h, v0.h[7]
282        SMLAL2  v20.4s, v4.8h, v0.h[7]
283        SMLAL   v17.4s, v4.4h, v1.h[7]
284        SMLAL2  v21.4s, v4.8h, v1.h[7]
285        SMLAL   v18.4s, v4.4h, v2.h[7]
286        SMLAL2  v22.4s, v4.8h, v2.h[7]
287        SMLAL   v19.4s, v4.4h, v3.h[7]
288        SMLAL2  v23.4s, v4.8h, v3.h[7]
289        LDR     d6, [x5, 8]
290        INS     v4.d[0], x11
291        SMLAL   v24.4s, v5.4h, v0.h[7]
292        SMLAL2  v28.4s, v5.8h, v0.h[7]
293        LDR     x11, [x15], 8
294        SMLAL   v25.4s, v5.4h, v1.h[7]
295        SMLAL2  v29.4s, v5.8h, v1.h[7]
296        LDR     d1, [x14], 8
297        INS     v0.d[0], x21
298        SMLAL   v26.4s, v5.4h, v2.h[7]
299        SMLAL2  v30.4s, v5.8h, v2.h[7]
300        SMLAL   v27.4s, v5.4h, v3.h[7]
301        SMLAL2  v31.4s, v5.8h, v3.h[7]
302        LDR     d3, [x20], 8
303        INS     v2.d[0], x11
304
        # Re-establish the loop-entry state: A in v0-v3 and B in v4/v6 widened
        # to int16, x11 holding the columns 0-7 half of step k = 1.
305        SXTL    v0.8h, v0.8b
306        SXTL    v1.8h, v1.8b
307        LDR     x11, [x5, 16]
308        SXTL    v4.8h, v4.8b
309        SXTL    v2.8h, v2.8b
310        SUBS    x0, x0, 8
311        SXTL    v3.8h, v3.8b
312        SXTL    v6.8h, v6.8b
        # Loop again while the subtraction did not borrow, i.e. another full
        # 8-byte group remains in addition to the one already preloaded.
313        B.HS    2b
314
315        # Epilogue.  Same as main loop but no preloads in final group
        # Processes the last full 8-byte group, whose first B/A data is already
        # in registers (from the prologue or the previous loop iteration).
316
317        .p2align 3
3183:
        # k = 0: v4 -> columns 0-7, v6 -> columns 8-15
319        SMLAL   v16.4s, v4.4h, v0.h[0]
320        SMLAL2  v20.4s, v4.8h, v0.h[0]
321        SMLAL   v17.4s, v4.4h, v1.h[0]
322        SMLAL2  v21.4s, v4.8h, v1.h[0]
323        SMLAL   v18.4s, v4.4h, v2.h[0]
324        SMLAL2  v22.4s, v4.8h, v2.h[0]
325        SMLAL   v19.4s, v4.4h, v3.h[0]
326        SMLAL2  v23.4s, v4.8h, v3.h[0]
327        LDR     d4, [x5, 24]
328        INS     v5.d[0], x11
329        SMLAL   v24.4s, v6.4h, v0.h[0]
330        SMLAL2  v28.4s, v6.8h, v0.h[0]
331        SMLAL   v25.4s, v6.4h, v1.h[0]
332        SMLAL2  v29.4s, v6.8h, v1.h[0]
333        SXTL    v5.8h, v5.8b
334        SMLAL   v26.4s, v6.4h, v2.h[0]
335        SMLAL2  v30.4s, v6.8h, v2.h[0]
336        SMLAL   v27.4s, v6.4h, v3.h[0]
337        SMLAL2  v31.4s, v6.8h, v3.h[0]
338        LDR     x11, [x5, 32]
        # k = 1: v5 -> columns 0-7, v4 -> columns 8-15
339        SMLAL   v16.4s, v5.4h, v0.h[1]
340        SMLAL2  v20.4s, v5.8h, v0.h[1]
341        SMLAL   v17.4s, v5.4h, v1.h[1]
342        SMLAL2  v21.4s, v5.8h, v1.h[1]
343        SXTL    v4.8h, v4.8b
344        SMLAL   v18.4s, v5.4h, v2.h[1]
345        SMLAL2  v22.4s, v5.8h, v2.h[1]
346        SMLAL   v19.4s, v5.4h, v3.h[1]
347        SMLAL2  v23.4s, v5.8h, v3.h[1]
348        LDR     d5, [x5, 40]
349        INS     v6.d[0], x11
350        SMLAL   v24.4s, v4.4h, v0.h[1]
351        SMLAL2  v28.4s, v4.8h, v0.h[1]
352        SMLAL   v25.4s, v4.4h, v1.h[1]
353        SMLAL2  v29.4s, v4.8h, v1.h[1]
354        SXTL    v6.8h, v6.8b
355        SMLAL   v26.4s, v4.4h, v2.h[1]
356        SMLAL2  v30.4s, v4.8h, v2.h[1]
357        SMLAL   v27.4s, v4.4h, v3.h[1]
358        SMLAL2  v31.4s, v4.8h, v3.h[1]
359        LDR     x11, [x5, 48]
        # k = 2: v6 -> columns 0-7, v5 -> columns 8-15
360        SMLAL   v16.4s, v6.4h, v0.h[2]
361        SMLAL2  v20.4s, v6.8h, v0.h[2]
362        SMLAL   v17.4s, v6.4h, v1.h[2]
363        SXTL    v5.8h, v5.8b
364        SMLAL2  v21.4s, v6.8h, v1.h[2]
365        SMLAL   v18.4s, v6.4h, v2.h[2]
366        SMLAL2  v22.4s, v6.8h, v2.h[2]
367        SMLAL   v19.4s, v6.4h, v3.h[2]
368        SMLAL2  v23.4s, v6.8h, v3.h[2]
369        LDR     d6, [x5, 56]
370        INS     v4.d[0], x11
371        SMLAL   v24.4s, v5.4h, v0.h[2]
372        SMLAL2  v28.4s, v5.8h, v0.h[2]
373        SMLAL   v25.4s, v5.4h, v1.h[2]
374        SMLAL2  v29.4s, v5.8h, v1.h[2]
375        SXTL    v4.8h, v4.8b
376        SMLAL   v26.4s, v5.4h, v2.h[2]
377        SMLAL2  v30.4s, v5.8h, v2.h[2]
378        SMLAL   v27.4s, v5.4h, v3.h[2]
379        SMLAL2  v31.4s, v5.8h, v3.h[2]
380        LDR     x11, [x5, 64]
        # k = 3: v4 -> columns 0-7, v6 -> columns 8-15
381        SMLAL   v16.4s, v4.4h, v0.h[3]
382        SMLAL2  v20.4s, v4.8h, v0.h[3]
383        SMLAL   v17.4s, v4.4h, v1.h[3]
384        SMLAL2  v21.4s, v4.8h, v1.h[3]
385        SXTL    v6.8h, v6.8b
386        SMLAL   v18.4s, v4.4h, v2.h[3]
387        SMLAL2  v22.4s, v4.8h, v2.h[3]
388        SMLAL   v19.4s, v4.4h, v3.h[3]
389        SMLAL2  v23.4s, v4.8h, v3.h[3]
390        LDR     d4, [x5, 72]
391        INS     v5.d[0], x11
392        SMLAL   v24.4s, v6.4h, v0.h[3]
393        SMLAL2  v28.4s, v6.8h, v0.h[3]
394        SXTL    v5.8h, v5.8b
395        SMLAL   v25.4s, v6.4h, v1.h[3]
396        SMLAL2  v29.4s, v6.8h, v1.h[3]
397        SMLAL   v26.4s, v6.4h, v2.h[3]
398        SMLAL2  v30.4s, v6.8h, v2.h[3]
399        SMLAL   v27.4s, v6.4h, v3.h[3]
400        SMLAL2  v31.4s, v6.8h, v3.h[3]
401        LDR     x11, [x5, 80]
        # k = 4: v5 -> columns 0-7, v4 -> columns 8-15
402        SMLAL   v16.4s, v5.4h, v0.h[4]
403        SMLAL2  v20.4s, v5.8h, v0.h[4]
404        SMLAL   v17.4s, v5.4h, v1.h[4]
405        SMLAL2  v21.4s, v5.8h, v1.h[4]
406        SXTL    v4.8h, v4.8b
407        SMLAL   v18.4s, v5.4h, v2.h[4]
408        SMLAL2  v22.4s, v5.8h, v2.h[4]
409        SMLAL   v19.4s, v5.4h, v3.h[4]
410        SMLAL2  v23.4s, v5.8h, v3.h[4]
411        LDR     d5, [x5, 88]
412        INS     v6.d[0], x11
413        SMLAL   v24.4s, v4.4h, v0.h[4]
414        SMLAL2  v28.4s, v4.8h, v0.h[4]
415        SMLAL   v25.4s, v4.4h, v1.h[4]
416        SMLAL2  v29.4s, v4.8h, v1.h[4]
417        SXTL    v6.8h, v6.8b
418        SMLAL   v26.4s, v4.4h, v2.h[4]
419        SMLAL2  v30.4s, v4.8h, v2.h[4]
420        SMLAL   v27.4s, v4.4h, v3.h[4]
421        SMLAL2  v31.4s, v4.8h, v3.h[4]
422        LDR     x11, [x5, 96]
        # k = 5: v6 -> columns 0-7, v5 -> columns 8-15
423        SMLAL   v16.4s, v6.4h, v0.h[5]
424        SMLAL2  v20.4s, v6.8h, v0.h[5]
425        SMLAL   v17.4s, v6.4h, v1.h[5]
426        SMLAL2  v21.4s, v6.8h, v1.h[5]
427        SXTL    v5.8h, v5.8b
428        SMLAL   v18.4s, v6.4h, v2.h[5]
429        SMLAL2  v22.4s, v6.8h, v2.h[5]
430        SMLAL   v19.4s, v6.4h, v3.h[5]
431        SMLAL2  v23.4s, v6.8h, v3.h[5]
432        LDR     d6, [x5, 104]
433        INS     v4.d[0], x11
434        SMLAL   v24.4s, v5.4h, v0.h[5]
435        SMLAL2  v28.4s, v5.8h, v0.h[5]
436        SMLAL   v25.4s, v5.4h, v1.h[5]
437        SMLAL2  v29.4s, v5.8h, v1.h[5]
438        SXTL    v4.8h, v4.8b
439        SMLAL   v26.4s, v5.4h, v2.h[5]
440        SMLAL2  v30.4s, v5.8h, v2.h[5]
441        SMLAL   v27.4s, v5.4h, v3.h[5]
442        SMLAL2  v31.4s, v5.8h, v3.h[5]
443        SXTL    v6.8h, v6.8b
        # k = 6: v4 -> columns 0-7, v6 -> columns 8-15
444        SMLAL   v16.4s, v4.4h, v0.h[6]
445        SMLAL2  v20.4s, v4.8h, v0.h[6]
446        SMLAL   v17.4s, v4.4h, v1.h[6]
447        SMLAL2  v21.4s, v4.8h, v1.h[6]
448        SMLAL   v18.4s, v4.4h, v2.h[6]
449        SMLAL2  v22.4s, v4.8h, v2.h[6]
450        SMLAL   v19.4s, v4.4h, v3.h[6]
451        SMLAL2  v23.4s, v4.8h, v3.h[6]
452        LDR     x11, [x5, 112]
453        SMLAL   v24.4s, v6.4h, v0.h[6]
454        SMLAL2  v28.4s, v6.8h, v0.h[6]
455        SMLAL   v25.4s, v6.4h, v1.h[6]
456        SMLAL2  v29.4s, v6.8h, v1.h[6]
457        LDR     d5, [x5, 120]
458        INS     v4.d[0], x11
459        SXTL    v4.8h, v4.8b
460        SMLAL   v26.4s, v6.4h, v2.h[6]
461        SMLAL2  v30.4s, v6.8h, v2.h[6]
462        SMLAL   v27.4s, v6.4h, v3.h[6]
463        SMLAL2  v31.4s, v6.8h, v3.h[6]
        # k = 7: v4 -> columns 0-7, v5 -> columns 8-15 (no further preloads)
464        SMLAL   v16.4s, v4.4h, v0.h[7]
465        SMLAL2  v20.4s, v4.8h, v0.h[7]
466        SMLAL   v17.4s, v4.4h, v1.h[7]
467        SMLAL2  v21.4s, v4.8h, v1.h[7]
468        SXTL    v5.8h, v5.8b
469        SMLAL   v18.4s, v4.4h, v2.h[7]
470        SMLAL2  v22.4s, v4.8h, v2.h[7]
471        SMLAL   v19.4s, v4.4h, v3.h[7]
472        SMLAL2  v23.4s, v4.8h, v3.h[7]
473        ADD     x5, x5, 128
474        SMLAL   v24.4s, v5.4h, v0.h[7]
475        SMLAL2  v28.4s, v5.8h, v0.h[7]
476        SMLAL   v25.4s, v5.4h, v1.h[7]
477        SMLAL2  v29.4s, v5.8h, v1.h[7]
478        AND     x0, x2, 7               // kc remainder 0 to 7
479        SMLAL   v26.4s, v5.4h, v2.h[7]
480        SMLAL2  v30.4s, v5.8h, v2.h[7]
        # x11 was used as a B load temporary above; restore the params pointer.
        # Offset 40 = original [sp + 24] plus the 16 bytes pushed for x20/x21.
481        LDR     x11, [sp, 40]            // reload params pointer
482        SMLAL   v27.4s, v5.4h, v3.h[7]
483        SMLAL2  v31.4s, v5.8h, v3.h[7]
484
485        # Is there a remainder?- 1 to 7 bytes of A
486        CBNZ    x0, 5f
487
4884:
489        # ks loop
        # x9 counts the bytes of A pointers still to process; each pass
        # consumed 4 pointers of 8 bytes each.
490        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
491        B.HI    1b
492
493        # Apply params - preshift, scale, postshift, bias and clamp
        # The params block is read sequentially through x11: three 32-bit
        # values (broadcast into v4, v5, v6), one 16-bit value (v6) and two
        # 8-bit values (v4, v5).  The five post-indexed loads advance x11 by
        # 4 + 4 + 4 + 2 + 1 = 15 bytes, which the SUB further down undoes.
494        LD1R    {v4.4s}, [x11], 4
495        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
496        SQSHL   v17.4s, v17.4s, v4.4s
497        SQSHL   v18.4s, v18.4s, v4.4s
498        SQSHL   v19.4s, v19.4s, v4.4s
499        SQSHL   v20.4s, v20.4s, v4.4s
500        SQSHL   v21.4s, v21.4s, v4.4s
501        SQSHL   v22.4s, v22.4s, v4.4s
502        SQSHL   v23.4s, v23.4s, v4.4s
503        LD1R    {v5.4s}, [x11], 4
504        SQSHL   v24.4s, v24.4s, v4.4s
505        SQSHL   v25.4s, v25.4s, v4.4s
506        SQSHL   v26.4s, v26.4s, v4.4s
507        SQSHL   v27.4s, v27.4s, v4.4s
508        SQSHL   v28.4s, v28.4s, v4.4s
509        SQSHL   v29.4s, v29.4s, v4.4s
510        SQSHL   v30.4s, v30.4s, v4.4s
511        SQSHL   v31.4s, v31.4s, v4.4s
512        LD1R    {v6.4s}, [x11], 4
513        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
514        SQDMULH v17.4s, v17.4s, v5.4s
515        SQDMULH v18.4s, v18.4s, v5.4s
516        SQDMULH v19.4s, v19.4s, v5.4s
517        SQDMULH v20.4s, v20.4s, v5.4s
518        SQDMULH v21.4s, v21.4s, v5.4s
519        SQDMULH v22.4s, v22.4s, v5.4s
520        SQDMULH v23.4s, v23.4s, v5.4s
521        SQDMULH v24.4s, v24.4s, v5.4s
522        SQDMULH v25.4s, v25.4s, v5.4s
523        SQDMULH v26.4s, v26.4s, v5.4s
524        SQDMULH v27.4s, v27.4s, v5.4s
525        SQDMULH v28.4s, v28.4s, v5.4s
526        SQDMULH v29.4s, v29.4s, v5.4s
527        SQDMULH v30.4s, v30.4s, v5.4s
528        SQDMULH v31.4s, v31.4s, v5.4s
529        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
530        SRSHL   v17.4s, v17.4s, v6.4s
531        SRSHL   v18.4s, v18.4s, v6.4s
532        SRSHL   v19.4s, v19.4s, v6.4s
533        SRSHL   v20.4s, v20.4s, v6.4s
534        SRSHL   v21.4s, v21.4s, v6.4s
535        SRSHL   v22.4s, v22.4s, v6.4s
536        SRSHL   v23.4s, v23.4s, v6.4s
537        SRSHL   v24.4s, v24.4s, v6.4s
538        SRSHL   v25.4s, v25.4s, v6.4s
539        SRSHL   v26.4s, v26.4s, v6.4s
540        SRSHL   v27.4s, v27.4s, v6.4s
541        SRSHL   v28.4s, v28.4s, v6.4s
542        SRSHL   v29.4s, v29.4s, v6.4s
543        SRSHL   v30.4s, v30.4s, v6.4s
544        SRSHL   v31.4s, v31.4s, v6.4s
545
        # Saturating narrow int32 -> int16.  After SQXTN/SQXTN2, v16-v19 hold
        # columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
546        SQXTN   v16.4h, v16.4s
547        SQXTN   v17.4h, v17.4s
548        SQXTN   v18.4h, v18.4s
549        SQXTN   v19.4h, v19.4s
550        SQXTN   v24.4h, v24.4s
551        SQXTN   v25.4h, v25.4s
552        SQXTN   v26.4h, v26.4s
553        SQXTN   v27.4h, v27.4s
554        LD1R    {v6.8h}, [x11], 2        // add bias
555
556        SQXTN2  v16.8h, v20.4s
557        SQXTN2  v17.8h, v21.4s
558        SQXTN2  v18.8h, v22.4s
559        SQXTN2  v19.8h, v23.4s
560        SQXTN2  v24.8h, v28.4s
561        SQXTN2  v25.8h, v29.4s
562        SQXTN2  v26.8h, v30.4s
563        SQXTN2  v27.8h, v31.4s
564
        # Saturating add of the 16-bit bias loaded into v6 above.
565        SQADD   v16.8h, v16.8h, v6.8h
566        SQADD   v17.8h, v17.8h, v6.8h
567        SQADD   v18.8h, v18.8h, v6.8h
568        SQADD   v19.8h, v19.8h, v6.8h
569        SQADD   v24.8h, v24.8h, v6.8h
570        SQADD   v25.8h, v25.8h, v6.8h
571        SQADD   v26.8h, v26.8h, v6.8h
572        SQADD   v27.8h, v27.8h, v6.8h
573        LD1R    {v4.16b}, [x11], 1       // clamp min value
574
        # Saturating narrow int16 -> int8: v0-v3 become the 16 output bytes of
        # rows 0-3 (low half = columns 0-7, high half = columns 8-15).
575        SQXTN   v0.8b, v16.8h
576        SQXTN   v1.8b, v17.8h
577        SQXTN   v2.8b, v18.8h
578        SQXTN   v3.8b, v19.8h
579        LD1R    {v5.16b}, [x11]          // clamp max value
580        SQXTN2  v0.16b, v24.8h
581        SQXTN2  v1.16b, v25.8h
582        SQXTN2  v2.16b, v26.8h
583        SQXTN2  v3.16b, v27.8h
584        SUB     x11, x11, 15             // rewind params pointer
585
586        SMAX    v0.16b, v0.16b, v4.16b
587        SMAX    v1.16b, v1.16b, v4.16b
588        SMAX    v2.16b, v2.16b, v4.16b
589        SMAX    v3.16b, v3.16b, v4.16b
        # nc -= 16.  The flags set here are consumed twice: by B.LO (fewer than
        # 16 columns were left -> partial store) and by B.HI after the full
        # store; SMIN, ST1 and SUB in between leave the flags untouched.
590        SUBS    x1, x1, 16
591        SMIN    v0.16b, v0.16b, v5.16b
592        SMIN    v1.16b, v1.16b, v5.16b
593        SMIN    v2.16b, v2.16b, v5.16b
594        SMIN    v3.16b, v3.16b, v5.16b
595        B.LO    6f
596
597        # Store full 4 x 16
        # Each C pointer is post-incremented by cn_stride (x10).
598        ST1     {v3.16b},  [x7], x10
599        ST1     {v2.16b}, [x17], x10
600        ST1     {v1.16b}, [x16], x10
601        ST1     {v0.16b},  [x6], x10
602
        # Rewind the indirection pointer so the next column tile re-reads the
        # same A pointers.
603        SUB     x4, x4, x3              // a -= ks
604
605        # nc loop
606        B.HI    0b
607
608        # Restore x20-x21 from stack
609        LDP     x20, x21, [sp], 16
610        RET
611
612        # Remainder- 1 to 7 bytes of A
        # Each LD1 reads a full 8 bytes of A but advances the pointer by only
        # the remainder in x0; lanes beyond the remainder are never multiplied.
        # NOTE(review): this reads up to 7 bytes past the valid A data;
        # presumably callers guarantee that is readable - confirm.
        # w is consumed 16 bytes per k step (d4 = columns 0-7, d5 = columns
        # 8-15), and the code branches back to 4: after the last valid step.
613        .p2align 3
6145:
615        AND     x0, x2, 7               // kc remainder 1 to 7
616
617        LD1     {v0.8b}, [x13], x0
618        LDP     d4, d5, [x5], 16
619        LD1     {v1.8b}, [x14], x0
620        LD1     {v2.8b}, [x15], x0
621        LD1     {v3.8b}, [x20], x0
622        SXTL    v0.8h, v0.8b
623        SXTL    v4.8h, v4.8b
624        SXTL    v5.8h, v5.8b
625        SXTL    v1.8h, v1.8b
626        SXTL    v2.8h, v2.8b
627        SXTL    v3.8h, v3.8b
        # k = 0
628        SMLAL   v16.4s, v4.4h, v0.h[0]
629        SMLAL2  v20.4s, v4.8h, v0.h[0]
630        SMLAL   v24.4s, v5.4h, v0.h[0]
631        SMLAL2  v28.4s, v5.8h, v0.h[0]
632        SMLAL   v17.4s, v4.4h, v1.h[0]
633        SMLAL2  v21.4s, v4.8h, v1.h[0]
634        SMLAL   v25.4s, v5.4h, v1.h[0]
635        SMLAL2  v29.4s, v5.8h, v1.h[0]
636        SMLAL   v18.4s, v4.4h, v2.h[0]
637        SMLAL2  v22.4s, v4.8h, v2.h[0]
638        SMLAL   v26.4s, v5.4h, v2.h[0]
639        SMLAL2  v30.4s, v5.8h, v2.h[0]
640        SMLAL   v19.4s, v4.4h, v3.h[0]
641        SMLAL2  v23.4s, v4.8h, v3.h[0]
642        SMLAL   v27.4s, v5.4h, v3.h[0]
643        SMLAL2  v31.4s, v5.8h, v3.h[0]
        # One compare serves two exits: B.LO here (remainder == 1) and B.EQ
        # after the next step (remainder == 2).  The LDP, SXTL and SMLAL
        # instructions in between do not modify flags.  The same pattern is
        # repeated with 4 and 6 below.
644        CMP     x0, 2
645        B.LO    4b
646
        # k = 1
647        LDP     d4, d5, [x5], 16
648        SXTL    v4.8h, v4.8b
649        SXTL    v5.8h, v5.8b
650        SMLAL   v16.4s, v4.4h, v0.h[1]
651        SMLAL2  v20.4s, v4.8h, v0.h[1]
652        SMLAL   v24.4s, v5.4h, v0.h[1]
653        SMLAL2  v28.4s, v5.8h, v0.h[1]
654        SMLAL   v17.4s, v4.4h, v1.h[1]
655        SMLAL2  v21.4s, v4.8h, v1.h[1]
656        SMLAL   v25.4s, v5.4h, v1.h[1]
657        SMLAL2  v29.4s, v5.8h, v1.h[1]
658        SMLAL   v18.4s, v4.4h, v2.h[1]
659        SMLAL2  v22.4s, v4.8h, v2.h[1]
660        SMLAL   v26.4s, v5.4h, v2.h[1]
661        SMLAL2  v30.4s, v5.8h, v2.h[1]
662        SMLAL   v19.4s, v4.4h, v3.h[1]
663        SMLAL2  v23.4s, v4.8h, v3.h[1]
664        SMLAL   v27.4s, v5.4h, v3.h[1]
665        SMLAL2  v31.4s, v5.8h, v3.h[1]
666        B.EQ    4b
667
        # k = 2
668        LDP     d4, d5, [x5], 16
669        SXTL    v4.8h, v4.8b
670        SXTL    v5.8h, v5.8b
671        SMLAL   v16.4s, v4.4h, v0.h[2]
672        SMLAL2  v20.4s, v4.8h, v0.h[2]
673        SMLAL   v24.4s, v5.4h, v0.h[2]
674        SMLAL2  v28.4s, v5.8h, v0.h[2]
675        SMLAL   v17.4s, v4.4h, v1.h[2]
676        SMLAL2  v21.4s, v4.8h, v1.h[2]
677        SMLAL   v25.4s, v5.4h, v1.h[2]
678        SMLAL2  v29.4s, v5.8h, v1.h[2]
679        SMLAL   v18.4s, v4.4h, v2.h[2]
680        SMLAL2  v22.4s, v4.8h, v2.h[2]
681        SMLAL   v26.4s, v5.4h, v2.h[2]
682        SMLAL2  v30.4s, v5.8h, v2.h[2]
683        SMLAL   v19.4s, v4.4h, v3.h[2]
684        SMLAL2  v23.4s, v4.8h, v3.h[2]
685        SMLAL   v27.4s, v5.4h, v3.h[2]
686        SMLAL2  v31.4s, v5.8h, v3.h[2]
687        CMP     x0, 4
688        B.LO    4b
689
        # k = 3
690        LDP     d4, d5, [x5], 16
691        SXTL    v4.8h, v4.8b
692        SXTL    v5.8h, v5.8b
693        SMLAL   v16.4s, v4.4h, v0.h[3]
694        SMLAL2  v20.4s, v4.8h, v0.h[3]
695        SMLAL   v24.4s, v5.4h, v0.h[3]
696        SMLAL2  v28.4s, v5.8h, v0.h[3]
697        SMLAL   v17.4s, v4.4h, v1.h[3]
698        SMLAL2  v21.4s, v4.8h, v1.h[3]
699        SMLAL   v25.4s, v5.4h, v1.h[3]
700        SMLAL2  v29.4s, v5.8h, v1.h[3]
701        SMLAL   v18.4s, v4.4h, v2.h[3]
702        SMLAL2  v22.4s, v4.8h, v2.h[3]
703        SMLAL   v26.4s, v5.4h, v2.h[3]
704        SMLAL2  v30.4s, v5.8h, v2.h[3]
705        SMLAL   v19.4s, v4.4h, v3.h[3]
706        SMLAL2  v23.4s, v4.8h, v3.h[3]
707        SMLAL   v27.4s, v5.4h, v3.h[3]
708        SMLAL2  v31.4s, v5.8h, v3.h[3]
709        B.EQ    4b
710
        # k = 4
711        LDP     d4, d5, [x5], 16
712        SXTL    v4.8h, v4.8b
713        SXTL    v5.8h, v5.8b
714        SMLAL   v16.4s, v4.4h, v0.h[4]
715        SMLAL2  v20.4s, v4.8h, v0.h[4]
716        SMLAL   v24.4s, v5.4h, v0.h[4]
717        SMLAL2  v28.4s, v5.8h, v0.h[4]
718        SMLAL   v17.4s, v4.4h, v1.h[4]
719        SMLAL2  v21.4s, v4.8h, v1.h[4]
720        SMLAL   v25.4s, v5.4h, v1.h[4]
721        SMLAL2  v29.4s, v5.8h, v1.h[4]
722        SMLAL   v18.4s, v4.4h, v2.h[4]
723        SMLAL2  v22.4s, v4.8h, v2.h[4]
724        SMLAL   v26.4s, v5.4h, v2.h[4]
725        SMLAL2  v30.4s, v5.8h, v2.h[4]
726        SMLAL   v19.4s, v4.4h, v3.h[4]
727        SMLAL2  v23.4s, v4.8h, v3.h[4]
728        SMLAL   v27.4s, v5.4h, v3.h[4]
729        SMLAL2  v31.4s, v5.8h, v3.h[4]
730        CMP     x0, 6
731        B.LO    4b
732
        # k = 5
733        LDP     d4, d5, [x5], 16
734        SXTL    v4.8h, v4.8b
735        SXTL    v5.8h, v5.8b
736        SMLAL   v16.4s, v4.4h, v0.h[5]
737        SMLAL2  v20.4s, v4.8h, v0.h[5]
738        SMLAL   v24.4s, v5.4h, v0.h[5]
739        SMLAL2  v28.4s, v5.8h, v0.h[5]
740        SMLAL   v17.4s, v4.4h, v1.h[5]
741        SMLAL2  v21.4s, v4.8h, v1.h[5]
742        SMLAL   v25.4s, v5.4h, v1.h[5]
743        SMLAL2  v29.4s, v5.8h, v1.h[5]
744        SMLAL   v18.4s, v4.4h, v2.h[5]
745        SMLAL2  v22.4s, v4.8h, v2.h[5]
746        SMLAL   v26.4s, v5.4h, v2.h[5]
747        SMLAL2  v30.4s, v5.8h, v2.h[5]
748        SMLAL   v19.4s, v4.4h, v3.h[5]
749        SMLAL2  v23.4s, v4.8h, v3.h[5]
750        SMLAL   v27.4s, v5.4h, v3.h[5]
751        SMLAL2  v31.4s, v5.8h, v3.h[5]
752        B.EQ    4b
753
        # k = 6 (only reached when the remainder is 7)
754        LDP     d4, d5, [x5], 16
755        SXTL    v4.8h, v4.8b
756        SXTL    v5.8h, v5.8b
757        SMLAL   v16.4s, v4.4h, v0.h[6]
758        SMLAL2  v20.4s, v4.8h, v0.h[6]
759        SMLAL   v24.4s, v5.4h, v0.h[6]
760        SMLAL2  v28.4s, v5.8h, v0.h[6]
761        SMLAL   v17.4s, v4.4h, v1.h[6]
762        SMLAL2  v21.4s, v4.8h, v1.h[6]
763        SMLAL   v25.4s, v5.4h, v1.h[6]
764        SMLAL2  v29.4s, v5.8h, v1.h[6]
765        SMLAL   v18.4s, v4.4h, v2.h[6]
766        SMLAL2  v22.4s, v4.8h, v2.h[6]
767        SMLAL   v26.4s, v5.4h, v2.h[6]
768        SMLAL2  v30.4s, v5.8h, v2.h[6]
769        SMLAL   v19.4s, v4.4h, v3.h[6]
770        SMLAL2  v23.4s, v4.8h, v3.h[6]
771        SMLAL   v27.4s, v5.4h, v3.h[6]
772        SMLAL2  v31.4s, v5.8h, v3.h[6]
773        B       4b
774
775        # Store odd width
        # Reached with x1 = nc - 16 (negative).  Its low 4 bits equal the
        # number of columns left (1..15), so bits 3, 2, 1 and 0 select an
        # 8-, 4-, 2- and 1-byte store per row.  After each store the next
        # unwritten bytes are moved down to lane 0 with DUP.
776        .p2align 3
7776:
778        TBZ     x1, 3, 7f
779        STR     d3, [x7], 8
780        STR     d2, [x17], 8
781        DUP     d3, v3.d[1]
782        DUP     d2, v2.d[1]
783        STR     d1, [x16], 8
784        STR     d0, [x6], 8
785        DUP     d1, v1.d[1]
786        DUP     d0, v0.d[1]
7877:
788        TBZ     x1, 2, 8f
789        STR     s3, [x7], 4
790        STR     s2, [x17], 4
791        DUP     s3, v3.s[1]
792        DUP     s2, v2.s[1]
793        STR     s1, [x16], 4
794        STR     s0, [x6], 4
795        DUP     s1, v1.s[1]
796        DUP     s0, v0.s[1]
7978:
798        TBZ     x1, 1, 9f
799        STR     h3, [x7], 2
800        STR     h2, [x17], 2
801        DUP     h3, v3.h[1]
802        DUP     h2, v2.h[1]
803        STR     h1, [x16], 2
804        STR     h0, [x6], 2
805        DUP     h1, v1.h[1]
806        DUP     h0, v0.h[1]
8079:
808        TBZ     x1, 0, 10f
809        STR     b3, [x7]
810        STR     b2, [x17]
811        STR     b1, [x16]
812        STR     b0, [x6]
81310:
814        # Restore x20-x21 from stack
815        LDP     x20, x21, [sp], 16
816        RET
817
818END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
819
// On ELF targets, emit an empty .note.GNU-stack section.  By GNU toolchain
// convention this marks the object as not requiring an executable stack.
820#ifdef __ELF__
821.section ".note.GNU-stack","",%progbits
822#endif
823