1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t** restrict a, x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const xnn_qs8_conv_minmax_params* params [sp + 24] -> (x11, reloaded from the stack after x11 is used as a load temp)
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0
31# A1  x14  v1
32# A2  x15  v2
33# A3  x20  v3
34# B    x5  v4  v5  v6
35# C0   x6 v16 v20 v24 v28
36# C1  x16 v17 v21 v25 v29
37# C2  x17 v18 v22 v26 v30
38# C3   x7 v19 v23 v27 v31
39# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
40# x11, x21 temp for Cortex-A53 loads
41
42BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
        // Signed 8-bit indirect GEMM microkernel producing a tile of up to
        // 4 rows x 16 columns of int8 output per pass of the nc loop.
        //
        // Loop nest (numeric labels):
        //   0: nc loop  - one 16-column output tile per iteration
        //   1: ks loop  - 4 A pointers fetched from the indirection buffer
        //   2: main loop over kc, 8 bytes of each A row per iteration
        //   3: epilogue - final 8-byte group, no preload of the next group
        //   5: remainder - 1 to 7 trailing bytes of kc
        //   4: end of one ks step, then requantize, clamp and store
        //   6-10: store of a partial tile (fewer than 16 columns left)
        //
        // Accumulators: v16-v19 hold columns 0-3 of rows 0-3, v20-v23
        // columns 4-7, v24-v27 columns 8-11 and v28-v31 columns 12-15.
43
44        # Clamp C pointers
        // Rows at or beyond mr alias the previous row pointer, so their
        // stores land on a row that is valid for this call.
45        CMP     x0, 2                   // if mr < 2
46        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
47        ADD     x16, x6, x7             // c1 = c0 + cm_stride
48        CSEL    x16, x6,  x16, LO       //   c1 = c0
49
50        ADD     x17, x16, x7            // c2 = c1 + cm_stride
51        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
52                                        // if mr <= 2
53        CSEL    x17, x16, x17, LS       //   c2 = c1
54
55        CMP     x0, 4                   // if mr < 4
56        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack (sp moves down by 16)
57        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
58        CSEL    x7,  x17, x7, LO        //   c3 = c2
59
60
61        .p2align 3
620:
63        # Load initial bias from w into accumulators
        // 64 bytes (16 x int32) are read from w; row 0 receives them
        // directly and rows 1-3 are initialised by register copies.
64        LDP     q16, q20, [x5], 32
65        MOV     v17.16b, v16.16b
66        MOV     v18.16b, v16.16b
67        LDP     q24, q28, [x5], 32
68        MOV     v19.16b, v16.16b
69        MOV     v21.16b, v20.16b
70        MOV     v22.16b, v20.16b
71        MOV     v23.16b, v20.16b
72        MOV     v25.16b, v24.16b
73        MOV     v26.16b, v24.16b
74        MOV     v27.16b, v24.16b
75        MOV     v29.16b, v28.16b
76        MOV     v30.16b, v28.16b
77        MOV     v31.16b, v28.16b
78        MOV     x9, x3                  // p = ks
79
80        .p2align 3
811:
82        # Load next 4 A pointers
83        LDP     x13, x14, [x4], 16
84        LDP     x15, x20, [x4], 16
85
        // A pointer equal to the zero pointer is used unchanged; any
        // other pointer has a_offset added.
86        CMP     x13, x12                // if a0 == zero
87        ADD     x13, x13, x8            // a0 += a_offset
88        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
89        CMP     x14, x12                // if a1 == zero
90        ADD     x14, x14, x8            // a1 += a_offset
91        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
92        CMP     x15, x12                // if a2 == zero
93        ADD     x15, x15, x8            // a2 += a_offset
94        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
95        CMP     x20, x12                // if a3 == zero
96        ADD     x20, x20, x8            // a3 += a_offset
97        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset
98
99        # Is there at least 8 bytes for epilogue?
        // With kc < 8 only the remainder code at 5: runs. On that path
        // x11 still holds the params pointer loaded at function entry.
100        SUBS    x0, x2, 8               // k = kc - 8
101        B.LO    5f
102
103        # Prologue
        // Preload the first 8 bytes of every A row and the 16 weights of
        // the first k-step, widening int8 to int16. x11 receives the
        // next 8 weight bytes (first half of the second k-step).
104        LDR     d0, [x13], 8
105        LDP     d4, d6, [x5]
106        LDR     d1, [x14], 8
107        LDR     d2, [x15], 8
108        LDR     d3, [x20], 8
109        SXTL    v0.8h, v0.8b
110        LDR     x11, [x5, 16]
111        SXTL    v4.8h, v4.8b
112        SXTL    v1.8h, v1.8b
113        SXTL    v2.8h, v2.8b
114        SXTL    v3.8h, v3.8b
115        SXTL    v6.8h, v6.8b
116
117        SUBS    x0, x0, 8               // k = k - 8
118        # Is there at least 8 bytes for main loop?
119        B.LO    3f
120
121        # Main loop - 8 bytes of A
        // Each iteration consumes 8 bytes of every A row and 128 bytes
        // of w (8 k-steps x 16 int8 weights). For k-step j, lane h[j] of
        // v0-v3 is multiplied by the weights of columns 0-7 (accumulated
        // into v16-v23) and of columns 8-15 (accumulated into v24-v31).
        // Weight registers v4/v5/v6 rotate: the next 8 weights are loaded
        // and sign-extended while the current ones are being consumed.
        // 64-bit loads go either straight to a D register (LDR d) or via
        // a general register followed by INS (x11/x21, the Cortex-A53
        // load temporaries named in the register usage header).
122        .p2align 3
1232:
124        SMLAL   v16.4s, v4.4h, v0.h[0]
125        SMLAL2  v20.4s, v4.8h, v0.h[0]
126        SMLAL   v17.4s, v4.4h, v1.h[0]
127        SMLAL2  v21.4s, v4.8h, v1.h[0]
128        SMLAL   v18.4s, v4.4h, v2.h[0]
129        SMLAL2  v22.4s, v4.8h, v2.h[0]
130        SMLAL   v19.4s, v4.4h, v3.h[0]
131        SMLAL2  v23.4s, v4.8h, v3.h[0]
132        LDR     d4, [x5, 24]
133        INS     v5.d[0], x11
134        SMLAL   v24.4s, v6.4h, v0.h[0]
135        SMLAL2  v28.4s, v6.8h, v0.h[0]
136        SMLAL   v25.4s, v6.4h, v1.h[0]
137        SMLAL2  v29.4s, v6.8h, v1.h[0]
138        SXTL    v5.8h, v5.8b
139        SMLAL   v26.4s, v6.4h, v2.h[0]
140        SMLAL2  v30.4s, v6.8h, v2.h[0]
141        SMLAL   v27.4s, v6.4h, v3.h[0]
142        SMLAL2  v31.4s, v6.8h, v3.h[0]
143        LDR     x11, [x5, 32]
144        SMLAL   v16.4s, v5.4h, v0.h[1]
145        SMLAL2  v20.4s, v5.8h, v0.h[1]
146        SMLAL   v17.4s, v5.4h, v1.h[1]
147        SMLAL2  v21.4s, v5.8h, v1.h[1]
148        SXTL    v4.8h, v4.8b
149        SMLAL   v18.4s, v5.4h, v2.h[1]
150        SMLAL2  v22.4s, v5.8h, v2.h[1]
151        SMLAL   v19.4s, v5.4h, v3.h[1]
152        SMLAL2  v23.4s, v5.8h, v3.h[1]
153        LDR     d5, [x5, 40]
154        INS     v6.d[0], x11
155        SMLAL   v24.4s, v4.4h, v0.h[1]
156        SMLAL2  v28.4s, v4.8h, v0.h[1]
157        SMLAL   v25.4s, v4.4h, v1.h[1]
158        SMLAL2  v29.4s, v4.8h, v1.h[1]
159        SXTL    v6.8h, v6.8b
160        SMLAL   v26.4s, v4.4h, v2.h[1]
161        SMLAL2  v30.4s, v4.8h, v2.h[1]
162        SMLAL   v27.4s, v4.4h, v3.h[1]
163        SMLAL2  v31.4s, v4.8h, v3.h[1]
164        LDR     x11, [x5, 48]
165        SMLAL   v16.4s, v6.4h, v0.h[2]
166        SMLAL2  v20.4s, v6.8h, v0.h[2]
167        SMLAL   v17.4s, v6.4h, v1.h[2]
168        SXTL    v5.8h, v5.8b
169        SMLAL2  v21.4s, v6.8h, v1.h[2]
170        SMLAL   v18.4s, v6.4h, v2.h[2]
171        SMLAL2  v22.4s, v6.8h, v2.h[2]
172        SMLAL   v19.4s, v6.4h, v3.h[2]
173        SMLAL2  v23.4s, v6.8h, v3.h[2]
174        LDR     d6, [x5, 56]
175        INS     v4.d[0], x11
176        SMLAL   v24.4s, v5.4h, v0.h[2]
177        SMLAL2  v28.4s, v5.8h, v0.h[2]
178        SMLAL   v25.4s, v5.4h, v1.h[2]
179        SMLAL2  v29.4s, v5.8h, v1.h[2]
180        SXTL    v4.8h, v4.8b
181        SMLAL   v26.4s, v5.4h, v2.h[2]
182        SMLAL2  v30.4s, v5.8h, v2.h[2]
183        SMLAL   v27.4s, v5.4h, v3.h[2]
184        SMLAL2  v31.4s, v5.8h, v3.h[2]
185        LDR     x11, [x5, 64]
186        SMLAL   v16.4s, v4.4h, v0.h[3]
187        SMLAL2  v20.4s, v4.8h, v0.h[3]
188        SMLAL   v17.4s, v4.4h, v1.h[3]
189        SMLAL2  v21.4s, v4.8h, v1.h[3]
190        SXTL    v6.8h, v6.8b
191        SMLAL   v18.4s, v4.4h, v2.h[3]
192        SMLAL2  v22.4s, v4.8h, v2.h[3]
193        SMLAL   v19.4s, v4.4h, v3.h[3]
194        SMLAL2  v23.4s, v4.8h, v3.h[3]
195        LDR     d4, [x5, 72]
196        INS     v5.d[0], x11
197        SMLAL   v24.4s, v6.4h, v0.h[3]
198        SMLAL2  v28.4s, v6.8h, v0.h[3]
199        SXTL    v5.8h, v5.8b
200        SMLAL   v25.4s, v6.4h, v1.h[3]
201        SMLAL2  v29.4s, v6.8h, v1.h[3]
202        SMLAL   v26.4s, v6.4h, v2.h[3]
203        SMLAL2  v30.4s, v6.8h, v2.h[3]
204        SMLAL   v27.4s, v6.4h, v3.h[3]
205        SMLAL2  v31.4s, v6.8h, v3.h[3]
206        LDR     x11, [x5, 80]
207        SMLAL   v16.4s, v5.4h, v0.h[4]
208        SMLAL2  v20.4s, v5.8h, v0.h[4]
209        SMLAL   v17.4s, v5.4h, v1.h[4]
210        SMLAL2  v21.4s, v5.8h, v1.h[4]
211        SXTL    v4.8h, v4.8b
212        SMLAL   v18.4s, v5.4h, v2.h[4]
213        SMLAL2  v22.4s, v5.8h, v2.h[4]
214        SMLAL   v19.4s, v5.4h, v3.h[4]
215        SMLAL2  v23.4s, v5.8h, v3.h[4]
216        LDR     d5, [x5, 88]
217        INS     v6.d[0], x11
218        SMLAL   v24.4s, v4.4h, v0.h[4]
219        SMLAL2  v28.4s, v4.8h, v0.h[4]
220        SMLAL   v25.4s, v4.4h, v1.h[4]
221        SMLAL2  v29.4s, v4.8h, v1.h[4]
222        SXTL    v6.8h, v6.8b
223        SMLAL   v26.4s, v4.4h, v2.h[4]
224        SMLAL2  v30.4s, v4.8h, v2.h[4]
225        SMLAL   v27.4s, v4.4h, v3.h[4]
226        SMLAL2  v31.4s, v4.8h, v3.h[4]
227        LDR     x11, [x5, 96]
228        SMLAL   v16.4s, v6.4h, v0.h[5]
229        SMLAL2  v20.4s, v6.8h, v0.h[5]
230        SMLAL   v17.4s, v6.4h, v1.h[5]
231        SMLAL2  v21.4s, v6.8h, v1.h[5]
232        SXTL    v5.8h, v5.8b
233        SMLAL   v18.4s, v6.4h, v2.h[5]
234        SMLAL2  v22.4s, v6.8h, v2.h[5]
235        SMLAL   v19.4s, v6.4h, v3.h[5]
236        SMLAL2  v23.4s, v6.8h, v3.h[5]
237        LDR     d6, [x5, 104]
238        INS     v4.d[0], x11
239        SMLAL   v24.4s, v5.4h, v0.h[5]
240        SMLAL2  v28.4s, v5.8h, v0.h[5]
241        SMLAL   v25.4s, v5.4h, v1.h[5]
242        SMLAL2  v29.4s, v5.8h, v1.h[5]
243        SXTL    v4.8h, v4.8b
244        SMLAL   v26.4s, v5.4h, v2.h[5]
245        SMLAL2  v30.4s, v5.8h, v2.h[5]
246        SMLAL   v27.4s, v5.4h, v3.h[5]
247        SMLAL2  v31.4s, v5.8h, v3.h[5]
248        SXTL    v6.8h, v6.8b
249        LDR     x11, [x5, 112]
250        SMLAL   v16.4s, v4.4h, v0.h[6]
251        SMLAL2  v20.4s, v4.8h, v0.h[6]
252        SMLAL   v17.4s, v4.4h, v1.h[6]
253        SMLAL2  v21.4s, v4.8h, v1.h[6]
254        SMLAL   v18.4s, v4.4h, v2.h[6]
255        SMLAL2  v22.4s, v4.8h, v2.h[6]
256        SMLAL   v19.4s, v4.4h, v3.h[6]
257        SMLAL2  v23.4s, v4.8h, v3.h[6]
258        LDR     d5, [x5, 120]
259        INS     v4.d[0], x11
260        SMLAL   v24.4s, v6.4h, v0.h[6]
261        SMLAL2  v28.4s, v6.8h, v0.h[6]
262        SMLAL   v25.4s, v6.4h, v1.h[6]
263        SMLAL2  v29.4s, v6.8h, v1.h[6]
264        SXTL    v4.8h, v4.8b
265        ADD     x5, x5, 128             // w advances past this 8 x 16 group
266
        // From here on the loads fetch data for the NEXT iteration:
        // weights relative to the advanced x5 and 8 new bytes of each A
        // row. Each A register is overwritten only after its last h[7]
        // use below.
267        SMLAL   v26.4s, v6.4h, v2.h[6]
268        SMLAL2  v30.4s, v6.8h, v2.h[6]
269        LDR     x11, [x5]
270        SMLAL   v27.4s, v6.4h, v3.h[6]
271        SMLAL2  v31.4s, v6.8h, v3.h[6]
272        SXTL    v5.8h, v5.8b
273        LDR     x21, [x13], 8
274
275        SMLAL   v16.4s, v4.4h, v0.h[7]
276        SMLAL2  v20.4s, v4.8h, v0.h[7]
277        SMLAL   v17.4s, v4.4h, v1.h[7]
278        SMLAL2  v21.4s, v4.8h, v1.h[7]
279        SMLAL   v18.4s, v4.4h, v2.h[7]
280        SMLAL2  v22.4s, v4.8h, v2.h[7]
281        SMLAL   v19.4s, v4.4h, v3.h[7]
282        SMLAL2  v23.4s, v4.8h, v3.h[7]
283        LDR     d6, [x5, 8]
284        INS     v4.d[0], x11
285        SMLAL   v24.4s, v5.4h, v0.h[7]
286        SMLAL2  v28.4s, v5.8h, v0.h[7]
287        LDR     x11, [x15], 8
288        SMLAL   v25.4s, v5.4h, v1.h[7]
289        SMLAL2  v29.4s, v5.8h, v1.h[7]
290        LDR     d1, [x14], 8
291        INS     v0.d[0], x21
292        SMLAL   v26.4s, v5.4h, v2.h[7]
293        SMLAL2  v30.4s, v5.8h, v2.h[7]
294        SMLAL   v27.4s, v5.4h, v3.h[7]
295        SMLAL2  v31.4s, v5.8h, v3.h[7]
296        LDR     d3, [x20], 8
297        INS     v2.d[0], x11
298
299        SXTL    v0.8h, v0.8b
300        SXTL    v1.8h, v1.8b
301        LDR     x11, [x5, 16]
302        SXTL    v4.8h, v4.8b
303        SXTL    v2.8h, v2.8b
304        SUBS    x0, x0, 8               // k -= 8; loop while no borrow
305        SXTL    v3.8h, v3.8b
306        SXTL    v6.8h, v6.8b
307        B.HS    2b
308
309        # Epilogue.  Same as main loop but no preloads in final group
        // Processes the 8-byte group already held in v0-v3/v4/v6/x11.
        // x11 is used as a load temporary up to the last weight load and
        // is then reloaded with the params pointer from the stack.
310
311        .p2align 3
3123:
313        SMLAL   v16.4s, v4.4h, v0.h[0]
314        SMLAL2  v20.4s, v4.8h, v0.h[0]
315        SMLAL   v17.4s, v4.4h, v1.h[0]
316        SMLAL2  v21.4s, v4.8h, v1.h[0]
317        SMLAL   v18.4s, v4.4h, v2.h[0]
318        SMLAL2  v22.4s, v4.8h, v2.h[0]
319        SMLAL   v19.4s, v4.4h, v3.h[0]
320        SMLAL2  v23.4s, v4.8h, v3.h[0]
321        LDR     d4, [x5, 24]
322        INS     v5.d[0], x11
323        SMLAL   v24.4s, v6.4h, v0.h[0]
324        SMLAL2  v28.4s, v6.8h, v0.h[0]
325        SMLAL   v25.4s, v6.4h, v1.h[0]
326        SMLAL2  v29.4s, v6.8h, v1.h[0]
327        SXTL    v5.8h, v5.8b
328        SMLAL   v26.4s, v6.4h, v2.h[0]
329        SMLAL2  v30.4s, v6.8h, v2.h[0]
330        SMLAL   v27.4s, v6.4h, v3.h[0]
331        SMLAL2  v31.4s, v6.8h, v3.h[0]
332        LDR     x11, [x5, 32]
333        SMLAL   v16.4s, v5.4h, v0.h[1]
334        SMLAL2  v20.4s, v5.8h, v0.h[1]
335        SMLAL   v17.4s, v5.4h, v1.h[1]
336        SMLAL2  v21.4s, v5.8h, v1.h[1]
337        SXTL    v4.8h, v4.8b
338        SMLAL   v18.4s, v5.4h, v2.h[1]
339        SMLAL2  v22.4s, v5.8h, v2.h[1]
340        SMLAL   v19.4s, v5.4h, v3.h[1]
341        SMLAL2  v23.4s, v5.8h, v3.h[1]
342        LDR     d5, [x5, 40]
343        INS     v6.d[0], x11
344        SMLAL   v24.4s, v4.4h, v0.h[1]
345        SMLAL2  v28.4s, v4.8h, v0.h[1]
346        SMLAL   v25.4s, v4.4h, v1.h[1]
347        SMLAL2  v29.4s, v4.8h, v1.h[1]
348        SXTL    v6.8h, v6.8b
349        SMLAL   v26.4s, v4.4h, v2.h[1]
350        SMLAL2  v30.4s, v4.8h, v2.h[1]
351        SMLAL   v27.4s, v4.4h, v3.h[1]
352        SMLAL2  v31.4s, v4.8h, v3.h[1]
353        LDR     x11, [x5, 48]
354        SMLAL   v16.4s, v6.4h, v0.h[2]
355        SMLAL2  v20.4s, v6.8h, v0.h[2]
356        SMLAL   v17.4s, v6.4h, v1.h[2]
357        SXTL    v5.8h, v5.8b
358        SMLAL2  v21.4s, v6.8h, v1.h[2]
359        SMLAL   v18.4s, v6.4h, v2.h[2]
360        SMLAL2  v22.4s, v6.8h, v2.h[2]
361        SMLAL   v19.4s, v6.4h, v3.h[2]
362        SMLAL2  v23.4s, v6.8h, v3.h[2]
363        LDR     d6, [x5, 56]
364        INS     v4.d[0], x11
365        SMLAL   v24.4s, v5.4h, v0.h[2]
366        SMLAL2  v28.4s, v5.8h, v0.h[2]
367        SMLAL   v25.4s, v5.4h, v1.h[2]
368        SMLAL2  v29.4s, v5.8h, v1.h[2]
369        SXTL    v4.8h, v4.8b
370        SMLAL   v26.4s, v5.4h, v2.h[2]
371        SMLAL2  v30.4s, v5.8h, v2.h[2]
372        SMLAL   v27.4s, v5.4h, v3.h[2]
373        SMLAL2  v31.4s, v5.8h, v3.h[2]
374        LDR     x11, [x5, 64]
375        SMLAL   v16.4s, v4.4h, v0.h[3]
376        SMLAL2  v20.4s, v4.8h, v0.h[3]
377        SMLAL   v17.4s, v4.4h, v1.h[3]
378        SMLAL2  v21.4s, v4.8h, v1.h[3]
379        SXTL    v6.8h, v6.8b
380        SMLAL   v18.4s, v4.4h, v2.h[3]
381        SMLAL2  v22.4s, v4.8h, v2.h[3]
382        SMLAL   v19.4s, v4.4h, v3.h[3]
383        SMLAL2  v23.4s, v4.8h, v3.h[3]
384        LDR     d4, [x5, 72]
385        INS     v5.d[0], x11
386        SMLAL   v24.4s, v6.4h, v0.h[3]
387        SMLAL2  v28.4s, v6.8h, v0.h[3]
388        SXTL    v5.8h, v5.8b
389        SMLAL   v25.4s, v6.4h, v1.h[3]
390        SMLAL2  v29.4s, v6.8h, v1.h[3]
391        SMLAL   v26.4s, v6.4h, v2.h[3]
392        SMLAL2  v30.4s, v6.8h, v2.h[3]
393        SMLAL   v27.4s, v6.4h, v3.h[3]
394        SMLAL2  v31.4s, v6.8h, v3.h[3]
395        LDR     x11, [x5, 80]
396        SMLAL   v16.4s, v5.4h, v0.h[4]
397        SMLAL2  v20.4s, v5.8h, v0.h[4]
398        SMLAL   v17.4s, v5.4h, v1.h[4]
399        SMLAL2  v21.4s, v5.8h, v1.h[4]
400        SXTL    v4.8h, v4.8b
401        SMLAL   v18.4s, v5.4h, v2.h[4]
402        SMLAL2  v22.4s, v5.8h, v2.h[4]
403        SMLAL   v19.4s, v5.4h, v3.h[4]
404        SMLAL2  v23.4s, v5.8h, v3.h[4]
405        LDR     d5, [x5, 88]
406        INS     v6.d[0], x11
407        SMLAL   v24.4s, v4.4h, v0.h[4]
408        SMLAL2  v28.4s, v4.8h, v0.h[4]
409        SMLAL   v25.4s, v4.4h, v1.h[4]
410        SMLAL2  v29.4s, v4.8h, v1.h[4]
411        SXTL    v6.8h, v6.8b
412        SMLAL   v26.4s, v4.4h, v2.h[4]
413        SMLAL2  v30.4s, v4.8h, v2.h[4]
414        SMLAL   v27.4s, v4.4h, v3.h[4]
415        SMLAL2  v31.4s, v4.8h, v3.h[4]
416        LDR     x11, [x5, 96]
417        SMLAL   v16.4s, v6.4h, v0.h[5]
418        SMLAL2  v20.4s, v6.8h, v0.h[5]
419        SMLAL   v17.4s, v6.4h, v1.h[5]
420        SMLAL2  v21.4s, v6.8h, v1.h[5]
421        SXTL    v5.8h, v5.8b
422        SMLAL   v18.4s, v6.4h, v2.h[5]
423        SMLAL2  v22.4s, v6.8h, v2.h[5]
424        SMLAL   v19.4s, v6.4h, v3.h[5]
425        SMLAL2  v23.4s, v6.8h, v3.h[5]
426        LDR     d6, [x5, 104]
427        INS     v4.d[0], x11
428        SMLAL   v24.4s, v5.4h, v0.h[5]
429        SMLAL2  v28.4s, v5.8h, v0.h[5]
430        SMLAL   v25.4s, v5.4h, v1.h[5]
431        SMLAL2  v29.4s, v5.8h, v1.h[5]
432        SXTL    v4.8h, v4.8b
433        SMLAL   v26.4s, v5.4h, v2.h[5]
434        SMLAL2  v30.4s, v5.8h, v2.h[5]
435        SMLAL   v27.4s, v5.4h, v3.h[5]
436        SMLAL2  v31.4s, v5.8h, v3.h[5]
437        SXTL    v6.8h, v6.8b
438        SMLAL   v16.4s, v4.4h, v0.h[6]
439        SMLAL2  v20.4s, v4.8h, v0.h[6]
440        SMLAL   v17.4s, v4.4h, v1.h[6]
441        SMLAL2  v21.4s, v4.8h, v1.h[6]
442        SMLAL   v18.4s, v4.4h, v2.h[6]
443        SMLAL2  v22.4s, v4.8h, v2.h[6]
444        SMLAL   v19.4s, v4.4h, v3.h[6]
445        SMLAL2  v23.4s, v4.8h, v3.h[6]
446        LDR     x11, [x5, 112]
447        SMLAL   v24.4s, v6.4h, v0.h[6]
448        SMLAL2  v28.4s, v6.8h, v0.h[6]
449        SMLAL   v25.4s, v6.4h, v1.h[6]
450        SMLAL2  v29.4s, v6.8h, v1.h[6]
451        LDR     d5, [x5, 120]
452        INS     v4.d[0], x11
453        SXTL    v4.8h, v4.8b
454        SMLAL   v26.4s, v6.4h, v2.h[6]
455        SMLAL2  v30.4s, v6.8h, v2.h[6]
456        SMLAL   v27.4s, v6.4h, v3.h[6]
457        SMLAL2  v31.4s, v6.8h, v3.h[6]
458        SMLAL   v16.4s, v4.4h, v0.h[7]
459        SMLAL2  v20.4s, v4.8h, v0.h[7]
460        SMLAL   v17.4s, v4.4h, v1.h[7]
461        SMLAL2  v21.4s, v4.8h, v1.h[7]
462        SXTL    v5.8h, v5.8b
463        SMLAL   v18.4s, v4.4h, v2.h[7]
464        SMLAL2  v22.4s, v4.8h, v2.h[7]
465        SMLAL   v19.4s, v4.4h, v3.h[7]
466        SMLAL2  v23.4s, v4.8h, v3.h[7]
467        ADD     x5, x5, 128
468        SMLAL   v24.4s, v5.4h, v0.h[7]
469        SMLAL2  v28.4s, v5.8h, v0.h[7]
470        SMLAL   v25.4s, v5.4h, v1.h[7]
471        SMLAL2  v29.4s, v5.8h, v1.h[7]
472        AND     x0, x2, 7               // kc remainder 0 to 7
473        SMLAL   v26.4s, v5.4h, v2.h[7]
474        SMLAL2  v30.4s, v5.8h, v2.h[7]
475        LDR     x11, [sp, 40]            // reload params pointer: entry [sp + 24] plus the 16 bytes pushed for x20-x21
476        SMLAL   v27.4s, v5.4h, v3.h[7]
477        SMLAL2  v31.4s, v5.8h, v3.h[7]
478
479        # Is there a remainder?- 1 to 7 bytes of A
480        CBNZ    x0, 5f
481
4824:
483        # ks loop
        // Each ks step consumed 4 pointers of 8 bytes from the
        // indirection buffer, hence the decrement by 32.
484        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
485        B.HI    1b
486
487        # Apply params - preshift, scale, postshift, bias and clamp
        // The params block is read sequentially through x11: three
        // 32-bit values (pre-shift, multiplier, post-shift), one 16-bit
        // bias, then the 8-bit min and max. Post-increments total 15
        // bytes, which the SUB further down undoes so that x11 is valid
        // again for the next nc iteration.
488        LD1R    {v4.4s}, [x11], 4
489        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
490        SQSHL   v17.4s, v17.4s, v4.4s
491        SQSHL   v18.4s, v18.4s, v4.4s
492        SQSHL   v19.4s, v19.4s, v4.4s
493        SQSHL   v20.4s, v20.4s, v4.4s
494        SQSHL   v21.4s, v21.4s, v4.4s
495        SQSHL   v22.4s, v22.4s, v4.4s
496        SQSHL   v23.4s, v23.4s, v4.4s
497        LD1R    {v5.4s}, [x11], 4
498        SQSHL   v24.4s, v24.4s, v4.4s
499        SQSHL   v25.4s, v25.4s, v4.4s
500        SQSHL   v26.4s, v26.4s, v4.4s
501        SQSHL   v27.4s, v27.4s, v4.4s
502        SQSHL   v28.4s, v28.4s, v4.4s
503        SQSHL   v29.4s, v29.4s, v4.4s
504        SQSHL   v30.4s, v30.4s, v4.4s
505        SQSHL   v31.4s, v31.4s, v4.4s
506        LD1R    {v6.4s}, [x11], 4
507        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
508        SQDMULH v17.4s, v17.4s, v5.4s
509        SQDMULH v18.4s, v18.4s, v5.4s
510        SQDMULH v19.4s, v19.4s, v5.4s
511        SQDMULH v20.4s, v20.4s, v5.4s
512        SQDMULH v21.4s, v21.4s, v5.4s
513        SQDMULH v22.4s, v22.4s, v5.4s
514        SQDMULH v23.4s, v23.4s, v5.4s
515        SQDMULH v24.4s, v24.4s, v5.4s
516        SQDMULH v25.4s, v25.4s, v5.4s
517        SQDMULH v26.4s, v26.4s, v5.4s
518        SQDMULH v27.4s, v27.4s, v5.4s
519        SQDMULH v28.4s, v28.4s, v5.4s
520        SQDMULH v29.4s, v29.4s, v5.4s
521        SQDMULH v30.4s, v30.4s, v5.4s
522        SQDMULH v31.4s, v31.4s, v5.4s
523        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
524        SRSHL   v17.4s, v17.4s, v6.4s
525        SRSHL   v18.4s, v18.4s, v6.4s
526        SRSHL   v19.4s, v19.4s, v6.4s
527        SRSHL   v20.4s, v20.4s, v6.4s
528        SRSHL   v21.4s, v21.4s, v6.4s
529        SRSHL   v22.4s, v22.4s, v6.4s
530        SRSHL   v23.4s, v23.4s, v6.4s
531        SRSHL   v24.4s, v24.4s, v6.4s
532        SRSHL   v25.4s, v25.4s, v6.4s
533        SRSHL   v26.4s, v26.4s, v6.4s
534        SRSHL   v27.4s, v27.4s, v6.4s
535        SRSHL   v28.4s, v28.4s, v6.4s
536        SRSHL   v29.4s, v29.4s, v6.4s
537        SRSHL   v30.4s, v30.4s, v6.4s
538        SRSHL   v31.4s, v31.4s, v6.4s
539
        // Saturating narrow int32 -> int16: v16-v19 end up holding
        // columns 0-7 and v24-v27 columns 8-15 of rows 0-3.
540        SQXTN   v16.4h, v16.4s
541        SQXTN   v17.4h, v17.4s
542        SQXTN   v18.4h, v18.4s
543        SQXTN   v19.4h, v19.4s
544        SQXTN   v24.4h, v24.4s
545        SQXTN   v25.4h, v25.4s
546        SQXTN   v26.4h, v26.4s
547        SQXTN   v27.4h, v27.4s
548        LD1R    {v6.8h}, [x11], 2        // 16-bit bias, added with saturation below
549
550        SQXTN2  v16.8h, v20.4s
551        SQXTN2  v17.8h, v21.4s
552        SQXTN2  v18.8h, v22.4s
553        SQXTN2  v19.8h, v23.4s
554        SQXTN2  v24.8h, v28.4s
555        SQXTN2  v25.8h, v29.4s
556        SQXTN2  v26.8h, v30.4s
557        SQXTN2  v27.8h, v31.4s
558
559        SQADD   v16.8h, v16.8h, v6.8h
560        SQADD   v17.8h, v17.8h, v6.8h
561        SQADD   v18.8h, v18.8h, v6.8h
562        SQADD   v19.8h, v19.8h, v6.8h
563        SQADD   v24.8h, v24.8h, v6.8h
564        SQADD   v25.8h, v25.8h, v6.8h
565        SQADD   v26.8h, v26.8h, v6.8h
566        SQADD   v27.8h, v27.8h, v6.8h
567        LD1R    {v4.16b}, [x11], 1       // clamp min value
568
        // Saturating narrow int16 -> int8: v0-v3 hold the 16 output
        // bytes of rows 0-3.
569        SQXTN   v0.8b, v16.8h
570        SQXTN   v1.8b, v17.8h
571        SQXTN   v2.8b, v18.8h
572        SQXTN   v3.8b, v19.8h
573        LD1R    {v5.16b}, [x11]          // clamp max value
574        SQXTN2  v0.16b, v24.8h
575        SQXTN2  v1.16b, v25.8h
576        SQXTN2  v2.16b, v26.8h
577        SQXTN2  v3.16b, v27.8h
578        SUB     x11, x11, 15             // rewind params pointer (4 + 4 + 4 + 2 + 1)
579
580        SMAX    v0.16b, v0.16b, v4.16b
581        SMAX    v1.16b, v1.16b, v4.16b
582        SMAX    v2.16b, v2.16b, v4.16b
583        SMAX    v3.16b, v3.16b, v4.16b
584        SUBS    x1, x1, 16              // nc -= 16; flags are consumed by B.LO and B.HI below
585        SMIN    v0.16b, v0.16b, v5.16b
586        SMIN    v1.16b, v1.16b, v5.16b
587        SMIN    v2.16b, v2.16b, v5.16b
588        SMIN    v3.16b, v3.16b, v5.16b
589        B.LO    6f
590
591        # Store full 4 x 16
        // Each row pointer advances by cn_stride (x10) after its store.
592        ST1     {v3.16b},  [x7], x10
593        ST1     {v2.16b}, [x17], x10
594        ST1     {v1.16b}, [x16], x10
595        ST1     {v0.16b},  [x6], x10
596
597        SUB     x4, x4, x3              // a -= ks
598
599        # nc loop
        // Flags still come from the SUBS on x1 above: the NEON
        // instructions, the stores and the plain SUB do not modify them.
600        B.HI    0b
601
602        # Restore x20-x21 from stack
603        LDP     x20, x21, [sp], 16
604        RET
605
606        # Remainder- 1 to 7 bytes of A
        // Reached either directly from the ks loop head (kc < 8) or from
        // the epilogue (kc & 7 != 0). One k-step (16 weights) is
        // processed per group and the code branches back to 4: as soon
        // as x0 k-steps are done.
        // Each LD1 reads a full 8 bytes but advances its pointer by only
        // x0 bytes.
        // NOTE(review): this reads up to 7 bytes past the valid A data;
        // presumably the caller guarantees that such reads are safe -
        // confirm.
607        .p2align 3
6085:
609        AND     x0, x2, 7               // kc remainder 1 to 7
610
611        LD1     {v0.8b}, [x13], x0
612        LDP     d4, d5, [x5], 16
613        LD1     {v1.8b}, [x14], x0
614        LD1     {v2.8b}, [x15], x0
615        LD1     {v3.8b}, [x20], x0
616        SXTL    v0.8h, v0.8b
617        SXTL    v4.8h, v4.8b
618        SXTL    v5.8h, v5.8b
619        SXTL    v1.8h, v1.8b
620        SXTL    v2.8h, v2.8b
621        SXTL    v3.8h, v3.8b
622        SMLAL   v16.4s, v4.4h, v0.h[0]
623        SMLAL2  v20.4s, v4.8h, v0.h[0]
624        SMLAL   v24.4s, v5.4h, v0.h[0]
625        SMLAL2  v28.4s, v5.8h, v0.h[0]
626        SMLAL   v17.4s, v4.4h, v1.h[0]
627        SMLAL2  v21.4s, v4.8h, v1.h[0]
628        SMLAL   v25.4s, v5.4h, v1.h[0]
629        SMLAL2  v29.4s, v5.8h, v1.h[0]
630        SMLAL   v18.4s, v4.4h, v2.h[0]
631        SMLAL2  v22.4s, v4.8h, v2.h[0]
632        SMLAL   v26.4s, v5.4h, v2.h[0]
633        SMLAL2  v30.4s, v5.8h, v2.h[0]
634        SMLAL   v19.4s, v4.4h, v3.h[0]
635        SMLAL2  v23.4s, v4.8h, v3.h[0]
636        SMLAL   v27.4s, v5.4h, v3.h[0]
637        SMLAL2  v31.4s, v5.8h, v3.h[0]
638        CMP     x0, 2
639        B.LO    4b                      // remainder == 1: done
640
641        LDP     d4, d5, [x5], 16
642        SXTL    v4.8h, v4.8b
643        SXTL    v5.8h, v5.8b
644        SMLAL   v16.4s, v4.4h, v0.h[1]
645        SMLAL2  v20.4s, v4.8h, v0.h[1]
646        SMLAL   v24.4s, v5.4h, v0.h[1]
647        SMLAL2  v28.4s, v5.8h, v0.h[1]
648        SMLAL   v17.4s, v4.4h, v1.h[1]
649        SMLAL2  v21.4s, v4.8h, v1.h[1]
650        SMLAL   v25.4s, v5.4h, v1.h[1]
651        SMLAL2  v29.4s, v5.8h, v1.h[1]
652        SMLAL   v18.4s, v4.4h, v2.h[1]
653        SMLAL2  v22.4s, v4.8h, v2.h[1]
654        SMLAL   v26.4s, v5.4h, v2.h[1]
655        SMLAL2  v30.4s, v5.8h, v2.h[1]
656        SMLAL   v19.4s, v4.4h, v3.h[1]
657        SMLAL2  v23.4s, v4.8h, v3.h[1]
658        SMLAL   v27.4s, v5.4h, v3.h[1]
659        SMLAL2  v31.4s, v5.8h, v3.h[1]
660        B.EQ    4b                      // remainder == 2 (flags from CMP x0, 2 above)
661
662        LDP     d4, d5, [x5], 16
663        SXTL    v4.8h, v4.8b
664        SXTL    v5.8h, v5.8b
665        SMLAL   v16.4s, v4.4h, v0.h[2]
666        SMLAL2  v20.4s, v4.8h, v0.h[2]
667        SMLAL   v24.4s, v5.4h, v0.h[2]
668        SMLAL2  v28.4s, v5.8h, v0.h[2]
669        SMLAL   v17.4s, v4.4h, v1.h[2]
670        SMLAL2  v21.4s, v4.8h, v1.h[2]
671        SMLAL   v25.4s, v5.4h, v1.h[2]
672        SMLAL2  v29.4s, v5.8h, v1.h[2]
673        SMLAL   v18.4s, v4.4h, v2.h[2]
674        SMLAL2  v22.4s, v4.8h, v2.h[2]
675        SMLAL   v26.4s, v5.4h, v2.h[2]
676        SMLAL2  v30.4s, v5.8h, v2.h[2]
677        SMLAL   v19.4s, v4.4h, v3.h[2]
678        SMLAL2  v23.4s, v4.8h, v3.h[2]
679        SMLAL   v27.4s, v5.4h, v3.h[2]
680        SMLAL2  v31.4s, v5.8h, v3.h[2]
681        CMP     x0, 4
682        B.LO    4b                      // remainder == 3: done
683
684        LDP     d4, d5, [x5], 16
685        SXTL    v4.8h, v4.8b
686        SXTL    v5.8h, v5.8b
687        SMLAL   v16.4s, v4.4h, v0.h[3]
688        SMLAL2  v20.4s, v4.8h, v0.h[3]
689        SMLAL   v24.4s, v5.4h, v0.h[3]
690        SMLAL2  v28.4s, v5.8h, v0.h[3]
691        SMLAL   v17.4s, v4.4h, v1.h[3]
692        SMLAL2  v21.4s, v4.8h, v1.h[3]
693        SMLAL   v25.4s, v5.4h, v1.h[3]
694        SMLAL2  v29.4s, v5.8h, v1.h[3]
695        SMLAL   v18.4s, v4.4h, v2.h[3]
696        SMLAL2  v22.4s, v4.8h, v2.h[3]
697        SMLAL   v26.4s, v5.4h, v2.h[3]
698        SMLAL2  v30.4s, v5.8h, v2.h[3]
699        SMLAL   v19.4s, v4.4h, v3.h[3]
700        SMLAL2  v23.4s, v4.8h, v3.h[3]
701        SMLAL   v27.4s, v5.4h, v3.h[3]
702        SMLAL2  v31.4s, v5.8h, v3.h[3]
703        B.EQ    4b                      // remainder == 4 (flags from CMP x0, 4 above)
704
705        LDP     d4, d5, [x5], 16
706        SXTL    v4.8h, v4.8b
707        SXTL    v5.8h, v5.8b
708        SMLAL   v16.4s, v4.4h, v0.h[4]
709        SMLAL2  v20.4s, v4.8h, v0.h[4]
710        SMLAL   v24.4s, v5.4h, v0.h[4]
711        SMLAL2  v28.4s, v5.8h, v0.h[4]
712        SMLAL   v17.4s, v4.4h, v1.h[4]
713        SMLAL2  v21.4s, v4.8h, v1.h[4]
714        SMLAL   v25.4s, v5.4h, v1.h[4]
715        SMLAL2  v29.4s, v5.8h, v1.h[4]
716        SMLAL   v18.4s, v4.4h, v2.h[4]
717        SMLAL2  v22.4s, v4.8h, v2.h[4]
718        SMLAL   v26.4s, v5.4h, v2.h[4]
719        SMLAL2  v30.4s, v5.8h, v2.h[4]
720        SMLAL   v19.4s, v4.4h, v3.h[4]
721        SMLAL2  v23.4s, v4.8h, v3.h[4]
722        SMLAL   v27.4s, v5.4h, v3.h[4]
723        SMLAL2  v31.4s, v5.8h, v3.h[4]
724        CMP     x0, 6
725        B.LO    4b                      // remainder == 5: done
726
727        LDP     d4, d5, [x5], 16
728        SXTL    v4.8h, v4.8b
729        SXTL    v5.8h, v5.8b
730        SMLAL   v16.4s, v4.4h, v0.h[5]
731        SMLAL2  v20.4s, v4.8h, v0.h[5]
732        SMLAL   v24.4s, v5.4h, v0.h[5]
733        SMLAL2  v28.4s, v5.8h, v0.h[5]
734        SMLAL   v17.4s, v4.4h, v1.h[5]
735        SMLAL2  v21.4s, v4.8h, v1.h[5]
736        SMLAL   v25.4s, v5.4h, v1.h[5]
737        SMLAL2  v29.4s, v5.8h, v1.h[5]
738        SMLAL   v18.4s, v4.4h, v2.h[5]
739        SMLAL2  v22.4s, v4.8h, v2.h[5]
740        SMLAL   v26.4s, v5.4h, v2.h[5]
741        SMLAL2  v30.4s, v5.8h, v2.h[5]
742        SMLAL   v19.4s, v4.4h, v3.h[5]
743        SMLAL2  v23.4s, v4.8h, v3.h[5]
744        SMLAL   v27.4s, v5.4h, v3.h[5]
745        SMLAL2  v31.4s, v5.8h, v3.h[5]
746        B.EQ    4b                      // remainder == 6 (flags from CMP x0, 6 above)
747
748        LDP     d4, d5, [x5], 16
749        SXTL    v4.8h, v4.8b
750        SXTL    v5.8h, v5.8b
751        SMLAL   v16.4s, v4.4h, v0.h[6]
752        SMLAL2  v20.4s, v4.8h, v0.h[6]
753        SMLAL   v24.4s, v5.4h, v0.h[6]
754        SMLAL2  v28.4s, v5.8h, v0.h[6]
755        SMLAL   v17.4s, v4.4h, v1.h[6]
756        SMLAL2  v21.4s, v4.8h, v1.h[6]
757        SMLAL   v25.4s, v5.4h, v1.h[6]
758        SMLAL2  v29.4s, v5.8h, v1.h[6]
759        SMLAL   v18.4s, v4.4h, v2.h[6]
760        SMLAL2  v22.4s, v4.8h, v2.h[6]
761        SMLAL   v26.4s, v5.4h, v2.h[6]
762        SMLAL2  v30.4s, v5.8h, v2.h[6]
763        SMLAL   v19.4s, v4.4h, v3.h[6]
764        SMLAL2  v23.4s, v4.8h, v3.h[6]
765        SMLAL   v27.4s, v5.4h, v3.h[6]
766        SMLAL2  v31.4s, v5.8h, v3.h[6]
767        B       4b                      // remainder == 7
768
769        # Store odd width
        // Reached when fewer than 16 columns remain. x1 holds nc - 16
        // here; subtracting 16 leaves the low 4 bits unchanged, so bits
        // 3..0 still encode the remaining column count. Stores of 8, 4,
        // 2 and 1 bytes are issued per set bit, and after each store the
        // not-yet-written upper part is moved down into the low lanes.
770        .p2align 3
7716:
772        TBZ     x1, 3, 7f
773        STR     d3, [x7], 8
774        STR     d2, [x17], 8
775        DUP     d3, v3.d[1]
776        DUP     d2, v2.d[1]
777        STR     d1, [x16], 8
778        STR     d0, [x6], 8
779        DUP     d1, v1.d[1]
780        DUP     d0, v0.d[1]
7817:
782        TBZ     x1, 2, 8f
783        STR     s3, [x7], 4
784        STR     s2, [x17], 4
785        DUP     s3, v3.s[1]
786        DUP     s2, v2.s[1]
787        STR     s1, [x16], 4
788        STR     s0, [x6], 4
789        DUP     s1, v1.s[1]
790        DUP     s0, v0.s[1]
7918:
792        TBZ     x1, 1, 9f
793        STR     h3, [x7], 2
794        STR     h2, [x17], 2
795        DUP     h3, v3.h[1]
796        DUP     h2, v2.h[1]
797        STR     h1, [x16], 2
798        STR     h0, [x6], 2
799        DUP     h1, v1.h[1]
800        DUP     h0, v0.h[1]
8019:
802        TBZ     x1, 0, 10f
803        STR     b3, [x7]
804        STR     b2, [x17]
805        STR     b1, [x16]
806        STR     b0, [x6]
80710:
808        # Restore x20-x21 from stack
809        LDP     x20, x21, [sp], 16
810        RET
811
812END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
813
814#ifdef __ELF__
815.section ".note.GNU-stack","",%progbits
816#endif
817