// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

// Overview (as implemented below):
//   * Each pass of the outer loop (label 0) produces a 4 row x 16 column tile
//     of int8 output and consumes 16 from nc.
//   * Rows at or beyond mr are aliased onto the previous row by clamping the A
//     and C pointers, so the kernel always computes and stores 4 rows.
//   * w is read as 16 x 32-bit accumulator seeds (64 bytes) followed by
//     16 int8 weights (16 bytes) for every one of the kc steps.
//   * int8 inputs are sign-extended to 16 bits (SXTL) and multiply-accumulated
//     into 32-bit lanes with SMLAL / SMLAL2 (by element).
//   * Requantization reads params as three 32-bit values, one 16-bit value and
//     two 8-bit values (15 bytes of pointer advance, rewound afterwards).
//   * x0 holds mr only during pointer clamping; afterwards it is the k counter.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// None of those registers are touched here, so no stack frame is created.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15

# x10 x17 a53 temp registers

BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // Load cn_stride, params
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        // LS reuses the flags of CMP x0, 2 above (ADD and CSEL leave them intact)
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2

        // Outer loop: one 4x16 output tile per iteration
        .p2align 3
0:
        # Load initial bias from w into accumulators
        // Row 0 accumulators are loaded from w and copied to rows 1-3
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 8 bytes for epilogue?
        // kc < 8: skip straight to the 1-7 byte remainder path
        B.LO    4f

        # Prologue
        // Preload 8 bytes of each A row and the first 24 bytes of B.
        // x5 is not advanced here; the B offsets in the loop are relative to it.
        LDR     d0, [x3], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        SXTL    v0.8h, v0.8b
        LDR     x17, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        // Fewer than 16 bytes were left: the preloaded block is the last full one
        B.LO    2f

        # Main loop - 8 bytes of A
        // Lane h[i] of v0-v3 is k step i of rows 0-3. For every k step one
        // widened B register feeds columns 0-7 (v16-v23) and another feeds
        // columns 8-15 (v24-v31). v4, v5 and v6 rotate as B registers: while
        // two are being multiplied the third is refilled. A refill is either
        // LDR d (direct vector load) or LDR x17 followed by INS (load into a
        // general register, then move into the vector), and then SXTL.
        // NOTE(review): the LDR x + INS split and the instruction interleaving
        // look like Cortex-A53 specific scheduling - confirm against the
        // template before reordering anything in this loop.
        .p2align 3
1:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        LDR     x17, [x5, 112]
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        SXTL    v4.8h, v4.8b
        ADD     x5, x5, 128             // w += 8 k steps * 16 bytes

        // From here on the loads fetch data for the next 8-byte block:
        // B at the new x5, A0 via x10, A1 direct, A2 via x17, A3 direct.
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x17, [x5]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SXTL    v5.8h, v5.8b
        LDR     x10, [x3], 8

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x17, [x13], 8
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d1, [x15], 8
        INS     v0.d[0], x10
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x4], 8
        INS     v2.d[0], x17

        SXTL    v0.8h, v0.8b
        SXTL    v1.8h, v1.8b
        LDR     x17, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b
        // Loop while another full 8-byte block follows the one just preloaded
        B.HS    1b

        # Epilogue.  Same as main loop but no preloads in final group

        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x17, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128             // w += 8 k steps * 16 bytes
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 4f

3:
        # Apply params - preshift, scale, postshift, bias and clamp
        // x11 walks through params with post-increment loads (4+4+4+2+1 bytes)
        // and is rewound by 15 below so the next tile starts from the same base.
        LD1R    {v4.4s}, [x11], 4
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Saturating narrow 32 -> 16 bits: v16-v19 take columns 0-7 and
        // v24-v27 take columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow 16 -> 8 bits: v0-v3 hold the 16 outputs of rows 0-3
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 15             // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags are consumed by B.LO and B.NE below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f

        # Store full 4 x 16
        ST1     {v0.16b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        // Still the flags of SUBS x1 above: loop while columns remain
        B.NE    0b
        RET

        # Remainder- 1 to 7 bytes of A
        // Reached either directly (kc < 8) or after the epilogue. Each LD1 reads
        // a full 8 bytes but post-increments by x0 only, so the A pointers end
        // up advanced by exactly kc. One k step (16 bytes of w) is processed at
        // a time, returning to label 3 as soon as x0 steps are done.
        // NOTE(review): the 8-byte loads read up to 7 bytes past the kc bytes of
        // each A row; presumably callers guarantee readable padding - confirm.
        .p2align 3
4:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b},  [x3], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x15], x0
        LD1     {v2.8b}, [x13], x0
        LD1     {v3.8b},  [x4], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    3b                      // remainder was 1

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    3b                      // flags of CMP x0, 2: remainder was 2

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    3b                      // remainder was 3

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    3b                      // flags of CMP x0, 4: remainder was 4

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    3b                      // remainder was 5

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    3b                      // flags of CMP x0, 6: remainder was 6

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       3b                      // remainder was 7

        # Store odd width
        // x1 is now nc - 16 (it wrapped below zero); its low 4 bits still equal
        // the number of columns left, so bits 3..0 select 8/4/2/1 byte stores.
        // After each partial store the unwritten upper part is moved down.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif