1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x12
23#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3 v0
29# A1 x15 v1
30# A2 x13 v2
31# A3  x4 v3
32# B   x5 v4  v5  v6
33# C0  x6 v16 v20 v24 v28
34# C1  x8 v17 v21 v25 v29
35# C2  x9 v18 v22 v26 v30
36# C3  x7 v19 v23 v27 v31
37# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
38
39# x10 x17 a53 temp registers
40
// 4x16 signed 8-bit GEMM micro-kernel with per-channel fp32 requantization.
//
// For each group of 16 output columns the kernel:
//   1. loads 16 int32 values from w and copies them into the accumulators of
//      all four rows,
//   2. accumulates A[row][k] * B[k][col] into int32 with SMLAL/SMLAL2 on
//      sign-extended 16-bit operands, consuming 16 bytes of w per k,
//   3. converts the sums to float, multiplies by 16 per-column float scales
//      read from w right after the weights, and rounds back to int32,
//   4. narrows with saturation to int16, adds a 16-bit value from params,
//      narrows to int8, clamps to the [min, max] bytes from params and stores.
//
// Accumulator layout (row r = 0..3):
//   v(16+r) columns 0-3, v(20+r) columns 4-7,
//   v(24+r) columns 8-11, v(28+r) columns 12-15.
//
// x0 holds mr only during pointer setup; afterwards it is reused as the k
// counter.  No callee-saved register and no stack memory is written.
41BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
42
43        # Clamp A and C pointers
        // Rows with index >= mr reuse the A and C pointers of the previous
        // row, so they recompute and re-store that row's data.
44        CMP     x0, 2                   // if mr < 2
45        LDP     x12, x11, [sp]          // Load cn_stride, params
46        ADD     x15, x3, x4             // a1 = a0 + a_stride
47        ADD     x8, x6, x7              // c1 = c0 + cm_stride
48        CSEL    x15, x3, x15, LO        //   a1 = a0
49        CSEL    x8, x6,  x8, LO         //   c1 = c0
50
        // The LS selects below still use the flags of "CMP x0, 2" above;
        // ADD and CSEL do not modify the flags.
51        ADD     x13, x15, x4            // a2 = a1 + a_stride
52        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
53                                        // if mr <= 2
54        CSEL    x13, x15, x13, LS       //   a2 = a1
55        CSEL    x9,  x8,  x9, LS        //   c2 = c1
56
        // From here on x4 holds a3 and x7 holds c3; a_stride and cm_stride
        // are no longer available.
57        CMP     x0, 4                   // if mr < 4
58        ADD     x4, x13, x4             // a3 = a2 + a_stride
59        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
60        CSEL    x4, x13, x4, LO         //   a3 = a2
61        CSEL    x7,  x9, x7, LO         //   c3 = c2
62
63        .p2align 3
        // Outer loop: one iteration per group of 16 output columns.
640:
65        # Load initial bias from w into accumulators
        // 16 int32 values (64 bytes) are read from w; row 0 receives them
        // directly and rows 1-3 receive copies.
66        LDP     q16, q20, [x5], 32
67        MOV     v17.16b, v16.16b
68        MOV     v18.16b, v16.16b
69        LDP     q24, q28, [x5], 32
70        MOV     v19.16b, v16.16b
71        MOV     v21.16b, v20.16b
72        MOV     v22.16b, v20.16b
73        MOV     v23.16b, v20.16b
74        SUBS    x0, x2, 8               // k = kc - 8
75        MOV     v25.16b, v24.16b
76        MOV     v26.16b, v24.16b
77        MOV     v27.16b, v24.16b
78        MOV     v29.16b, v28.16b
79        MOV     v30.16b, v28.16b
80        MOV     v31.16b, v28.16b
81        # Is there at least 8 bytes for epilogue?
        // kc < 8: skip the 8-byte blocks and handle everything in the
        // remainder code at label 4.
82        B.LO    4f
83
84        # Prologue
        // Load the first 8 bytes of every A row and the first 16 bytes of B
        // (d4 = columns 0-7, d6 = columns 8-15 for k = 0), sign-extend them
        // to 16 bits, and fetch the next 8 bytes of B into x17.
85        LDR     d0, [x3], 8
86        LDP     d4, d6, [x5]
87        LDR     d1, [x15], 8
88        LDR     d2, [x13], 8
89        LDR     d3, [x4], 8
90        SXTL    v0.8h, v0.8b
91        LDR     x17, [x5, 16]
92        SXTL    v4.8h, v4.8b
93        SXTL    v1.8h, v1.8b
94        SXTL    v2.8h, v2.8b
95        SXTL    v3.8h, v3.8b
96        SXTL    v6.8h, v6.8b
97
98        SUBS    x0, x0, 8               // k = k - 8
99        # Is there at least 8 bytes for main loop?
        // Fewer than 16 bytes of k in total: only the epilogue runs.
100        B.LO    2f
101
102        # Main loop - 8 bytes of A
        // Each iteration consumes 8 k-values: lanes h[0]..h[7] of v0-v3 and
        // 128 bytes of B at [x5, 0..127].  B is rotated through v4/v5/v6 in
        // 8-byte halves; each half is loaded either directly with LDR d or
        // through x17 followed by INS, and is sign-extended with SXTL before
        // its first use.
        // NOTE(review): the GPR + INS load form is presumably chosen for
        // Cortex-A53 issue behaviour - confirm against the template.
103        .p2align 3
1041:
105        SMLAL   v16.4s, v4.4h, v0.h[0]
106        SMLAL2  v20.4s, v4.8h, v0.h[0]
107        SMLAL   v17.4s, v4.4h, v1.h[0]
108        SMLAL2  v21.4s, v4.8h, v1.h[0]
109        SMLAL   v18.4s, v4.4h, v2.h[0]
110        SMLAL2  v22.4s, v4.8h, v2.h[0]
111        SMLAL   v19.4s, v4.4h, v3.h[0]
112        SMLAL2  v23.4s, v4.8h, v3.h[0]
113        LDR     d4, [x5, 24]
114        INS     v5.d[0], x17
115        SMLAL   v24.4s, v6.4h, v0.h[0]
116        SMLAL2  v28.4s, v6.8h, v0.h[0]
117        SMLAL   v25.4s, v6.4h, v1.h[0]
118        SMLAL2  v29.4s, v6.8h, v1.h[0]
119        SXTL    v5.8h, v5.8b
120        SMLAL   v26.4s, v6.4h, v2.h[0]
121        SMLAL2  v30.4s, v6.8h, v2.h[0]
122        SMLAL   v27.4s, v6.4h, v3.h[0]
123        SMLAL2  v31.4s, v6.8h, v3.h[0]
124        LDR     x17, [x5, 32]
125        SMLAL   v16.4s, v5.4h, v0.h[1]
126        SMLAL2  v20.4s, v5.8h, v0.h[1]
127        SMLAL   v17.4s, v5.4h, v1.h[1]
128        SMLAL2  v21.4s, v5.8h, v1.h[1]
129        SXTL    v4.8h, v4.8b
130        SMLAL   v18.4s, v5.4h, v2.h[1]
131        SMLAL2  v22.4s, v5.8h, v2.h[1]
132        SMLAL   v19.4s, v5.4h, v3.h[1]
133        SMLAL2  v23.4s, v5.8h, v3.h[1]
134        LDR     d5, [x5, 40]
135        INS     v6.d[0], x17
136        SMLAL   v24.4s, v4.4h, v0.h[1]
137        SMLAL2  v28.4s, v4.8h, v0.h[1]
138        SMLAL   v25.4s, v4.4h, v1.h[1]
139        SMLAL2  v29.4s, v4.8h, v1.h[1]
140        SXTL    v6.8h, v6.8b
141        SMLAL   v26.4s, v4.4h, v2.h[1]
142        SMLAL2  v30.4s, v4.8h, v2.h[1]
143        SMLAL   v27.4s, v4.4h, v3.h[1]
144        SMLAL2  v31.4s, v4.8h, v3.h[1]
145        LDR     x17, [x5, 48]
146        SMLAL   v16.4s, v6.4h, v0.h[2]
147        SMLAL2  v20.4s, v6.8h, v0.h[2]
148        SMLAL   v17.4s, v6.4h, v1.h[2]
149        SXTL    v5.8h, v5.8b
150        SMLAL2  v21.4s, v6.8h, v1.h[2]
151        SMLAL   v18.4s, v6.4h, v2.h[2]
152        SMLAL2  v22.4s, v6.8h, v2.h[2]
153        SMLAL   v19.4s, v6.4h, v3.h[2]
154        SMLAL2  v23.4s, v6.8h, v3.h[2]
155        LDR     d6, [x5, 56]
156        INS     v4.d[0], x17
157        SMLAL   v24.4s, v5.4h, v0.h[2]
158        SMLAL2  v28.4s, v5.8h, v0.h[2]
159        SMLAL   v25.4s, v5.4h, v1.h[2]
160        SMLAL2  v29.4s, v5.8h, v1.h[2]
161        SXTL    v4.8h, v4.8b
162        SMLAL   v26.4s, v5.4h, v2.h[2]
163        SMLAL2  v30.4s, v5.8h, v2.h[2]
164        SMLAL   v27.4s, v5.4h, v3.h[2]
165        SMLAL2  v31.4s, v5.8h, v3.h[2]
166        LDR     x17, [x5, 64]
167        SMLAL   v16.4s, v4.4h, v0.h[3]
168        SMLAL2  v20.4s, v4.8h, v0.h[3]
169        SMLAL   v17.4s, v4.4h, v1.h[3]
170        SMLAL2  v21.4s, v4.8h, v1.h[3]
171        SXTL    v6.8h, v6.8b
172        SMLAL   v18.4s, v4.4h, v2.h[3]
173        SMLAL2  v22.4s, v4.8h, v2.h[3]
174        SMLAL   v19.4s, v4.4h, v3.h[3]
175        SMLAL2  v23.4s, v4.8h, v3.h[3]
176        LDR     d4, [x5, 72]
177        INS     v5.d[0], x17
178        SMLAL   v24.4s, v6.4h, v0.h[3]
179        SMLAL2  v28.4s, v6.8h, v0.h[3]
180        SXTL    v5.8h, v5.8b
181        SMLAL   v25.4s, v6.4h, v1.h[3]
182        SMLAL2  v29.4s, v6.8h, v1.h[3]
183        SMLAL   v26.4s, v6.4h, v2.h[3]
184        SMLAL2  v30.4s, v6.8h, v2.h[3]
185        SMLAL   v27.4s, v6.4h, v3.h[3]
186        SMLAL2  v31.4s, v6.8h, v3.h[3]
187        LDR     x17, [x5, 80]
188        SMLAL   v16.4s, v5.4h, v0.h[4]
189        SMLAL2  v20.4s, v5.8h, v0.h[4]
190        SMLAL   v17.4s, v5.4h, v1.h[4]
191        SMLAL2  v21.4s, v5.8h, v1.h[4]
192        SXTL    v4.8h, v4.8b
193        SMLAL   v18.4s, v5.4h, v2.h[4]
194        SMLAL2  v22.4s, v5.8h, v2.h[4]
195        SMLAL   v19.4s, v5.4h, v3.h[4]
196        SMLAL2  v23.4s, v5.8h, v3.h[4]
197        LDR     d5, [x5, 88]
198        INS     v6.d[0], x17
199        SMLAL   v24.4s, v4.4h, v0.h[4]
200        SMLAL2  v28.4s, v4.8h, v0.h[4]
201        SMLAL   v25.4s, v4.4h, v1.h[4]
202        SMLAL2  v29.4s, v4.8h, v1.h[4]
203        SXTL    v6.8h, v6.8b
204        SMLAL   v26.4s, v4.4h, v2.h[4]
205        SMLAL2  v30.4s, v4.8h, v2.h[4]
206        SMLAL   v27.4s, v4.4h, v3.h[4]
207        SMLAL2  v31.4s, v4.8h, v3.h[4]
208        LDR     x17, [x5, 96]
209        SMLAL   v16.4s, v6.4h, v0.h[5]
210        SMLAL2  v20.4s, v6.8h, v0.h[5]
211        SMLAL   v17.4s, v6.4h, v1.h[5]
212        SMLAL2  v21.4s, v6.8h, v1.h[5]
213        SXTL    v5.8h, v5.8b
214        SMLAL   v18.4s, v6.4h, v2.h[5]
215        SMLAL2  v22.4s, v6.8h, v2.h[5]
216        SMLAL   v19.4s, v6.4h, v3.h[5]
217        SMLAL2  v23.4s, v6.8h, v3.h[5]
218        LDR     d6, [x5, 104]
219        INS     v4.d[0], x17
220        SMLAL   v24.4s, v5.4h, v0.h[5]
221        SMLAL2  v28.4s, v5.8h, v0.h[5]
222        SMLAL   v25.4s, v5.4h, v1.h[5]
223        SMLAL2  v29.4s, v5.8h, v1.h[5]
224        SXTL    v4.8h, v4.8b
225        SMLAL   v26.4s, v5.4h, v2.h[5]
226        SMLAL2  v30.4s, v5.8h, v2.h[5]
227        SMLAL   v27.4s, v5.4h, v3.h[5]
228        SMLAL2  v31.4s, v5.8h, v3.h[5]
229        SXTL    v6.8h, v6.8b
230        LDR     x17, [x5, 112]
231        SMLAL   v16.4s, v4.4h, v0.h[6]
232        SMLAL2  v20.4s, v4.8h, v0.h[6]
233        SMLAL   v17.4s, v4.4h, v1.h[6]
234        SMLAL2  v21.4s, v4.8h, v1.h[6]
235        SMLAL   v18.4s, v4.4h, v2.h[6]
236        SMLAL2  v22.4s, v4.8h, v2.h[6]
237        SMLAL   v19.4s, v4.4h, v3.h[6]
238        SMLAL2  v23.4s, v4.8h, v3.h[6]
239        LDR     d5, [x5, 120]
240        INS     v4.d[0], x17
241        SMLAL   v24.4s, v6.4h, v0.h[6]
242        SMLAL2  v28.4s, v6.8h, v0.h[6]
243        SMLAL   v25.4s, v6.4h, v1.h[6]
244        SMLAL2  v29.4s, v6.8h, v1.h[6]
245        SXTL    v4.8h, v4.8b
        // All 128 bytes of B for this iteration have been fetched: advance w.
        // The loads from [x5], [x5, 8] and [x5, 16] below therefore read the
        // first B values of the NEXT 8-byte block.
246        ADD     x5, x5, 128
247
248        SMLAL   v26.4s, v6.4h, v2.h[6]
249        SMLAL2  v30.4s, v6.8h, v2.h[6]
250        LDR     x17, [x5]
251        SMLAL   v27.4s, v6.4h, v3.h[6]
252        SMLAL2  v31.4s, v6.8h, v3.h[6]
253        SXTL    v5.8h, v5.8b
        // Preload the next 8 bytes of A.  Rows 0 and 2 travel through x10 and
        // x17 and are inserted only after the last h[7] use of v0 / v2;
        // rows 1 and 3 are loaded directly once v1 / v3 are no longer read.
254        LDR     x10, [x3], 8
255
256        SMLAL   v16.4s, v4.4h, v0.h[7]
257        SMLAL2  v20.4s, v4.8h, v0.h[7]
258        SMLAL   v17.4s, v4.4h, v1.h[7]
259        SMLAL2  v21.4s, v4.8h, v1.h[7]
260        SMLAL   v18.4s, v4.4h, v2.h[7]
261        SMLAL2  v22.4s, v4.8h, v2.h[7]
262        SMLAL   v19.4s, v4.4h, v3.h[7]
263        SMLAL2  v23.4s, v4.8h, v3.h[7]
264        LDR     d6, [x5, 8]
265        INS     v4.d[0], x17
266        SMLAL   v24.4s, v5.4h, v0.h[7]
267        SMLAL2  v28.4s, v5.8h, v0.h[7]
268        LDR     x17, [x13], 8
269        SMLAL   v25.4s, v5.4h, v1.h[7]
270        SMLAL2  v29.4s, v5.8h, v1.h[7]
271        LDR     d1, [x15], 8
272        INS     v0.d[0], x10
273        SMLAL   v26.4s, v5.4h, v2.h[7]
274        SMLAL2  v30.4s, v5.8h, v2.h[7]
275        SMLAL   v27.4s, v5.4h, v3.h[7]
276        SMLAL2  v31.4s, v5.8h, v3.h[7]
277        LDR     d3, [x4], 8
278        INS     v2.d[0], x17
279
        // Re-establish the state the top of the loop (and the epilogue)
        // expects: v0-v3, v4, v6 sign-extended and x17 = B bytes 16..23.
280        SXTL    v0.8h, v0.8b
281        SXTL    v1.8h, v1.8b
282        LDR     x17, [x5, 16]
283        SXTL    v4.8h, v4.8b
284        SXTL    v2.8h, v2.8b
285        SUBS    x0, x0, 8
286        SXTL    v3.8h, v3.8b
287        SXTL    v6.8h, v6.8b
        // Loop while the SUBS above did not borrow; the SXTL instructions in
        // between do not modify the flags.
288        B.HS    1b
289
290        # Epilogue.  Same as main loop but no preloads in final group
291
        // Processes the last full block of 8 k-values using the operands that
        // the prologue or the previous main-loop iteration already loaded.
292        .p2align 3
2932:
294        SMLAL   v16.4s, v4.4h, v0.h[0]
295        SMLAL2  v20.4s, v4.8h, v0.h[0]
296        SMLAL   v17.4s, v4.4h, v1.h[0]
297        SMLAL2  v21.4s, v4.8h, v1.h[0]
298        SMLAL   v18.4s, v4.4h, v2.h[0]
299        SMLAL2  v22.4s, v4.8h, v2.h[0]
300        SMLAL   v19.4s, v4.4h, v3.h[0]
301        SMLAL2  v23.4s, v4.8h, v3.h[0]
302        LDR     d4, [x5, 24]
303        INS     v5.d[0], x17
304        SMLAL   v24.4s, v6.4h, v0.h[0]
305        SMLAL2  v28.4s, v6.8h, v0.h[0]
306        SMLAL   v25.4s, v6.4h, v1.h[0]
307        SMLAL2  v29.4s, v6.8h, v1.h[0]
308        SXTL    v5.8h, v5.8b
309        SMLAL   v26.4s, v6.4h, v2.h[0]
310        SMLAL2  v30.4s, v6.8h, v2.h[0]
311        SMLAL   v27.4s, v6.4h, v3.h[0]
312        SMLAL2  v31.4s, v6.8h, v3.h[0]
313        LDR     x17, [x5, 32]
314        SMLAL   v16.4s, v5.4h, v0.h[1]
315        SMLAL2  v20.4s, v5.8h, v0.h[1]
316        SMLAL   v17.4s, v5.4h, v1.h[1]
317        SMLAL2  v21.4s, v5.8h, v1.h[1]
318        SXTL    v4.8h, v4.8b
319        SMLAL   v18.4s, v5.4h, v2.h[1]
320        SMLAL2  v22.4s, v5.8h, v2.h[1]
321        SMLAL   v19.4s, v5.4h, v3.h[1]
322        SMLAL2  v23.4s, v5.8h, v3.h[1]
323        LDR     d5, [x5, 40]
324        INS     v6.d[0], x17
325        SMLAL   v24.4s, v4.4h, v0.h[1]
326        SMLAL2  v28.4s, v4.8h, v0.h[1]
327        SMLAL   v25.4s, v4.4h, v1.h[1]
328        SMLAL2  v29.4s, v4.8h, v1.h[1]
329        SXTL    v6.8h, v6.8b
330        SMLAL   v26.4s, v4.4h, v2.h[1]
331        SMLAL2  v30.4s, v4.8h, v2.h[1]
332        SMLAL   v27.4s, v4.4h, v3.h[1]
333        SMLAL2  v31.4s, v4.8h, v3.h[1]
334        LDR     x17, [x5, 48]
335        SMLAL   v16.4s, v6.4h, v0.h[2]
336        SMLAL2  v20.4s, v6.8h, v0.h[2]
337        SMLAL   v17.4s, v6.4h, v1.h[2]
338        SXTL    v5.8h, v5.8b
339        SMLAL2  v21.4s, v6.8h, v1.h[2]
340        SMLAL   v18.4s, v6.4h, v2.h[2]
341        SMLAL2  v22.4s, v6.8h, v2.h[2]
342        SMLAL   v19.4s, v6.4h, v3.h[2]
343        SMLAL2  v23.4s, v6.8h, v3.h[2]
344        LDR     d6, [x5, 56]
345        INS     v4.d[0], x17
346        SMLAL   v24.4s, v5.4h, v0.h[2]
347        SMLAL2  v28.4s, v5.8h, v0.h[2]
348        SMLAL   v25.4s, v5.4h, v1.h[2]
349        SMLAL2  v29.4s, v5.8h, v1.h[2]
350        SXTL    v4.8h, v4.8b
351        SMLAL   v26.4s, v5.4h, v2.h[2]
352        SMLAL2  v30.4s, v5.8h, v2.h[2]
353        SMLAL   v27.4s, v5.4h, v3.h[2]
354        SMLAL2  v31.4s, v5.8h, v3.h[2]
355        LDR     x17, [x5, 64]
356        SMLAL   v16.4s, v4.4h, v0.h[3]
357        SMLAL2  v20.4s, v4.8h, v0.h[3]
358        SMLAL   v17.4s, v4.4h, v1.h[3]
359        SMLAL2  v21.4s, v4.8h, v1.h[3]
360        SXTL    v6.8h, v6.8b
361        SMLAL   v18.4s, v4.4h, v2.h[3]
362        SMLAL2  v22.4s, v4.8h, v2.h[3]
363        SMLAL   v19.4s, v4.4h, v3.h[3]
364        SMLAL2  v23.4s, v4.8h, v3.h[3]
365        LDR     d4, [x5, 72]
366        INS     v5.d[0], x17
367        SMLAL   v24.4s, v6.4h, v0.h[3]
368        SMLAL2  v28.4s, v6.8h, v0.h[3]
369        SXTL    v5.8h, v5.8b
370        SMLAL   v25.4s, v6.4h, v1.h[3]
371        SMLAL2  v29.4s, v6.8h, v1.h[3]
372        SMLAL   v26.4s, v6.4h, v2.h[3]
373        SMLAL2  v30.4s, v6.8h, v2.h[3]
374        SMLAL   v27.4s, v6.4h, v3.h[3]
375        SMLAL2  v31.4s, v6.8h, v3.h[3]
376        LDR     x17, [x5, 80]
377        SMLAL   v16.4s, v5.4h, v0.h[4]
378        SMLAL2  v20.4s, v5.8h, v0.h[4]
379        SMLAL   v17.4s, v5.4h, v1.h[4]
380        SMLAL2  v21.4s, v5.8h, v1.h[4]
381        SXTL    v4.8h, v4.8b
382        SMLAL   v18.4s, v5.4h, v2.h[4]
383        SMLAL2  v22.4s, v5.8h, v2.h[4]
384        SMLAL   v19.4s, v5.4h, v3.h[4]
385        SMLAL2  v23.4s, v5.8h, v3.h[4]
386        LDR     d5, [x5, 88]
387        INS     v6.d[0], x17
388        SMLAL   v24.4s, v4.4h, v0.h[4]
389        SMLAL2  v28.4s, v4.8h, v0.h[4]
390        SMLAL   v25.4s, v4.4h, v1.h[4]
391        SMLAL2  v29.4s, v4.8h, v1.h[4]
392        SXTL    v6.8h, v6.8b
393        SMLAL   v26.4s, v4.4h, v2.h[4]
394        SMLAL2  v30.4s, v4.8h, v2.h[4]
395        SMLAL   v27.4s, v4.4h, v3.h[4]
396        SMLAL2  v31.4s, v4.8h, v3.h[4]
397        LDR     x17, [x5, 96]
398        SMLAL   v16.4s, v6.4h, v0.h[5]
399        SMLAL2  v20.4s, v6.8h, v0.h[5]
400        SMLAL   v17.4s, v6.4h, v1.h[5]
401        SMLAL2  v21.4s, v6.8h, v1.h[5]
402        SXTL    v5.8h, v5.8b
403        SMLAL   v18.4s, v6.4h, v2.h[5]
404        SMLAL2  v22.4s, v6.8h, v2.h[5]
405        SMLAL   v19.4s, v6.4h, v3.h[5]
406        SMLAL2  v23.4s, v6.8h, v3.h[5]
407        LDR     d6, [x5, 104]
408        INS     v4.d[0], x17
409        SMLAL   v24.4s, v5.4h, v0.h[5]
410        SMLAL2  v28.4s, v5.8h, v0.h[5]
411        SMLAL   v25.4s, v5.4h, v1.h[5]
412        SMLAL2  v29.4s, v5.8h, v1.h[5]
413        SXTL    v4.8h, v4.8b
414        SMLAL   v26.4s, v5.4h, v2.h[5]
415        SMLAL2  v30.4s, v5.8h, v2.h[5]
416        SMLAL   v27.4s, v5.4h, v3.h[5]
417        SMLAL2  v31.4s, v5.8h, v3.h[5]
418        SXTL    v6.8h, v6.8b
419        SMLAL   v16.4s, v4.4h, v0.h[6]
420        SMLAL2  v20.4s, v4.8h, v0.h[6]
421        SMLAL   v17.4s, v4.4h, v1.h[6]
422        SMLAL2  v21.4s, v4.8h, v1.h[6]
423        SMLAL   v18.4s, v4.4h, v2.h[6]
424        SMLAL2  v22.4s, v4.8h, v2.h[6]
425        SMLAL   v19.4s, v4.4h, v3.h[6]
426        SMLAL2  v23.4s, v4.8h, v3.h[6]
427        LDR     x17, [x5, 112]
428        SMLAL   v24.4s, v6.4h, v0.h[6]
429        SMLAL2  v28.4s, v6.8h, v0.h[6]
430        SMLAL   v25.4s, v6.4h, v1.h[6]
431        SMLAL2  v29.4s, v6.8h, v1.h[6]
432        LDR     d5, [x5, 120]
433        INS     v4.d[0], x17
434        SXTL    v4.8h, v4.8b
435        SMLAL   v26.4s, v6.4h, v2.h[6]
436        SMLAL2  v30.4s, v6.8h, v2.h[6]
437        SMLAL   v27.4s, v6.4h, v3.h[6]
438        SMLAL2  v31.4s, v6.8h, v3.h[6]
439        SMLAL   v16.4s, v4.4h, v0.h[7]
440        SMLAL2  v20.4s, v4.8h, v0.h[7]
441        SMLAL   v17.4s, v4.4h, v1.h[7]
442        SMLAL2  v21.4s, v4.8h, v1.h[7]
443        SXTL    v5.8h, v5.8b
444        SMLAL   v18.4s, v4.4h, v2.h[7]
445        SMLAL2  v22.4s, v4.8h, v2.h[7]
446        SMLAL   v19.4s, v4.4h, v3.h[7]
447        SMLAL2  v23.4s, v4.8h, v3.h[7]
        // Last B load of the block is done; step w past its 128 bytes.
448        ADD     x5, x5, 128
449        SMLAL   v24.4s, v5.4h, v0.h[7]
450        SMLAL2  v28.4s, v5.8h, v0.h[7]
451        SMLAL   v25.4s, v5.4h, v1.h[7]
452        SMLAL2  v29.4s, v5.8h, v1.h[7]
453        AND     x0, x2, 7               // kc remainder 0 to 7
454        SMLAL   v26.4s, v5.4h, v2.h[7]
455        SMLAL2  v30.4s, v5.8h, v2.h[7]
456        SMLAL   v27.4s, v5.4h, v3.h[7]
457        SMLAL2  v31.4s, v5.8h, v3.h[7]
458
459        # Is there a remainder?- 1 to 7 bytes of A
460        CBNZ    x0, 4f
461
        // Requantization and store.  Entered by fall-through when kc is a
        // multiple of 8, or by a branch back from the remainder code.
4623:
        // int32 -> float
463        SCVTF   v16.4s, v16.4s
464        SCVTF   v17.4s, v17.4s
465        # Load per channel scale values from weights
466        LDR     q4, [x5], 16
467        SCVTF   v18.4s, v18.4s
468        SCVTF   v19.4s, v19.4s
469        LDR     q5, [x5], 16
470        SCVTF   v20.4s, v20.4s
471        SCVTF   v21.4s, v21.4s
472        SCVTF   v22.4s, v22.4s
473        SCVTF   v23.4s, v23.4s
474        SCVTF   v24.4s, v24.4s
475        SCVTF   v25.4s, v25.4s
476        SCVTF   v26.4s, v26.4s
477        SCVTF   v27.4s, v27.4s
478        SCVTF   v28.4s, v28.4s
479        SCVTF   v29.4s, v29.4s
480        SCVTF   v30.4s, v30.4s
481        SCVTF   v31.4s, v31.4s
482
        // Multiply by the per-column scales: q4 -> columns 0-3, q5 -> 4-7,
        // q6 -> 8-11, and q4 (reloaded once its first use is over) -> 12-15.
        // Together with the two loads above, x5 advances by 64 bytes.
483        LDR     q6, [x5], 16
484        FMUL    v16.4s, v16.4s, v4.4s
485        FMUL    v17.4s, v17.4s, v4.4s
486        FMUL    v18.4s, v18.4s, v4.4s
487        FMUL    v19.4s, v19.4s, v4.4s
488        FMUL    v20.4s, v20.4s, v5.4s
489        LDR     q4, [x5], 16
490        FMUL    v21.4s, v21.4s, v5.4s
491        FMUL    v22.4s, v22.4s, v5.4s
492        FMUL    v23.4s, v23.4s, v5.4s
493        FMUL    v24.4s, v24.4s, v6.4s
494        FMUL    v25.4s, v25.4s, v6.4s
495        FMUL    v26.4s, v26.4s, v6.4s
496        FMUL    v27.4s, v27.4s, v6.4s
497        FMUL    v28.4s, v28.4s, v4.4s
498        FMUL    v29.4s, v29.4s, v4.4s
499        FMUL    v30.4s, v30.4s, v4.4s
500        FMUL    v31.4s, v31.4s, v4.4s
501
        // float -> int32, rounding to nearest with ties to even
502        FCVTNS  v16.4s, v16.4s
503        FCVTNS  v17.4s, v17.4s
504        FCVTNS  v18.4s, v18.4s
505        FCVTNS  v19.4s, v19.4s
506        FCVTNS  v20.4s, v20.4s
507        FCVTNS  v21.4s, v21.4s
508        FCVTNS  v22.4s, v22.4s
509        FCVTNS  v23.4s, v23.4s
510        FCVTNS  v24.4s, v24.4s
511        FCVTNS  v25.4s, v25.4s
512        FCVTNS  v26.4s, v26.4s
513        FCVTNS  v27.4s, v27.4s
514        FCVTNS  v28.4s, v28.4s
515        FCVTNS  v29.4s, v29.4s
516        FCVTNS  v30.4s, v30.4s
517        FCVTNS  v31.4s, v31.4s
518
        // Saturating narrow int32 -> int16.  After the SQXTN2 group below,
        // v(16+r) holds columns 0-7 and v(24+r) columns 8-15 of row r.
519        SQXTN   v16.4h, v16.4s
520        SQXTN   v17.4h, v17.4s
521        SQXTN   v18.4h, v18.4s
522        SQXTN   v19.4h, v19.4s
523        SQXTN   v24.4h, v24.4s
524        SQXTN   v25.4h, v25.4s
525        SQXTN   v26.4h, v26.4s
526        SQXTN   v27.4h, v27.4s
        // params is read as three consecutive fields: a 16-bit value to add,
        // then one byte for the lower clamp and one byte for the upper clamp.
527        LD1R    {v6.8h}, [x11], 2       // add bias
528
529        SQXTN2  v16.8h, v20.4s
530        SQXTN2  v17.8h, v21.4s
531        SQXTN2  v18.8h, v22.4s
532        SQXTN2  v19.8h, v23.4s
533        SQXTN2  v24.8h, v28.4s
534        SQXTN2  v25.8h, v29.4s
535        SQXTN2  v26.8h, v30.4s
536        SQXTN2  v27.8h, v31.4s
537
538        SQADD   v16.8h, v16.8h, v6.8h
539        SQADD   v17.8h, v17.8h, v6.8h
540        SQADD   v18.8h, v18.8h, v6.8h
541        SQADD   v19.8h, v19.8h, v6.8h
542        SQADD   v24.8h, v24.8h, v6.8h
543        SQADD   v25.8h, v25.8h, v6.8h
544        SQADD   v26.8h, v26.8h, v6.8h
545        SQADD   v27.8h, v27.8h, v6.8h
546        LD1R    {v4.16b}, [x11], 1      // clamp min value
547
        // Saturating narrow int16 -> int8: v0-v3 end up holding the 16
        // output bytes of rows 0-3.
548        SQXTN   v0.8b, v16.8h
549        SQXTN   v1.8b, v17.8h
550        SQXTN   v2.8b, v18.8h
551        SQXTN   v3.8b, v19.8h
552        LD1R    {v5.16b}, [x11]         // clamp max value
553        SQXTN2  v0.16b, v24.8h
554        SQXTN2  v1.16b, v25.8h
555        SQXTN2  v2.16b, v26.8h
556        SQXTN2  v3.16b, v27.8h
        // Undo the two post-increments (2 + 1) so the next column group reads
        // the same params again.
557        SUB     x11, x11, 3             // rewind params pointer
558
559        SMAX    v0.16b, v0.16b, v4.16b
560        SMAX    v1.16b, v1.16b, v4.16b
561        SMAX    v2.16b, v2.16b, v4.16b
562        SMAX    v3.16b, v3.16b, v4.16b
        // nc -= 16.  These flags drive both "B.LO 5f" and the "B.NE 0b" after
        // the stores; SMIN, ST1 and SUB leave them untouched.
563        SUBS    x1, x1, 16
564        SMIN    v0.16b, v0.16b, v5.16b
565        SMIN    v1.16b, v1.16b, v5.16b
566        SMIN    v2.16b, v2.16b, v5.16b
567        SMIN    v3.16b, v3.16b, v5.16b
        // Fewer than 16 columns were left: store a partial group instead.
568        B.LO    5f
569
570        # Store full 4 x 16
        // Each C pointer advances by cn_stride (x12) and each A pointer is
        // rewound by kc so the next column group re-reads the same rows.
571        ST1     {v0.16b}, [x6], x12
572        SUB     x3,  x3, x2             // a0 -= kc
573        ST1     {v1.16b}, [x8], x12
574        SUB     x15, x15, x2            // a1 -= kc
575        ST1     {v2.16b}, [x9], x12
576        SUB     x13, x13, x2            // a2 -= kc
577        ST1     {v3.16b}, [x7], x12
578        SUB     x4,  x4, x2             // a3 -= kc
        // Continue while columns remain (nc - 16 != 0).
579        B.NE    0b
580        RET
581
582        # Remainder- 1 to 7 bytes of A
        // Handles the final kc % 8 k-values one at a time (16 bytes of B per
        // step) and then branches back to label 3.
583        .p2align 3
5844:
585        AND     x0, x2, 7               // kc remainder 1 to 7
586
        // Each LD1 reads a full 8 bytes but post-increments the pointer by
        // only x0 bytes, so up to 7 bytes beyond the row are read.
        // NOTE(review): assumes the caller permits this over-read - confirm.
587        LD1     {v0.8b},  [x3], x0
588        LDP     d4, d5, [x5], 16
589        LD1     {v1.8b}, [x15], x0
590        LD1     {v2.8b}, [x13], x0
591        LD1     {v3.8b},  [x4], x0
592        SXTL    v0.8h, v0.8b
593        SXTL    v4.8h, v4.8b
594        SXTL    v5.8h, v5.8b
595        SXTL    v1.8h, v1.8b
596        SXTL    v2.8h, v2.8b
597        SXTL    v3.8h, v3.8b
598        SMLAL   v16.4s, v4.4h, v0.h[0]
599        SMLAL2  v20.4s, v4.8h, v0.h[0]
600        SMLAL   v24.4s, v5.4h, v0.h[0]
601        SMLAL2  v28.4s, v5.8h, v0.h[0]
602        SMLAL   v17.4s, v4.4h, v1.h[0]
603        SMLAL2  v21.4s, v4.8h, v1.h[0]
604        SMLAL   v25.4s, v5.4h, v1.h[0]
605        SMLAL2  v29.4s, v5.8h, v1.h[0]
606        SMLAL   v18.4s, v4.4h, v2.h[0]
607        SMLAL2  v22.4s, v4.8h, v2.h[0]
608        SMLAL   v26.4s, v5.4h, v2.h[0]
609        SMLAL2  v30.4s, v5.8h, v2.h[0]
610        SMLAL   v19.4s, v4.4h, v3.h[0]
611        SMLAL2  v23.4s, v4.8h, v3.h[0]
612        SMLAL   v27.4s, v5.4h, v3.h[0]
613        SMLAL2  v31.4s, v5.8h, v3.h[0]
        // One CMP serves two exits: B.LO leaves when the remainder is 1, and
        // the B.EQ after the next group leaves when it is 2.  The LDP, SXTL
        // and SMLAL instructions in between do not modify the flags.  The
        // same pattern repeats with 4 and 6 below.
614        CMP     x0, 2
615        B.LO    3b
616
617        LDP     d4, d5, [x5], 16
618        SXTL    v4.8h, v4.8b
619        SXTL    v5.8h, v5.8b
620        SMLAL   v16.4s, v4.4h, v0.h[1]
621        SMLAL2  v20.4s, v4.8h, v0.h[1]
622        SMLAL   v24.4s, v5.4h, v0.h[1]
623        SMLAL2  v28.4s, v5.8h, v0.h[1]
624        SMLAL   v17.4s, v4.4h, v1.h[1]
625        SMLAL2  v21.4s, v4.8h, v1.h[1]
626        SMLAL   v25.4s, v5.4h, v1.h[1]
627        SMLAL2  v29.4s, v5.8h, v1.h[1]
628        SMLAL   v18.4s, v4.4h, v2.h[1]
629        SMLAL2  v22.4s, v4.8h, v2.h[1]
630        SMLAL   v26.4s, v5.4h, v2.h[1]
631        SMLAL2  v30.4s, v5.8h, v2.h[1]
632        SMLAL   v19.4s, v4.4h, v3.h[1]
633        SMLAL2  v23.4s, v4.8h, v3.h[1]
634        SMLAL   v27.4s, v5.4h, v3.h[1]
635        SMLAL2  v31.4s, v5.8h, v3.h[1]
636        B.EQ    3b
637
638        LDP     d4, d5, [x5], 16
639        SXTL    v4.8h, v4.8b
640        SXTL    v5.8h, v5.8b
641        SMLAL   v16.4s, v4.4h, v0.h[2]
642        SMLAL2  v20.4s, v4.8h, v0.h[2]
643        SMLAL   v24.4s, v5.4h, v0.h[2]
644        SMLAL2  v28.4s, v5.8h, v0.h[2]
645        SMLAL   v17.4s, v4.4h, v1.h[2]
646        SMLAL2  v21.4s, v4.8h, v1.h[2]
647        SMLAL   v25.4s, v5.4h, v1.h[2]
648        SMLAL2  v29.4s, v5.8h, v1.h[2]
649        SMLAL   v18.4s, v4.4h, v2.h[2]
650        SMLAL2  v22.4s, v4.8h, v2.h[2]
651        SMLAL   v26.4s, v5.4h, v2.h[2]
652        SMLAL2  v30.4s, v5.8h, v2.h[2]
653        SMLAL   v19.4s, v4.4h, v3.h[2]
654        SMLAL2  v23.4s, v4.8h, v3.h[2]
655        SMLAL   v27.4s, v5.4h, v3.h[2]
656        SMLAL2  v31.4s, v5.8h, v3.h[2]
657        CMP     x0, 4
658        B.LO    3b
659
660        LDP     d4, d5, [x5], 16
661        SXTL    v4.8h, v4.8b
662        SXTL    v5.8h, v5.8b
663        SMLAL   v16.4s, v4.4h, v0.h[3]
664        SMLAL2  v20.4s, v4.8h, v0.h[3]
665        SMLAL   v24.4s, v5.4h, v0.h[3]
666        SMLAL2  v28.4s, v5.8h, v0.h[3]
667        SMLAL   v17.4s, v4.4h, v1.h[3]
668        SMLAL2  v21.4s, v4.8h, v1.h[3]
669        SMLAL   v25.4s, v5.4h, v1.h[3]
670        SMLAL2  v29.4s, v5.8h, v1.h[3]
671        SMLAL   v18.4s, v4.4h, v2.h[3]
672        SMLAL2  v22.4s, v4.8h, v2.h[3]
673        SMLAL   v26.4s, v5.4h, v2.h[3]
674        SMLAL2  v30.4s, v5.8h, v2.h[3]
675        SMLAL   v19.4s, v4.4h, v3.h[3]
676        SMLAL2  v23.4s, v4.8h, v3.h[3]
677        SMLAL   v27.4s, v5.4h, v3.h[3]
678        SMLAL2  v31.4s, v5.8h, v3.h[3]
679        B.EQ    3b
680
681        LDP     d4, d5, [x5], 16
682        SXTL    v4.8h, v4.8b
683        SXTL    v5.8h, v5.8b
684        SMLAL   v16.4s, v4.4h, v0.h[4]
685        SMLAL2  v20.4s, v4.8h, v0.h[4]
686        SMLAL   v24.4s, v5.4h, v0.h[4]
687        SMLAL2  v28.4s, v5.8h, v0.h[4]
688        SMLAL   v17.4s, v4.4h, v1.h[4]
689        SMLAL2  v21.4s, v4.8h, v1.h[4]
690        SMLAL   v25.4s, v5.4h, v1.h[4]
691        SMLAL2  v29.4s, v5.8h, v1.h[4]
692        SMLAL   v18.4s, v4.4h, v2.h[4]
693        SMLAL2  v22.4s, v4.8h, v2.h[4]
694        SMLAL   v26.4s, v5.4h, v2.h[4]
695        SMLAL2  v30.4s, v5.8h, v2.h[4]
696        SMLAL   v19.4s, v4.4h, v3.h[4]
697        SMLAL2  v23.4s, v4.8h, v3.h[4]
698        SMLAL   v27.4s, v5.4h, v3.h[4]
699        SMLAL2  v31.4s, v5.8h, v3.h[4]
700        CMP     x0, 6
701        B.LO    3b
702
703        LDP     d4, d5, [x5], 16
704        SXTL    v4.8h, v4.8b
705        SXTL    v5.8h, v5.8b
706        SMLAL   v16.4s, v4.4h, v0.h[5]
707        SMLAL2  v20.4s, v4.8h, v0.h[5]
708        SMLAL   v24.4s, v5.4h, v0.h[5]
709        SMLAL2  v28.4s, v5.8h, v0.h[5]
710        SMLAL   v17.4s, v4.4h, v1.h[5]
711        SMLAL2  v21.4s, v4.8h, v1.h[5]
712        SMLAL   v25.4s, v5.4h, v1.h[5]
713        SMLAL2  v29.4s, v5.8h, v1.h[5]
714        SMLAL   v18.4s, v4.4h, v2.h[5]
715        SMLAL2  v22.4s, v4.8h, v2.h[5]
716        SMLAL   v26.4s, v5.4h, v2.h[5]
717        SMLAL2  v30.4s, v5.8h, v2.h[5]
718        SMLAL   v19.4s, v4.4h, v3.h[5]
719        SMLAL2  v23.4s, v4.8h, v3.h[5]
720        SMLAL   v27.4s, v5.4h, v3.h[5]
721        SMLAL2  v31.4s, v5.8h, v3.h[5]
722        B.EQ    3b
723
        // Remainder of 7: the seventh and last k-value.
724        LDP     d4, d5, [x5], 16
725        SXTL    v4.8h, v4.8b
726        SXTL    v5.8h, v5.8b
727        SMLAL   v16.4s, v4.4h, v0.h[6]
728        SMLAL2  v20.4s, v4.8h, v0.h[6]
729        SMLAL   v24.4s, v5.4h, v0.h[6]
730        SMLAL2  v28.4s, v5.8h, v0.h[6]
731        SMLAL   v17.4s, v4.4h, v1.h[6]
732        SMLAL2  v21.4s, v4.8h, v1.h[6]
733        SMLAL   v25.4s, v5.4h, v1.h[6]
734        SMLAL2  v29.4s, v5.8h, v1.h[6]
735        SMLAL   v18.4s, v4.4h, v2.h[6]
736        SMLAL2  v22.4s, v4.8h, v2.h[6]
737        SMLAL   v26.4s, v5.4h, v2.h[6]
738        SMLAL2  v30.4s, v5.8h, v2.h[6]
739        SMLAL   v19.4s, v4.4h, v3.h[6]
740        SMLAL2  v23.4s, v4.8h, v3.h[6]
741        SMLAL   v27.4s, v5.4h, v3.h[6]
742        SMLAL2  v31.4s, v5.8h, v3.h[6]
743        B       3b
744
745        # Store odd width
        // x1 now holds nc - 16; subtracting 16 leaves bits 0-3 unchanged, so
        // they still encode the 1-15 columns that remain.  Each stage stores
        // 8 / 4 / 2 / 1 bytes per row when its bit is set, then moves the
        // next not-yet-stored bytes down to the bottom of v0-v3.
746        .p2align 3
7475:
748        TBZ     x1, 3, 6f
749        STR     d0, [x6], 8
750        STR     d1, [x8], 8
751        DUP     d0, v0.d[1]
752        DUP     d1, v1.d[1]
753        STR     d2, [x9], 8
754        STR     d3, [x7], 8
755        DUP     d2, v2.d[1]
756        DUP     d3, v3.d[1]
7576:
758        TBZ     x1, 2, 7f
759        STR     s0, [x6], 4
760        STR     s1, [x8], 4
761        DUP     s0, v0.s[1]
762        DUP     s1, v1.s[1]
763        STR     s2, [x9], 4
764        STR     s3, [x7], 4
765        DUP     s2, v2.s[1]
766        DUP     s3, v3.s[1]
7677:
768        TBZ     x1, 1, 8f
769        STR     h0, [x6], 2
770        STR     h1, [x8], 2
771        DUP     h0, v0.h[1]
772        DUP     h1, v1.h[1]
773        STR     h2, [x9], 2
774        STR     h3, [x7], 2
775        DUP     h2, v2.h[1]
776        DUP     h3, v3.h[1]
7778:
778        TBZ     x1, 0, 9f
779        STR     b0, [x6]
780        STR     b1, [x8]
781        STR     b2, [x9]
782        STR     b3, [x7]
7839:
        // A partial group is always the last one, so return without looping.
784        RET
785
786END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
787
// On ELF targets only, emit an empty .note.GNU-stack section with no flags.
// NOTE(review): by GNU toolchain convention this marks the object as not
// needing an executable stack - confirm against the project's link settings.
788#ifdef __ELF__
789.section ".note.GNU-stack","",%progbits
790#endif
791