// xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# Produces a tile of up to 4 rows by 16 columns of 8-bit output per pass of
# the outer loop (label 0).  Each SDOT consumes 4 bytes of K ("c4").  The
# 32-bit accumulators are requantized through fp32: SCVTF, FMUL by scale,
# FCVTNS, saturating narrow to 16 bits, SQADD bias, saturating narrow to
# 8 bits, then SMAX / SMIN clamp.
#
# Layout of w as read here, for each group of 16 output columns:
#   16 x 32-bit initial accumulator values (64 bytes), followed by
#   64 bytes of B for every 4 bytes of the rounded-up kc.
#
# Layout of params as read here (the pointer is walked and then rewound):
#   +0  32-bit scale, broadcast to 4 lanes and used by FMUL
#   +4  16-bit bias, added with SQADD after the first narrowing
#   +6   8-bit clamp minimum
#   +7   8-bit clamp maximum
#
# Stack frame: 32 bytes; d8-d9 at [sp], d10-d11 at [sp + 16].
#
# x0 holds mr only until the A and C pointers are clamped; afterwards it is
# reused as the K counter.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v4
# A1 x15  v1  v5
# A2 x13  v2  v6
# A3  x4  v3  v7
# B   x5  v8  v9 v10 v11
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v12 v13 v14 v15

# x14 temp for Cortex-A55 loads

BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers
        # Rows beyond mr alias the previous row, so the kernel always works on
        # 4 rows.  The flags set by each CMP stay live across the ADD, STP and
        # BIC instructions below, none of which writes the flags.
        CMP     x0, 2                   // if mr < 2

        # Stack arguments are read before sp is moved by the STP below.
        LDP     x12, x11, [sp]          // cn_stride, params

        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride

        STP     d8,  d9, [sp, -32]!     // allocate 32 bytes, save d8-d9

        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1
        BIC     x2, x2, 3               // completes the round-up of kc

        STP     d10, d11, [sp, 16]      // save d10-d11

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2

        # Outer loop: one 4x16 output tile per iteration.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # All four rows start from the same 16 values.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Are there at least 16 bytes for prologue/epilogue?
        B.LO    4f

        # prologue - read A and B values for block 0 and 1
        # v9 is loaded as two 64-bit halves: d9 now, x14 inserted by the first
        # INS of the main loop or epilogue.
        LDR     d0,  [x3], 8
        LDR     q8,  [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3,  [x4], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9,  [x5], 8
        LDR     x14,  [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    2f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 sdot instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
        #
        # A registers alternate: v0-v3 are consumed by the first two groups
        # while v4-v7 are loaded, then v4-v7 are consumed while v0-v3 are
        # reloaded for the next iteration.

        .p2align 3
1:
        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        LDR     d4,  [x3], 8

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x13], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7,  [x4], 8

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[0]
        LDR     d0,  [x3], 8

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[0]
        LDR     d1, [x15], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]
        LDR     d2, [x13], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]
        LDR     d3,  [x4], 8

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8,  [x5], 8            // First B values for block 0 and 1
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[1]
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[1]
        B.HS    1b

        # Epilogue.  Same as main loop but no preloads in final group
2:
        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        LDR     d4,  [x3], 8

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x13], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7,  [x4], 8

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[0]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[0]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[1]

        # BLOCK 2
        # From here on no further B is loaded: x5 is left pointing at the B
        # data for the remainder (label 5) or at the next tile's bias (label 0).
        SDOT    v24.4s, v10.16b, v4.4b[1]
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[1]
        SDOT    v27.4s, v10.16b, v7.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        SDOT    v29.4s, v11.16b, v5.4b[1]
        SDOT    v30.4s, v11.16b, v6.4b[1]
        SDOT    v31.4s, v11.16b, v7.4b[1]

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 5f

        # Requantize the 16 accumulators and store the tile.
        .p2align 3
3:
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, bias and clamp
        LD1R    {v4.4s}, [x11], 4
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        # Narrow 32 -> 16 bits with saturation.  Columns 0-7 of each row end
        # up in v16-v19 and columns 8-15 in v24-v27.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Narrow 16 -> 8 bits with saturation: one 16-byte row per v0-v3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 7             // rewind params pointer (4 + 2 + 1)

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.NE
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        # The SUBs between the stores do not write the flags, so B.NE below
        # still tests the result of SUBS x1, x1, 16.
        ST1     {v0.16b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        B.NE    0b

        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 32
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, it's safe to read 16 bytes.
        # Each A pointer is advanced by the remainder only, not by the 16
        # bytes loaded.  Label 4 is entered when kc < 16, label 5 after the
        # epilogue with x0 already holding the remainder.
        .p2align 3
4:
        AND     x0, x2, 15              // kc remainder 4 to 12
5:
        LDP     q8,  q9,  [x5], 32
        LDP     q10, q11,  [x5], 32
        LD1     {v0.16b},  [x3], x0
        LD1     {v1.16b}, [x15], x0
        LD1     {v2.16b}, [x13], x0
        LD1     {v3.16b},  [x4], x0
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        SDOT    v24.4s, v10.16b, v0.4b[0]
        SDOT    v25.4s, v10.16b, v1.4b[0]
        SDOT    v26.4s, v10.16b, v2.4b[0]
        SDOT    v27.4s, v10.16b, v3.4b[0]
        SDOT    v28.4s, v11.16b, v0.4b[0]
        SDOT    v29.4s, v11.16b, v1.4b[0]
        SDOT    v30.4s, v11.16b, v2.4b[0]
        SDOT    v31.4s, v11.16b, v3.4b[0]
        CMP     x0, 4
        B.LS    3b                      // remainder was 4: done
        LDP     q8,  q9,  [x5], 32
        LDP     q10, q11,  [x5], 32
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        SDOT    v19.4s,  v8.16b, v3.4b[1]
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        SDOT    v23.4s,  v9.16b, v3.4b[1]
        SDOT    v24.4s, v10.16b, v0.4b[1]
        SDOT    v25.4s, v10.16b, v1.4b[1]
        SDOT    v26.4s, v10.16b, v2.4b[1]
        SDOT    v27.4s, v10.16b, v3.4b[1]
        SDOT    v28.4s, v11.16b, v0.4b[1]
        SDOT    v29.4s, v11.16b, v1.4b[1]
        SDOT    v30.4s, v11.16b, v2.4b[1]
        SDOT    v31.4s, v11.16b, v3.4b[1]
        CMP     x0, 8
        B.LS    3b                      // remainder was 8: done
        LDP     q8,  q9,  [x5], 32
        LDP     q10, q11,  [x5], 32
        SDOT    v16.4s,  v8.16b, v0.4b[2]
        SDOT    v17.4s,  v8.16b, v1.4b[2]
        SDOT    v18.4s,  v8.16b, v2.4b[2]
        SDOT    v19.4s,  v8.16b, v3.4b[2]
        SDOT    v20.4s,  v9.16b, v0.4b[2]
        SDOT    v21.4s,  v9.16b, v1.4b[2]
        SDOT    v22.4s,  v9.16b, v2.4b[2]
        SDOT    v23.4s,  v9.16b, v3.4b[2]
        SDOT    v24.4s, v10.16b, v0.4b[2]
        SDOT    v25.4s, v10.16b, v1.4b[2]
        SDOT    v26.4s, v10.16b, v2.4b[2]
        SDOT    v27.4s, v10.16b, v3.4b[2]
        SDOT    v28.4s, v11.16b, v0.4b[2]
        SDOT    v29.4s, v11.16b, v1.4b[2]
        SDOT    v30.4s, v11.16b, v2.4b[2]
        SDOT    v31.4s, v11.16b, v3.4b[2]
        B       3b

        # Store odd width
        # x1 holds nc - 16 here; subtracting 16 leaves the low 4 bits
        # unchanged, so bits 3..0 select the 8, 4, 2 and 1 byte stores.
        # After each partial store the DUPs move the next unwritten lanes
        # down to lane 0.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
10:
        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 32
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif