xref: /aosp_15_r20/external/XNNPACK/src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qu8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const uint8_t* restrict a, x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     uint8_t* restrict c,       x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x12
23#     const union xnn_qu8_conv_minmax_params* params)  [sp + 8] -> x11
24
25# params structure is 12 bytes
26#  struct {
27#    uint8_t kernel_zero_point[4];
28#    float scale;
29#    int16_t output_zero_point;
30#    uint8_t output_min;
31#    uint8_t output_max;
32#  } fp32_neonv8;
33
34# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
35
36# Register usage
37# A0  x3  v0  v4
38# A1 x15  v1  v5
39# A2 x13  v2  v6
40# A3  x4  v3  (v0)
41# B   x5  v8  v9 v10 v11
42# C0  x6 v16 v20 v24 v28
43# C1  x8 v17 v21 v25 v29
44# C2  x9 v18 v22 v26 v30
45# C3  x7 v19 v23 v27 v31
46# zero point v7 v12 v13 v14 v15
47
48# x14 temp for Cortex-A55 loads
49
// xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
//
// Produces output in tiles of up to 4 rows x 16 columns, looping over nc in
// steps of 16 columns (label 0).  For each tile:
//   1. v16-v31 are seeded with 16 int32 values read from w (copied to all
//      four rows) and accumulate UDOT products of the B vectors (v8-v11)
//      with 4-byte groups of A.
//   2. v12-v15 accumulate UDOT(kernel_zero_point, A) for rows 0-3; each row's
//      total is subtracted from that row's accumulators (label 3).
//   3. Accumulators go int32 -> float (SCVTF), are multiplied by scale,
//      converted back with FCVTNS, narrowed with saturation to int16, offset
//      by output_zero_point (SQADD), narrowed with saturation to unsigned
//      bytes (SQXTUN) and clamped with UMAX/UMIN.
//   4. 16 bytes per row are stored, or 8/4/2/1-byte pieces when fewer than
//      16 columns remain (label 6).
//
// General-purpose register roles inside the function:
//   x0  k counter (mr is only needed by the CMPs in the pointer clamping)
//   x1  nc remaining        x2  kc rounded up to a multiple of 4
//   x11 params pointer      x12 cn_stride
//   x14 staging register for the upper 64 bits of a B vector
// d8-d15 are saved on entry and restored at both exits; no callee-saved
// general-purpose register is written.
50BEGIN_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
51
52        # Clamp A and C pointers
53        CMP     x0, 2                   // if mr < 2
54        LDP     x12, x11, [sp]          // cn_stride, params
55        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
56        ADD     x15, x3, x4             // a1 = a0 + a_stride
57        ADD     x8, x6, x7              // c1 = c0 + cm_stride
58
59        # Save d8-d15 to stack
60        STP     d8, d9, [sp, -64]!      // pre-index: sp -= 64; stores leave the CMP flags intact
61        CSEL    x15, x3, x15, LO        //   a1 = a0
62        CSEL    x8, x6,  x8, LO         //   c1 = c0
63        BIC     x2, x2, 3               // clear low 2 bits: completes the kc round-up
64        STP     d10, d11, [sp, 16]
65
66        ADD     x13, x15, x4            // a2 = a1 + a_stride
67        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
68        STP     d12, d13, [sp, 32]
69                                        // if mr <= 2
70        CSEL    x13, x15, x13, LS       //   a2 = a1
71        CSEL    x9,  x8,  x9, LS        //   c2 = c1
72        STP     d14, d15, [sp, 48]
73
74        CMP     x0, 4                   // if mr < 4
75        ADD     x4, x13, x4             // a3 = a2 + a_stride
76        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
77
78        LD1R    {v7.4s}, [x11], 4       // v7 = kernel_zero_point[4] in every 32-bit lane; x11 now at scale
79
80        CSEL    x4, x13, x4, LO         //   a3 = a2
81        CSEL    x7,  x9, x7, LO         //   c3 = c2
82
83
84        .p2align 3
850:                                      // start of one 16-column tile
86        # Load initial bias from w into accumulators
87        LDP     q16, q20, [x5], 32      // row 0: columns 0-3 and 4-7
88
89        MOVI    v12.4s, 0               // clear the zero-point sums for rows 0-3
90        MOVI    v13.4s, 0
91        MOVI    v14.4s, 0
92        MOVI    v15.4s, 0
93
94        LDP     q24, q28, [x5], 32      // row 0: columns 8-11 and 12-15
95        MOV     v17.16b, v16.16b        // rows 1-3 start from the same values as row 0
96        MOV     v18.16b, v16.16b
97        MOV     v19.16b, v16.16b
98        MOV     v21.16b, v20.16b
99        SUBS    x0, x2, 16              // k = kc - 16
100        MOV     v22.16b, v20.16b
101        MOV     v23.16b, v20.16b
102        MOV     v25.16b, v24.16b
103        MOV     v26.16b, v24.16b
104        MOV     v27.16b, v24.16b
105        MOV     v29.16b, v28.16b
106        MOV     v30.16b, v28.16b
107        MOV     v31.16b, v28.16b
108
109        # Is there at least 16 bytes for prologue/epilogue?
110        B.LO    4f                      // kc < 16: only the remainder path runs
111
112        # prologue - read A and B values for block 0 and 1
113        LDR     d0,  [x3], 8            // A0 bytes 0-7
114        LDR     q8,  [x5], 16           // first B vector, loaded whole
115        LDR     d1, [x15], 8            // A1 bytes 0-7
116        LDR     d2, [x13], 8            // A2 bytes 0-7
117        LDR     d3,  [x4], 8            // A3 bytes 0-7
118        SUBS    x0, x0, 16              // is there 16 for main loop?
119        LDR     d9,  [x5], 8            // low 64 bits of the second B vector
120        LDR     x14, [x5], 8            // high 64 bits, merged later by INS v9.d[1], x14
121        # Is there at least 16 bytes for main loop?
122        B.LO    2f                      // kc < 32: run the epilogue only
123
124        # Main loop - 16 bytes of A in 4 groups.
125        # 4 row of 4 vectors wide = 16 UDOT instructions for 4 channels
126        # 4 LD64 for A
127        # 4 LD128 for W. = 2 LD64 + INS.
128        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.
129
130        .p2align 3
1311:
132        # BLOCK 0
133        UDOT    v16.4s,  v8.16b, v0.4b[0]       // group 1: A bytes 0-3 of this 16-byte step
134        LDR     d10,  [x5], 8                   // low 64 bits of the B vector used two blocks later
135        UDOT    v17.4s,  v8.16b, v1.4b[0]
136        INS     v9.d[1], x14                    // complete v9 with the half staged in x14
137        UDOT    v18.4s,  v8.16b, v2.4b[0]
138        LDR     x14,  [x5], 8                   // stage the high 64 bits for v10
139        UDOT    v19.4s,  v8.16b, v3.4b[0]
140
141        # BLOCK 1
142        UDOT    v20.4s,  v9.16b, v0.4b[0]
143        LDR     d11,  [x5], 8
144        UDOT    v21.4s,  v9.16b, v1.4b[0]
145        INS     v10.d[1], x14
146        UDOT    v22.4s,  v9.16b, v2.4b[0]
147        LDR     x14,  [x5], 8
148        UDOT    v23.4s,  v9.16b, v3.4b[0]
149
150        # BLOCK 2
151        UDOT    v24.4s, v10.16b, v0.4b[0]
152        LDR     d8,  [x5], 8
153        UDOT    v25.4s, v10.16b, v1.4b[0]
154        INS     v11.d[1], x14
155        UDOT    v26.4s, v10.16b, v2.4b[0]
156        LDR     x14,  [x5], 8
157        UDOT    v27.4s, v10.16b, v3.4b[0]
158
159        # BLOCK 3
160        UDOT    v28.4s, v11.16b, v0.4b[0]
161        LDR     d9,  [x5], 8
162        UDOT    v29.4s, v11.16b, v1.4b[0]
163        INS     v8.d[1], x14
164        UDOT    v30.4s, v11.16b, v2.4b[0]
165        LDR     x14,  [x5], 8
166        UDOT    v31.4s, v11.16b, v3.4b[0]
167
168        UDOT    v12.2s, v7.8b, v0.8b            // zero-point sums over A bytes 0-7, two partial sums per row
169        UDOT    v13.2s, v7.8b, v1.8b
170        UDOT    v14.2s, v7.8b, v2.8b
171        UDOT    v15.2s, v7.8b, v3.8b
172
173        # BLOCK 0
174        UDOT    v16.4s,  v8.16b, v0.4b[1]       // group 2: A bytes 4-7
175        LDR     d10,  [x5], 8
176        UDOT    v17.4s,  v8.16b, v1.4b[1]
177        INS     v9.d[1], x14
178        UDOT    v18.4s,  v8.16b, v2.4b[1]
179        LDR     x14,  [x5], 8
180        UDOT    v19.4s,  v8.16b, v3.4b[1]
181        LDR     d4,  [x3], 8                    // A0 bytes 8-15
182
183        # BLOCK 1
184        UDOT    v20.4s,  v9.16b, v0.4b[1]
185        LDR     d11,  [x5], 8
186        UDOT    v21.4s,  v9.16b, v1.4b[1]
187        INS     v10.d[1], x14
188        UDOT    v22.4s,  v9.16b, v2.4b[1]
189        LDR     x14,  [x5], 8
190        UDOT    v23.4s,  v9.16b, v3.4b[1]
191        LDR     d5, [x15], 8                    // A1 bytes 8-15
192
193        # BLOCK 2
194        UDOT    v24.4s, v10.16b, v0.4b[1]
195        LDR     d8,  [x5], 8
196        UDOT    v25.4s, v10.16b, v1.4b[1]
197        INS     v11.d[1], x14
198        UDOT    v26.4s, v10.16b, v2.4b[1]
199        LDR     x14,  [x5], 8
200        UDOT    v27.4s, v10.16b, v3.4b[1]
201        LDR     d6, [x13], 8                    // A2 bytes 8-15
202
203        # BLOCK 3
204        UDOT    v28.4s, v11.16b, v0.4b[1]       // last read of A0 bytes 0-7 held in v0
205        LDR     d9,  [x5], 8
206        UDOT    v29.4s, v11.16b, v1.4b[1]
207        INS     v8.d[1], x14
208        UDOT    v30.4s, v11.16b, v2.4b[1]
209        LDR     x14,  [x5], 8
210        UDOT    v31.4s, v11.16b, v3.4b[1]
211        LDR     d0,  [x4], 8                    // A3 bytes 8-15; v0 now holds row 3 data
212
213        # BLOCK 0
214        UDOT    v16.4s,  v8.16b, v4.4b[0]       // group 3: A bytes 8-11 (rows in v4, v5, v6, v0)
215        LDR     d10,  [x5], 8
216        UDOT    v17.4s,  v8.16b, v5.4b[0]
217        INS     v9.d[1], x14
218        UDOT    v18.4s,  v8.16b, v6.4b[0]
219        LDR     x14,  [x5], 8
220        UDOT    v19.4s,  v8.16b, v0.4b[0]
221
222        # BLOCK 1
223        UDOT    v20.4s,  v9.16b, v4.4b[0]
224        LDR     d11,  [x5], 8
225        UDOT    v21.4s,  v9.16b, v5.4b[0]
226        INS     v10.d[1], x14
227        UDOT    v22.4s,  v9.16b, v6.4b[0]
228        LDR     x14,  [x5], 8
229        UDOT    v23.4s,  v9.16b, v0.4b[0]
230
231        # BLOCK 2
232        UDOT    v24.4s, v10.16b, v4.4b[0]
233        LDR     d8,  [x5], 8
234        UDOT    v25.4s, v10.16b, v5.4b[0]
235        INS     v11.d[1], x14
236        UDOT    v26.4s, v10.16b, v6.4b[0]
237        LDR     x14,  [x5], 8
238        UDOT    v27.4s, v10.16b, v0.4b[0]
239
240        # BLOCK 3
241        UDOT    v28.4s, v11.16b, v4.4b[0]
242        LDR     d9,  [x5], 8
243        UDOT    v29.4s, v11.16b, v5.4b[0]
244        INS     v8.d[1], x14
245        UDOT    v30.4s, v11.16b, v6.4b[0]
246        LDR     x14,  [x5], 8
247        UDOT    v31.4s, v11.16b, v0.4b[0]
248
249        # BLOCK 0
250        UDOT    v16.4s,  v8.16b, v4.4b[1]       // group 4: A bytes 12-15
251        LDR     d10,  [x5], 8
252        UDOT    v17.4s,  v8.16b, v5.4b[1]
253        INS     v9.d[1], x14
254        UDOT    v18.4s,  v8.16b, v6.4b[1]
255        LDR     x14,  [x5], 8
256        UDOT    v19.4s,  v8.16b, v0.4b[1]
257        LDR     d1, [x15], 8                    // preload A1 bytes 0-7 for the next step
258
259        # BLOCK 1
260        UDOT    v20.4s,  v9.16b, v4.4b[1]
261        LDR     d11,  [x5], 8
262        UDOT    v21.4s,  v9.16b, v5.4b[1]
263        INS     v10.d[1], x14
264        UDOT    v22.4s,  v9.16b, v6.4b[1]
265        LDR     x14,  [x5], 8
266        UDOT    v23.4s,  v9.16b, v0.4b[1]
267        LDR     d2, [x13], 8                    // preload A2 bytes 0-7 for the next step
268
269        # BLOCK 2
270        UDOT    v24.4s, v10.16b, v4.4b[1]
271        LDR     d8,  [x5], 8            // First B values for block 0 and 1
272        UDOT    v25.4s, v10.16b, v5.4b[1]
273        INS     v11.d[1], x14
274        UDOT    v26.4s, v10.16b, v6.4b[1]
275        LDR     x14,  [x5], 8
276        UDOT    v27.4s, v10.16b, v0.4b[1]
277        LDR     d3,  [x4], 8                    // preload A3 bytes 0-7 for the next step
278
279        # BLOCK 3 special
280        UDOT    v31.4s, v11.16b, v0.4b[1]       // row 3 goes first so v0 can be reloaded below
281        LDR     d9,  [x5], 8
282        UDOT    v15.2s, v7.8b, v0.8b    // free up v0 early
283        INS     v8.d[1], x14
284        UDOT    v28.4s, v11.16b, v4.4b[1]
285        LDR     x14,  [x5], 8
286        UDOT    v29.4s, v11.16b, v5.4b[1]
287        LDR     d0,  [x3], 8                    // preload A0 bytes 0-7 for the next step
288        UDOT    v30.4s, v11.16b, v6.4b[1]
289        SUBS    x0, x0, 16                      // k -= 16; flags consumed by B.HS below
290
291        UDOT    v12.2s, v7.8b, v4.8b            // zero-point sums over A bytes 8-15 for rows 0-2
292        UDOT    v13.2s, v7.8b, v5.8b
293        UDOT    v14.2s, v7.8b, v6.8b
294        B.HS    1b                              // repeat while the SUBS above did not borrow
295
296        # Epilogue.  Same as main loop but no preloads in final group
2972:
298        # BLOCK 0
299        UDOT    v16.4s,  v8.16b, v0.4b[0]       // group 1: A bytes 0-3
300        LDR     d10,  [x5], 8
301        UDOT    v17.4s,  v8.16b, v1.4b[0]
302        INS     v9.d[1], x14
303        UDOT    v18.4s,  v8.16b, v2.4b[0]
304        LDR     x14,  [x5], 8
305        UDOT    v19.4s,  v8.16b, v3.4b[0]
306
307        # BLOCK 1
308        UDOT    v20.4s,  v9.16b, v0.4b[0]
309        LDR     d11,  [x5], 8
310        UDOT    v21.4s,  v9.16b, v1.4b[0]
311        INS     v10.d[1], x14
312        UDOT    v22.4s,  v9.16b, v2.4b[0]
313        LDR     x14,  [x5], 8
314        UDOT    v23.4s,  v9.16b, v3.4b[0]
315
316        # BLOCK 2
317        UDOT    v24.4s, v10.16b, v0.4b[0]
318        LDR     d8,  [x5], 8
319        UDOT    v25.4s, v10.16b, v1.4b[0]
320        INS     v11.d[1], x14
321        UDOT    v26.4s, v10.16b, v2.4b[0]
322        LDR     x14,  [x5], 8
323        UDOT    v27.4s, v10.16b, v3.4b[0]
324
325        # BLOCK 3
326        UDOT    v28.4s, v11.16b, v0.4b[0]
327        LDR     d9,  [x5], 8
328        UDOT    v29.4s, v11.16b, v1.4b[0]
329        INS     v8.d[1], x14
330        UDOT    v30.4s, v11.16b, v2.4b[0]
331        LDR     x14,  [x5], 8
332        UDOT    v31.4s, v11.16b, v3.4b[0]
333
334        UDOT    v12.2s, v7.8b, v0.8b            // zero-point sums over A bytes 0-7
335        UDOT    v13.2s, v7.8b, v1.8b
336        UDOT    v14.2s, v7.8b, v2.8b
337        UDOT    v15.2s, v7.8b, v3.8b
338
339        # BLOCK 0
340        UDOT    v16.4s,  v8.16b, v0.4b[1]       // group 2: A bytes 4-7
341        LDR     d10,  [x5], 8
342        UDOT    v17.4s,  v8.16b, v1.4b[1]
343        INS     v9.d[1], x14
344        UDOT    v18.4s,  v8.16b, v2.4b[1]
345        LDR     x14,  [x5], 8
346        UDOT    v19.4s,  v8.16b, v3.4b[1]
347        LDR     d4,  [x3], 8                    // A0 bytes 8-15
348
349        # BLOCK 1
350        UDOT    v20.4s,  v9.16b, v0.4b[1]
351        LDR     d11,  [x5], 8
352        UDOT    v21.4s,  v9.16b, v1.4b[1]
353        INS     v10.d[1], x14
354        UDOT    v22.4s,  v9.16b, v2.4b[1]
355        LDR     x14,  [x5], 8
356        UDOT    v23.4s,  v9.16b, v3.4b[1]
357        LDR     d5, [x15], 8                    // A1 bytes 8-15
358
359        # BLOCK 2
360        UDOT    v24.4s, v10.16b, v0.4b[1]
361        LDR     d8,  [x5], 8
362        UDOT    v25.4s, v10.16b, v1.4b[1]
363        INS     v11.d[1], x14
364        UDOT    v26.4s, v10.16b, v2.4b[1]
365        LDR     x14,  [x5], 8
366        UDOT    v27.4s, v10.16b, v3.4b[1]
367        LDR     d6, [x13], 8                    // A2 bytes 8-15
368
369        # BLOCK 3
370        UDOT    v28.4s, v11.16b, v0.4b[1]
371        LDR     d9,  [x5], 8
372        UDOT    v29.4s, v11.16b, v1.4b[1]
373        INS     v8.d[1], x14
374        UDOT    v30.4s, v11.16b, v2.4b[1]
375        LDR     x14,  [x5], 8
376        UDOT    v31.4s, v11.16b, v3.4b[1]
377        LDR     d0,  [x4], 8                    // A3 bytes 8-15; v0 now holds row 3 data
378
379        # BLOCK 0
380        UDOT    v16.4s,  v8.16b, v4.4b[0]       // group 3: A bytes 8-11
381        LDR     d10,  [x5], 8
382        UDOT    v17.4s,  v8.16b, v5.4b[0]
383        INS     v9.d[1], x14
384        UDOT    v18.4s,  v8.16b, v6.4b[0]
385        LDR     x14,  [x5], 8
386        UDOT    v19.4s,  v8.16b, v0.4b[0]
387
388        # BLOCK 1
389        UDOT    v20.4s,  v9.16b, v4.4b[0]
390        LDR     d11,  [x5], 8
391        UDOT    v21.4s,  v9.16b, v5.4b[0]
392        INS     v10.d[1], x14
393        UDOT    v22.4s,  v9.16b, v6.4b[0]
394        LDR     x14,  [x5], 8
395        UDOT    v23.4s,  v9.16b, v0.4b[0]
396
397        # BLOCK 2
398        UDOT    v24.4s, v10.16b, v4.4b[0]
399        LDR     d8,  [x5], 8
400        UDOT    v25.4s, v10.16b, v5.4b[0]
401        INS     v11.d[1], x14
402        UDOT    v26.4s, v10.16b, v6.4b[0]
403        LDR     x14,  [x5], 8
404        UDOT    v27.4s, v10.16b, v0.4b[0]
405
406        # BLOCK 3
407        UDOT    v28.4s, v11.16b, v4.4b[0]
408        LDR     d9,  [x5], 8
409        UDOT    v29.4s, v11.16b, v5.4b[0]
410        INS     v8.d[1], x14
411        UDOT    v30.4s, v11.16b, v6.4b[0]
412        LDR     x14,  [x5], 8
413        UDOT    v31.4s, v11.16b, v0.4b[0]
414
415        # BLOCK 0
416        UDOT    v16.4s,  v8.16b, v4.4b[1]       // group 4: A bytes 12-15; no A preloads from here on
417        LDR     d10,  [x5], 8
418        UDOT    v17.4s,  v8.16b, v5.4b[1]
419        INS     v9.d[1], x14
420        UDOT    v18.4s,  v8.16b, v6.4b[1]
421        LDR     x14,  [x5], 8
422        UDOT    v19.4s,  v8.16b, v0.4b[1]
423
424        # BLOCK 1
425        UDOT    v20.4s,  v9.16b, v4.4b[1]
426        LDR     d11,  [x5], 8
427        UDOT    v21.4s,  v9.16b, v5.4b[1]
428        INS     v10.d[1], x14
429        UDOT    v22.4s,  v9.16b, v6.4b[1]
430        LDR     x14,  [x5], 8
431        UDOT    v23.4s,  v9.16b, v0.4b[1]
432
433        # BLOCK 2
434        UDOT    v24.4s, v10.16b, v4.4b[1]
435        UDOT    v25.4s, v10.16b, v5.4b[1]
436        INS     v11.d[1], x14                   // final B half; nothing further is read from x5 here
437        UDOT    v26.4s, v10.16b, v6.4b[1]
438        UDOT    v27.4s, v10.16b, v0.4b[1]
439
440        # BLOCK 3
441        UDOT    v28.4s, v11.16b, v4.4b[1]
442        UDOT    v29.4s, v11.16b, v5.4b[1]
443        UDOT    v30.4s, v11.16b, v6.4b[1]
444        UDOT    v31.4s, v11.16b, v0.4b[1]
445        AND     x0, x2, 15              // kc remainder 0 to 12
446
447        UDOT    v12.2s, v7.8b, v4.8b            // zero-point sums over A bytes 8-15
448        UDOT    v13.2s, v7.8b, v5.8b
449        UDOT    v14.2s, v7.8b, v6.8b
450        UDOT    v15.2s, v7.8b, v0.8b
451
452        # Is there a remainder?- 4 to 12 bytes of A
453        CBNZ    x0, 4f
454
4553:
456        ADDP    v0.2s, v12.2s, v13.2s           // v0 = {row 0 total, row 1 total}
457        ADDP    v1.2s, v14.2s, v15.2s           // v1 = {row 2 total, row 3 total}
458        DUP     v12.4s, v0.s[0]                 // broadcast each row total to 4 lanes
459        DUP     v13.4s, v0.s[1]
460        DUP     v14.4s, v1.s[0]
461        DUP     v15.4s, v1.s[1]
462
463        # Subtract zero point from accumulators
464        SUB     v16.4s, v16.4s, v12.4s
465        SUB     v17.4s, v17.4s, v13.4s
466        SUB     v18.4s, v18.4s, v14.4s
467        SUB     v19.4s, v19.4s, v15.4s
468        SUB     v20.4s, v20.4s, v12.4s
469        SUB     v21.4s, v21.4s, v13.4s
470        SUB     v22.4s, v22.4s, v14.4s
471        SUB     v23.4s, v23.4s, v15.4s
472        SUB     v24.4s, v24.4s, v12.4s
473        SUB     v25.4s, v25.4s, v13.4s
474        SUB     v26.4s, v26.4s, v14.4s
475        SUB     v27.4s, v27.4s, v15.4s
476        SUB     v28.4s, v28.4s, v12.4s
477        SUB     v29.4s, v29.4s, v13.4s
478        SUB     v30.4s, v30.4s, v14.4s
479        SUB     v31.4s, v31.4s, v15.4s
480
481        SCVTF   v16.4s, v16.4s                  // int32 -> float
482        SCVTF   v17.4s, v17.4s
483        # Apply params - scale, bias and clamp
484        LD1R    {v4.4s}, [x11], 4               // scale, replicated; x11 now at output_zero_point
485        SCVTF   v18.4s, v18.4s
486        SCVTF   v19.4s, v19.4s
487        SCVTF   v20.4s, v20.4s
488        SCVTF   v21.4s, v21.4s
489        SCVTF   v22.4s, v22.4s
490        SCVTF   v23.4s, v23.4s
491        SCVTF   v24.4s, v24.4s
492        SCVTF   v25.4s, v25.4s
493        SCVTF   v26.4s, v26.4s
494        SCVTF   v27.4s, v27.4s
495        SCVTF   v28.4s, v28.4s
496        SCVTF   v29.4s, v29.4s
497        SCVTF   v30.4s, v30.4s
498        SCVTF   v31.4s, v31.4s
499
500        FMUL    v16.4s, v16.4s, v4.4s           // multiply by scale
501        FMUL    v17.4s, v17.4s, v4.4s
502        FMUL    v18.4s, v18.4s, v4.4s
503        FMUL    v19.4s, v19.4s, v4.4s
504        FMUL    v20.4s, v20.4s, v4.4s
505        FMUL    v21.4s, v21.4s, v4.4s
506        FMUL    v22.4s, v22.4s, v4.4s
507        FMUL    v23.4s, v23.4s, v4.4s
508        FMUL    v24.4s, v24.4s, v4.4s
509        FMUL    v25.4s, v25.4s, v4.4s
510        FMUL    v26.4s, v26.4s, v4.4s
511        FMUL    v27.4s, v27.4s, v4.4s
512        FMUL    v28.4s, v28.4s, v4.4s
513        FMUL    v29.4s, v29.4s, v4.4s
514        FMUL    v30.4s, v30.4s, v4.4s
515        FMUL    v31.4s, v31.4s, v4.4s
516
517        FCVTNS  v16.4s, v16.4s                  // float -> int32, rounding to nearest
518        FCVTNS  v17.4s, v17.4s
519        FCVTNS  v18.4s, v18.4s
520        FCVTNS  v19.4s, v19.4s
521        FCVTNS  v20.4s, v20.4s
522        FCVTNS  v21.4s, v21.4s
523        FCVTNS  v22.4s, v22.4s
524        FCVTNS  v23.4s, v23.4s
525        FCVTNS  v24.4s, v24.4s
526        FCVTNS  v25.4s, v25.4s
527        FCVTNS  v26.4s, v26.4s
528        FCVTNS  v27.4s, v27.4s
529        FCVTNS  v28.4s, v28.4s
530        FCVTNS  v29.4s, v29.4s
531        FCVTNS  v30.4s, v30.4s
532        FCVTNS  v31.4s, v31.4s
533
534        SQXTN   v16.4h, v16.4s                  // saturating narrow to int16: columns 0-3
535        SQXTN   v17.4h, v17.4s
536        SQXTN   v18.4h, v18.4s
537        SQXTN   v19.4h, v19.4s
538        SQXTN   v24.4h, v24.4s                  // columns 8-11
539        SQXTN   v25.4h, v25.4s
540        SQXTN   v26.4h, v26.4s
541        SQXTN   v27.4h, v27.4s
542        LD1R    {v6.8h}, [x11], 2       // output_zero_point, added by SQADD below
543
544        SQXTN2  v16.8h, v20.4s                  // columns 4-7 into the upper half
545        SQXTN2  v17.8h, v21.4s
546        SQXTN2  v18.8h, v22.4s
547        SQXTN2  v19.8h, v23.4s
548        SQXTN2  v24.8h, v28.4s                  // columns 12-15 into the upper half
549        SQXTN2  v25.8h, v29.4s
550        SQXTN2  v26.8h, v30.4s
551        SQXTN2  v27.8h, v31.4s
552
553        SQADD   v16.8h, v16.8h, v6.8h           // saturating add of output_zero_point
554        SQADD   v17.8h, v17.8h, v6.8h
555        SQADD   v18.8h, v18.8h, v6.8h
556        SQADD   v19.8h, v19.8h, v6.8h
557        SQADD   v24.8h, v24.8h, v6.8h
558        SQADD   v25.8h, v25.8h, v6.8h
559        SQADD   v26.8h, v26.8h, v6.8h
560        SQADD   v27.8h, v27.8h, v6.8h
561        LD1R    {v4.16b}, [x11], 1      // clamp min value
562
563        SQXTUN  v0.8b, v16.8h                   // saturating narrow to unsigned bytes: columns 0-7
564        SQXTUN  v1.8b, v17.8h
565        SQXTUN  v2.8b, v18.8h
566        SQXTUN  v3.8b, v19.8h
567        LD1R    {v5.16b}, [x11]         // clamp max value
568        SQXTUN2 v0.16b, v24.8h                  // columns 8-15
569        SQXTUN2 v1.16b, v25.8h
570        SQXTUN2 v2.16b, v26.8h
571        SQXTUN2 v3.16b, v27.8h
572
573        SUB     x11, x11, 7             // rewind params pointer to scale (undo +4 +2 +1)
574
575        UMAX    v0.16b, v0.16b, v4.16b          // clamp below with output_min
576        UMAX    v1.16b, v1.16b, v4.16b
577        UMAX    v2.16b, v2.16b, v4.16b
578        UMAX    v3.16b, v3.16b, v4.16b
579        SUBS    x1, x1, 16                      // nc -= 16; flags feed B.LO and B.NE below
580        UMIN    v0.16b, v0.16b, v5.16b          // clamp above with output_max
581        UMIN    v1.16b, v1.16b, v5.16b
582        UMIN    v2.16b, v2.16b, v5.16b
583        UMIN    v3.16b, v3.16b, v5.16b
584        B.LO    6f                              // fewer than 16 columns left: partial store
585
586        # Store full 4 x 16
587        ST1     {v0.16b}, [x6], x12             // c0 advances by cn_stride
588        SUB     x3,  x3, x2             // a0 -= kc
589        ST1     {v1.16b}, [x8], x12
590        SUB     x15, x15, x2            // a1 -= kc
591        ST1     {v2.16b}, [x9], x12
592        SUB     x13, x13, x2            // a2 -= kc
593        ST1     {v3.16b}, [x7], x12
594        SUB     x4,  x4, x2             // a3 -= kc
595        B.NE    0b                              // nc != 0 after the SUBS above: next tile
596
597        # Restore d8-d15 from stack
598        LDP     d14, d15, [sp, 48]
599        LDP     d12, d13, [sp, 32]
600        LDP     d10, d11, [sp, 16]
601        LDP     d8, d9, [sp], 64                // post-index: releases the 64-byte save area
602        RET
603
604        # Remainder- 4 to 12 bytes of A
605        .p2align 3
6064:
607        TBZ     x0, 3, 5f                       // bit 3 clear: no 8-byte chunk, go to the 4-byte chunk
608
609        LDR     d0,  [x3], 8                    // 8 bytes of A per row
610        LDP     q8,  q9,  [x5], 32
611        LDR     d1, [x15], 8
612        LDR     d2, [x13], 8
613        LDR     d3,  [x4], 8
614        LDP     q10, q11, [x5], 32
615        UDOT    v12.2s, v7.8b, v0.8b
616        UDOT    v13.2s, v7.8b, v1.8b
617        UDOT    v14.2s, v7.8b, v2.8b
618        UDOT    v15.2s, v7.8b, v3.8b
619        UDOT    v16.4s,  v8.16b, v0.4b[0]
620        UDOT    v17.4s,  v8.16b, v1.4b[0]
621        UDOT    v18.4s,  v8.16b, v2.4b[0]
622        UDOT    v19.4s,  v8.16b, v3.4b[0]
623        UDOT    v20.4s,  v9.16b, v0.4b[0]
624        UDOT    v21.4s,  v9.16b, v1.4b[0]
625        UDOT    v22.4s,  v9.16b, v2.4b[0]
626        UDOT    v23.4s,  v9.16b, v3.4b[0]
627        UDOT    v24.4s, v10.16b, v0.4b[0]
628        UDOT    v25.4s, v10.16b, v1.4b[0]
629        UDOT    v26.4s, v10.16b, v2.4b[0]
630        UDOT    v27.4s, v10.16b, v3.4b[0]
631        UDOT    v28.4s, v11.16b, v0.4b[0]
632        UDOT    v29.4s, v11.16b, v1.4b[0]
633        UDOT    v30.4s, v11.16b, v2.4b[0]
634        UDOT    v31.4s, v11.16b, v3.4b[0]
635        LDP     q8,  q9,  [x5], 32              // B vectors for A bytes 4-7
636        LDP     q10, q11, [x5], 32
637        UDOT    v16.4s,  v8.16b, v0.4b[1]
638        UDOT    v17.4s,  v8.16b, v1.4b[1]
639        UDOT    v18.4s,  v8.16b, v2.4b[1]
640        UDOT    v19.4s,  v8.16b, v3.4b[1]
641        UDOT    v20.4s,  v9.16b, v0.4b[1]
642        UDOT    v21.4s,  v9.16b, v1.4b[1]
643        UDOT    v22.4s,  v9.16b, v2.4b[1]
644        UDOT    v23.4s,  v9.16b, v3.4b[1]
645        UDOT    v24.4s, v10.16b, v0.4b[1]
646        UDOT    v25.4s, v10.16b, v1.4b[1]
647        UDOT    v26.4s, v10.16b, v2.4b[1]
648        UDOT    v27.4s, v10.16b, v3.4b[1]
649        UDOT    v28.4s, v11.16b, v0.4b[1]
650        UDOT    v29.4s, v11.16b, v1.4b[1]
651        UDOT    v30.4s, v11.16b, v2.4b[1]
652        UDOT    v31.4s, v11.16b, v3.4b[1]
653        TBZ     x0, 2, 3b                       // bit 2 clear: no 4-byte chunk, finish the tile
6545:
655        LDR     s0,  [x3], 4                    // 4 bytes of A per row; the load zeroes the rest of v0
656        LDP     q8,  q9,  [x5], 32
657        LDR     s1, [x15], 4
658        LDR     s2, [x13], 4
659        LDR     s3,  [x4], 4
660        LDP     q10, q11, [x5], 32
661        UDOT    v12.2s, v7.8b, v0.8b            // upper lane multiplies zeroed bytes and adds nothing
662        UDOT    v13.2s, v7.8b, v1.8b
663        UDOT    v14.2s, v7.8b, v2.8b
664        UDOT    v15.2s, v7.8b, v3.8b
665        UDOT    v16.4s,  v8.16b, v0.4b[0]
666        UDOT    v17.4s,  v8.16b, v1.4b[0]
667        UDOT    v18.4s,  v8.16b, v2.4b[0]
668        UDOT    v19.4s,  v8.16b, v3.4b[0]
669        UDOT    v20.4s,  v9.16b, v0.4b[0]
670        UDOT    v21.4s,  v9.16b, v1.4b[0]
671        UDOT    v22.4s,  v9.16b, v2.4b[0]
672        UDOT    v23.4s,  v9.16b, v3.4b[0]
673        UDOT    v24.4s, v10.16b, v0.4b[0]
674        UDOT    v25.4s, v10.16b, v1.4b[0]
675        UDOT    v26.4s, v10.16b, v2.4b[0]
676        UDOT    v27.4s, v10.16b, v3.4b[0]
677        UDOT    v28.4s, v11.16b, v0.4b[0]
678        UDOT    v29.4s, v11.16b, v1.4b[0]
679        UDOT    v30.4s, v11.16b, v2.4b[0]
680        UDOT    v31.4s, v11.16b, v3.4b[0]
681        B       3b
682
683        # Store odd width
684        .p2align 3
6856:
686        TBZ     x1, 3, 7f                       // x1 = nc - 16 here; its low 4 bits equal those of nc
687        STR     d0, [x6], 8                     // 8 bytes per row
688        STR     d1, [x8], 8
689        DUP     d0, v0.d[1]                     // move the upper 8 bytes down for the smaller pieces
690        DUP     d1, v1.d[1]
691        STR     d2, [x9], 8
692        STR     d3, [x7], 8
693        DUP     d2, v2.d[1]
694        DUP     d3, v3.d[1]
6957:
696        TBZ     x1, 2, 8f
697        STR     s0, [x6], 4                     // 4 bytes per row
698        STR     s1, [x8], 4
699        DUP     s0, v0.s[1]
700        DUP     s1, v1.s[1]
701        STR     s2, [x9], 4
702        STR     s3, [x7], 4
703        DUP     s2, v2.s[1]
704        DUP     s3, v3.s[1]
7058:
706        TBZ     x1, 1, 9f
707        STR     h0, [x6], 2                     // 2 bytes per row
708        STR     h1, [x8], 2
709        DUP     h0, v0.h[1]
710        DUP     h1, v1.h[1]
711        STR     h2, [x9], 2
712        STR     h3, [x7], 2
713        DUP     h2, v2.h[1]
714        DUP     h3, v3.h[1]
7159:
716        TBZ     x1, 0, 10f
717        STR     b0, [x6]                        // final single byte per row
718        STR     b1, [x8]
719        STR     b2, [x9]
720        STR     b3, [x7]
72110:
722        # Restore d8-d15 from stack
723        LDP     d14, d15, [sp, 48]
724        LDP     d12, d13, [sp, 32]
725        LDP     d10, d11, [sp, 16]
726        LDP     d8, d9, [sp], 64                // post-index: releases the 64-byte save area
727        RET
728
729END_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
730
// ELF targets only: emit an empty .note.GNU-stack section, the marker GNU
// toolchains use to record that this object does not need an executable stack.
731#ifdef __ELF__
732.section ".note.GNU-stack","",%progbits
733#endif
734