xref: /aosp_15_r20/external/XNNPACK/src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qu8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const uint8_t* restrict a, x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     uint8_t* restrict c,       x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x12
23#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11
24
25# params structure is 20 bytes
26#  struct {
27#    uint8_t kernel_zero_point[4];
28#    int32_t right_pre_shift;
29#    int32_t multiplier;
30#    int32_t right_post_shift;
31#    int16_t output_zero_point;
32#    uint8_t output_min;
33#    uint8_t output_max;
34#  } rndnu_neon;
35
36# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
37
38# Register usage
39# A0  x3  v0  v4
40# A1 x15  v1  v5
41# A2 x13  v2  v6
42# A3  x4  v3  (v0)
43# B   x5  v8  v9 v10 v11
44# C0  x6 v16 v20 v24 v28
45# C1  x8 v17 v21 v25 v29
46# C2  x9 v18 v22 v26 v30
47# C3  x7 v19 v23 v27 v31
48# zero point v7 v12 v13 v14 v15
49
50# x14 temp for Cortex-A55 loads
51
52BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55
        // GEMM micro-kernel producing up to 4 rows x 16 byte-wide columns of C
        // per pass over the packed weights at x5.  Each UDOT (by element)
        // accumulates one 4-byte group of an A row against 16 bytes of B.
        // v12-v15 separately accumulate dot(kernel_zero_point, A row); that
        // term is subtracted from every accumulator of the row before the
        // result is shifted/scaled, narrowed to bytes and clamped.
        // Rows at index >= mr alias the previous row's A and C pointers (CSELs
        // below), so all 4 rows are always computed and stored.
        // x0 (mr) is reused as the k counter once the pointers are set up.
53
54        # Clamp A and C pointers
55        CMP     x0, 2                   // if mr < 2
        // The two stack arguments are read before sp is moved by the STP below.
56        LDP     x12, x11, [sp]          // cn_stride, params
57        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
58        ADD     x15, x3, x4             // a1 = a0 + a_stride
59        ADD     x8, x6, x7              // c1 = c0 + cm_stride
60
61        # Save d8-d15 to stack
62        STP     d8, d9, [sp, -64]!      // pre-decrement sp by 64 bytes for the 8 saved registers
63        CSEL    x15, x3, x15, LO        //   a1 = a0
64        CSEL    x8, x6,  x8, LO         //   c1 = c0
65        BIC     x2, x2, 3               // clear low 2 bits: completes the round-up of kc to a multiple of 4
66        STP     d10, d11, [sp, 16]
67
68        ADD     x13, x15, x4            // a2 = a1 + a_stride
69        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
70        STP     d12, d13, [sp, 32]
        // Flags are still those of "CMP x0, 2": nothing in between sets them.
71                                        // if mr <= 2
72        CSEL    x13, x15, x13, LS       //   a2 = a1
73        CSEL    x9,  x8,  x9, LS        //   c2 = c1
74        STP     d14, d15, [sp, 48]
75
76        CMP     x0, 4                   // if mr < 4
77        ADD     x4, x13, x4             // a3 = a2 + a_stride
78        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
79
        // Loaded once for the whole call; x11 is left pointing at the next
        // params field and is rewound to that same position after each tile.
80        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point
81
82        CSEL    x4, x13, x4, LO         //   a3 = a2
83        CSEL    x7,  x9, x7, LO         //   c3 = c2
84
85
86        .p2align 3
        // Outer loop: one iteration per 16-column tile (re-entered via B.NE 0b).
870:
88        # Load initial bias from w into accumulators
89        LDP     q16, q20, [x5], 32      // row 0 accumulators, columns 0-3 and 4-7
90
        // v12-v15: per-row accumulators for dot(kernel_zero_point, A bytes)
91        MOVI    v12.4s, 0
92        MOVI    v13.4s, 0
93        MOVI    v14.4s, 0
94        MOVI    v15.4s, 0
95
96        LDP     q24, q28, [x5], 32      // row 0 accumulators, columns 8-11 and 12-15
        // Copy the row 0 initial values into the accumulators of rows 1-3.
97        MOV     v17.16b, v16.16b
98        MOV     v18.16b, v16.16b
99        MOV     v19.16b, v16.16b
100        MOV     v21.16b, v20.16b
101        SUBS    x0, x2, 16              // k = kc - 16
102        MOV     v22.16b, v20.16b
103        MOV     v23.16b, v20.16b
104        MOV     v25.16b, v24.16b
105        MOV     v26.16b, v24.16b
106        MOV     v27.16b, v24.16b
107        MOV     v29.16b, v28.16b
108        MOV     v30.16b, v28.16b
109        MOV     v31.16b, v28.16b
110
111        # Is there at least 16 bytes for prologue/epilogue?
112        B.LO    4f                      // kc < 16: only the 4/8/12-byte remainder path runs
113
114        # prologue - read A and B values for block 0 and 1
115        LDR     d0,  [x3], 8            // first 8 bytes of A0
116        LDR     q8,  [x5], 16           // B vector for BLOCK 0
117        LDR     d1, [x15], 8            // first 8 bytes of A1
118        LDR     d2, [x13], 8            // first 8 bytes of A2
119        LDR     d3,  [x4], 8            // first 8 bytes of A3
120        SUBS    x0, x0, 16              // is there 16 for main loop?
121        LDR     d9,  [x5], 8            // low half of the B vector for BLOCK 1
122        LDR     x14, [x5], 8            // high half, held in x14 until "INS v9.d[1], x14"
123        # Is there at least 16 bytes for main loop?
124        B.LO    2f
125
126        # Main loop - 16 bytes of A in 4 groups.
127        # 4 row of 4 vectors wide = 16 UDOT instructions for 4 channels
128        # 4 LD64 for A
129        # 4 LD128 for W. = 2 LD64 + INS.
130        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.
131
        // B vectors rotate through v8..v11.  Within each BLOCK the INS completes
        // the vector used by the next BLOCK, while "LDR dN" / "LDR x14" fetch the
        // low / high halves of the vector used two BLOCKs later.
132        .p2align 3
1331:
        // Group 1: A bytes 0-3 of each row (lane 0 of v0-v3)
134        # BLOCK 0
135        UDOT    v16.4s,  v8.16b, v0.4b[0]
136        LDR     d10,  [x5], 8
137        UDOT    v17.4s,  v8.16b, v1.4b[0]
138        INS     v9.d[1], x14
139        UDOT    v18.4s,  v8.16b, v2.4b[0]
140        LDR     x14,  [x5], 8
141        UDOT    v19.4s,  v8.16b, v3.4b[0]
142
143        # BLOCK 1
144        UDOT    v20.4s,  v9.16b, v0.4b[0]
145        LDR     d11,  [x5], 8
146        UDOT    v21.4s,  v9.16b, v1.4b[0]
147        INS     v10.d[1], x14
148        UDOT    v22.4s,  v9.16b, v2.4b[0]
149        LDR     x14,  [x5], 8
150        UDOT    v23.4s,  v9.16b, v3.4b[0]
151
152        # BLOCK 2
153        UDOT    v24.4s, v10.16b, v0.4b[0]
154        LDR     d8,  [x5], 8
155        UDOT    v25.4s, v10.16b, v1.4b[0]
156        INS     v11.d[1], x14
157        UDOT    v26.4s, v10.16b, v2.4b[0]
158        LDR     x14,  [x5], 8
159        UDOT    v27.4s, v10.16b, v3.4b[0]
160
161        # BLOCK 3
162        UDOT    v28.4s, v11.16b, v0.4b[0]
163        LDR     d9,  [x5], 8
164        UDOT    v29.4s, v11.16b, v1.4b[0]
165        INS     v8.d[1], x14
166        UDOT    v30.4s, v11.16b, v2.4b[0]
167        LDR     x14,  [x5], 8
168        UDOT    v31.4s, v11.16b, v3.4b[0]
169
        // Zero-point term: each of the 2 lanes accumulates the dot product of the
        // 4 kernel_zero_point bytes with 4 A bytes; lanes are summed by ADDP at 3:.
170        UDOT    v12.2s, v7.8b, v0.8b
171        UDOT    v13.2s, v7.8b, v1.8b
172        UDOT    v14.2s, v7.8b, v2.8b
173        UDOT    v15.2s, v7.8b, v3.8b
174
        // Group 2: A bytes 4-7 of each row (lane 1 of v0-v3); the second 8 bytes
        // of A for this 16-byte step are loaded into v4, v5, v6 and v0.
175        # BLOCK 0
176        UDOT    v16.4s,  v8.16b, v0.4b[1]
177        LDR     d10,  [x5], 8
178        UDOT    v17.4s,  v8.16b, v1.4b[1]
179        INS     v9.d[1], x14
180        UDOT    v18.4s,  v8.16b, v2.4b[1]
181        LDR     x14,  [x5], 8
182        UDOT    v19.4s,  v8.16b, v3.4b[1]
183        LDR     d4,  [x3], 8            // A0 bytes 8-15
184
185        # BLOCK 1
186        UDOT    v20.4s,  v9.16b, v0.4b[1]
187        LDR     d11,  [x5], 8
188        UDOT    v21.4s,  v9.16b, v1.4b[1]
189        INS     v10.d[1], x14
190        UDOT    v22.4s,  v9.16b, v2.4b[1]
191        LDR     x14,  [x5], 8
192        UDOT    v23.4s,  v9.16b, v3.4b[1]
193        LDR     d5, [x15], 8            // A1 bytes 8-15
194
195        # BLOCK 2
196        UDOT    v24.4s, v10.16b, v0.4b[1]
197        LDR     d8,  [x5], 8
198        UDOT    v25.4s, v10.16b, v1.4b[1]
199        INS     v11.d[1], x14
200        UDOT    v26.4s, v10.16b, v2.4b[1]
201        LDR     x14,  [x5], 8
202        UDOT    v27.4s, v10.16b, v3.4b[1]
203        LDR     d6, [x13], 8            // A2 bytes 8-15
204
205        # BLOCK 3
206        UDOT    v28.4s, v11.16b, v0.4b[1]
207        LDR     d9,  [x5], 8
208        UDOT    v29.4s, v11.16b, v1.4b[1]
209        INS     v8.d[1], x14
210        UDOT    v30.4s, v11.16b, v2.4b[1]
211        LDR     x14,  [x5], 8
212        UDOT    v31.4s, v11.16b, v3.4b[1]
213        LDR     d0,  [x4], 8            // A3 bytes 8-15 go into v0: its row 0 data was last read above
214
        // Group 3: A bytes 8-11 (lane 0 of v4, v5, v6 and v0, where v0 is row 3)
215        # BLOCK 0
216        UDOT    v16.4s,  v8.16b, v4.4b[0]
217        LDR     d10,  [x5], 8
218        UDOT    v17.4s,  v8.16b, v5.4b[0]
219        INS     v9.d[1], x14
220        UDOT    v18.4s,  v8.16b, v6.4b[0]
221        LDR     x14,  [x5], 8
222        UDOT    v19.4s,  v8.16b, v0.4b[0]
223
224        # BLOCK 1
225        UDOT    v20.4s,  v9.16b, v4.4b[0]
226        LDR     d11,  [x5], 8
227        UDOT    v21.4s,  v9.16b, v5.4b[0]
228        INS     v10.d[1], x14
229        UDOT    v22.4s,  v9.16b, v6.4b[0]
230        LDR     x14,  [x5], 8
231        UDOT    v23.4s,  v9.16b, v0.4b[0]
232
233        # BLOCK 2
234        UDOT    v24.4s, v10.16b, v4.4b[0]
235        LDR     d8,  [x5], 8
236        UDOT    v25.4s, v10.16b, v5.4b[0]
237        INS     v11.d[1], x14
238        UDOT    v26.4s, v10.16b, v6.4b[0]
239        LDR     x14,  [x5], 8
240        UDOT    v27.4s, v10.16b, v0.4b[0]
241
242        # BLOCK 3
243        UDOT    v28.4s, v11.16b, v4.4b[0]
244        LDR     d9,  [x5], 8
245        UDOT    v29.4s, v11.16b, v5.4b[0]
246        INS     v8.d[1], x14
247        UDOT    v30.4s, v11.16b, v6.4b[0]
248        LDR     x14,  [x5], 8
249        UDOT    v31.4s, v11.16b, v0.4b[0]
250
        // Group 4: A bytes 12-15 (lane 1); the first 8 bytes of A for the next
        // 16-byte step are loaded into v1, v2, v3 and finally v0.
251        # BLOCK 0
252        UDOT    v16.4s,  v8.16b, v4.4b[1]
253        LDR     d10,  [x5], 8
254        UDOT    v17.4s,  v8.16b, v5.4b[1]
255        INS     v9.d[1], x14
256        UDOT    v18.4s,  v8.16b, v6.4b[1]
257        LDR     x14,  [x5], 8
258        UDOT    v19.4s,  v8.16b, v0.4b[1]
259        LDR     d1, [x15], 8            // next A1
260
261        # BLOCK 1
262        UDOT    v20.4s,  v9.16b, v4.4b[1]
263        LDR     d11,  [x5], 8
264        UDOT    v21.4s,  v9.16b, v5.4b[1]
265        INS     v10.d[1], x14
266        UDOT    v22.4s,  v9.16b, v6.4b[1]
267        LDR     x14,  [x5], 8
268        UDOT    v23.4s,  v9.16b, v0.4b[1]
269        LDR     d2, [x13], 8            // next A2
270
271        # BLOCK 2
272        UDOT    v24.4s, v10.16b, v4.4b[1]
273        LDR     d8,  [x5], 8            // First B values for block 0 and 1
274        UDOT    v25.4s, v10.16b, v5.4b[1]
275        INS     v11.d[1], x14
276        UDOT    v26.4s, v10.16b, v6.4b[1]
277        LDR     x14,  [x5], 8
278        UDOT    v27.4s, v10.16b, v0.4b[1]
279        LDR     d3,  [x4], 8            // next A3
280
        // Row 3's UDOTs (the only remaining readers of v0) are issued first so
        // that v0 can be reloaded with the next A0 before the block finishes.
281        # BLOCK 3 special
282        UDOT    v31.4s, v11.16b, v0.4b[1]
283        LDR     d9,  [x5], 8
284        UDOT    v15.2s, v7.8b, v0.8b    // free up v0 early
285        INS     v8.d[1], x14
286        UDOT    v28.4s, v11.16b, v4.4b[1]
287        LDR     x14,  [x5], 8
288        UDOT    v29.4s, v11.16b, v5.4b[1]
289        LDR     d0,  [x3], 8            // next A0
290        UDOT    v30.4s, v11.16b, v6.4b[1]
291        SUBS    x0, x0, 16              // k -= 16
292
        // Zero-point term for rows 0-2, bytes 8-15 (row 3 was done above via v0)
293        UDOT    v12.2s, v7.8b, v4.8b
294        UDOT    v13.2s, v7.8b, v5.8b
295        UDOT    v14.2s, v7.8b, v6.8b
296        B.HS    1b                      // repeat while the SUBS above did not borrow
297
298        # Epilogue.  Same as main loop but no preloads in final group
2992:
300        # BLOCK 0
301        UDOT    v16.4s,  v8.16b, v0.4b[0]
302        LDR     d10,  [x5], 8
303        UDOT    v17.4s,  v8.16b, v1.4b[0]
304        INS     v9.d[1], x14
305        UDOT    v18.4s,  v8.16b, v2.4b[0]
306        LDR     x14,  [x5], 8
307        UDOT    v19.4s,  v8.16b, v3.4b[0]
308
309        # BLOCK 1
310        UDOT    v20.4s,  v9.16b, v0.4b[0]
311        LDR     d11,  [x5], 8
312        UDOT    v21.4s,  v9.16b, v1.4b[0]
313        INS     v10.d[1], x14
314        UDOT    v22.4s,  v9.16b, v2.4b[0]
315        LDR     x14,  [x5], 8
316        UDOT    v23.4s,  v9.16b, v3.4b[0]
317
318        # BLOCK 2
319        UDOT    v24.4s, v10.16b, v0.4b[0]
320        LDR     d8,  [x5], 8
321        UDOT    v25.4s, v10.16b, v1.4b[0]
322        INS     v11.d[1], x14
323        UDOT    v26.4s, v10.16b, v2.4b[0]
324        LDR     x14,  [x5], 8
325        UDOT    v27.4s, v10.16b, v3.4b[0]
326
327        # BLOCK 3
328        UDOT    v28.4s, v11.16b, v0.4b[0]
329        LDR     d9,  [x5], 8
330        UDOT    v29.4s, v11.16b, v1.4b[0]
331        INS     v8.d[1], x14
332        UDOT    v30.4s, v11.16b, v2.4b[0]
333        LDR     x14,  [x5], 8
334        UDOT    v31.4s, v11.16b, v3.4b[0]
335
        // Zero-point term for A bytes 0-7 of each row
336        UDOT    v12.2s, v7.8b, v0.8b
337        UDOT    v13.2s, v7.8b, v1.8b
338        UDOT    v14.2s, v7.8b, v2.8b
339        UDOT    v15.2s, v7.8b, v3.8b
340
341        # BLOCK 0
342        UDOT    v16.4s,  v8.16b, v0.4b[1]
343        LDR     d10,  [x5], 8
344        UDOT    v17.4s,  v8.16b, v1.4b[1]
345        INS     v9.d[1], x14
346        UDOT    v18.4s,  v8.16b, v2.4b[1]
347        LDR     x14,  [x5], 8
348        UDOT    v19.4s,  v8.16b, v3.4b[1]
349        LDR     d4,  [x3], 8            // A0 bytes 8-15
350
351        # BLOCK 1
352        UDOT    v20.4s,  v9.16b, v0.4b[1]
353        LDR     d11,  [x5], 8
354        UDOT    v21.4s,  v9.16b, v1.4b[1]
355        INS     v10.d[1], x14
356        UDOT    v22.4s,  v9.16b, v2.4b[1]
357        LDR     x14,  [x5], 8
358        UDOT    v23.4s,  v9.16b, v3.4b[1]
359        LDR     d5, [x15], 8            // A1 bytes 8-15
360
361        # BLOCK 2
362        UDOT    v24.4s, v10.16b, v0.4b[1]
363        LDR     d8,  [x5], 8
364        UDOT    v25.4s, v10.16b, v1.4b[1]
365        INS     v11.d[1], x14
366        UDOT    v26.4s, v10.16b, v2.4b[1]
367        LDR     x14,  [x5], 8
368        UDOT    v27.4s, v10.16b, v3.4b[1]
369        LDR     d6, [x13], 8            // A2 bytes 8-15
370
371        # BLOCK 3
372        UDOT    v28.4s, v11.16b, v0.4b[1]
373        LDR     d9,  [x5], 8
374        UDOT    v29.4s, v11.16b, v1.4b[1]
375        INS     v8.d[1], x14
376        UDOT    v30.4s, v11.16b, v2.4b[1]
377        LDR     x14,  [x5], 8
378        UDOT    v31.4s, v11.16b, v3.4b[1]
379        LDR     d0,  [x4], 8            // A3 bytes 8-15 (v0 holds row 3 from here on)
380
381        # BLOCK 0
382        UDOT    v16.4s,  v8.16b, v4.4b[0]
383        LDR     d10,  [x5], 8
384        UDOT    v17.4s,  v8.16b, v5.4b[0]
385        INS     v9.d[1], x14
386        UDOT    v18.4s,  v8.16b, v6.4b[0]
387        LDR     x14,  [x5], 8
388        UDOT    v19.4s,  v8.16b, v0.4b[0]
389
390        # BLOCK 1
391        UDOT    v20.4s,  v9.16b, v4.4b[0]
392        LDR     d11,  [x5], 8
393        UDOT    v21.4s,  v9.16b, v5.4b[0]
394        INS     v10.d[1], x14
395        UDOT    v22.4s,  v9.16b, v6.4b[0]
396        LDR     x14,  [x5], 8
397        UDOT    v23.4s,  v9.16b, v0.4b[0]
398
399        # BLOCK 2
400        UDOT    v24.4s, v10.16b, v4.4b[0]
401        LDR     d8,  [x5], 8
402        UDOT    v25.4s, v10.16b, v5.4b[0]
403        INS     v11.d[1], x14
404        UDOT    v26.4s, v10.16b, v6.4b[0]
405        LDR     x14,  [x5], 8
406        UDOT    v27.4s, v10.16b, v0.4b[0]
407
408        # BLOCK 3
409        UDOT    v28.4s, v11.16b, v4.4b[0]
410        LDR     d9,  [x5], 8
411        UDOT    v29.4s, v11.16b, v5.4b[0]
412        INS     v8.d[1], x14
413        UDOT    v30.4s, v11.16b, v6.4b[0]
414        LDR     x14,  [x5], 8
415        UDOT    v31.4s, v11.16b, v0.4b[0]
416
        // Final group: the B loads taper off (last ones are in BLOCK 1) and no
        // further A is loaded.
417        # BLOCK 0
418        UDOT    v16.4s,  v8.16b, v4.4b[1]
419        LDR     d10,  [x5], 8
420        UDOT    v17.4s,  v8.16b, v5.4b[1]
421        INS     v9.d[1], x14
422        UDOT    v18.4s,  v8.16b, v6.4b[1]
423        LDR     x14,  [x5], 8
424        UDOT    v19.4s,  v8.16b, v0.4b[1]
425
426        # BLOCK 1
427        UDOT    v20.4s,  v9.16b, v4.4b[1]
428        LDR     d11,  [x5], 8
429        UDOT    v21.4s,  v9.16b, v5.4b[1]
430        INS     v10.d[1], x14
431        UDOT    v22.4s,  v9.16b, v6.4b[1]
432        LDR     x14,  [x5], 8
433        UDOT    v23.4s,  v9.16b, v0.4b[1]
434
435        # BLOCK 2
436        UDOT    v24.4s, v10.16b, v4.4b[1]
437        UDOT    v25.4s, v10.16b, v5.4b[1]
438        INS     v11.d[1], x14
439        UDOT    v26.4s, v10.16b, v6.4b[1]
440        UDOT    v27.4s, v10.16b, v0.4b[1]
441
442        # BLOCK 3
443        UDOT    v28.4s, v11.16b, v4.4b[1]
444        UDOT    v29.4s, v11.16b, v5.4b[1]
445        UDOT    v30.4s, v11.16b, v6.4b[1]
446        UDOT    v31.4s, v11.16b, v0.4b[1]
447        AND     x0, x2, 15              // kc remainder 0 to 12
448
        // Zero-point term for A bytes 8-15 of each row (row 3 is in v0)
449        UDOT    v12.2s, v7.8b, v4.8b
450        UDOT    v13.2s, v7.8b, v5.8b
451        UDOT    v14.2s, v7.8b, v6.8b
452        UDOT    v15.2s, v7.8b, v0.8b
453
454        # Is there a remainder?- 4 to 12 bytes of A
455        CBNZ    x0, 4f
456
        // Fold the two lanes of each row's zero-point accumulator into a single
        // sum and broadcast it across a vector: v12..v15 = rows 0..3.
4573:
458        ADDP    v0.2s, v12.2s, v13.2s   // v0 = { row 0 sum, row 1 sum }
459        ADDP    v1.2s, v14.2s, v15.2s   // v1 = { row 2 sum, row 3 sum }
460        DUP     v12.4s, v0.s[0]
461        DUP     v13.4s, v0.s[1]
462        DUP     v14.4s, v1.s[0]
463        DUP     v15.4s, v1.s[1]
464
465        # Subtract zero point from accumulators
466        SUB     v16.4s, v16.4s, v12.4s
467        SUB     v17.4s, v17.4s, v13.4s
468        SUB     v18.4s, v18.4s, v14.4s
469        SUB     v19.4s, v19.4s, v15.4s
470        SUB     v20.4s, v20.4s, v12.4s
471        SUB     v21.4s, v21.4s, v13.4s
472        SUB     v22.4s, v22.4s, v14.4s
473        SUB     v23.4s, v23.4s, v15.4s
474        SUB     v24.4s, v24.4s, v12.4s
475        SUB     v25.4s, v25.4s, v13.4s
476        SUB     v26.4s, v26.4s, v14.4s
477        SUB     v27.4s, v27.4s, v15.4s
478        SUB     v28.4s, v28.4s, v12.4s
479        SUB     v29.4s, v29.4s, v13.4s
480        SUB     v30.4s, v30.4s, v14.4s
481        SUB     v31.4s, v31.4s, v15.4s
482
        // The params fields are read in struct order through x11, which was left
        // just past kernel_zero_point; the loads are interleaved with arithmetic.
483        # Apply params - preshift, scale, postshift, bias and clamp
484        LD1R    {v4.4s}, [x11], 4       // right_pre_shift
485        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
486        SSHL    v17.4s, v17.4s, v4.4s
487        SSHL    v18.4s, v18.4s, v4.4s
488        SSHL    v19.4s, v19.4s, v4.4s
489        SSHL    v20.4s, v20.4s, v4.4s
490        SSHL    v21.4s, v21.4s, v4.4s
491        SSHL    v22.4s, v22.4s, v4.4s
492        SSHL    v23.4s, v23.4s, v4.4s
493        LD1R    {v5.4s}, [x11], 4       // multiplier
494        SSHL    v24.4s, v24.4s, v4.4s
495        SSHL    v25.4s, v25.4s, v4.4s
496        SSHL    v26.4s, v26.4s, v4.4s
497        SSHL    v27.4s, v27.4s, v4.4s
498        SSHL    v28.4s, v28.4s, v4.4s
499        SSHL    v29.4s, v29.4s, v4.4s
500        SSHL    v30.4s, v30.4s, v4.4s
501        SSHL    v31.4s, v31.4s, v4.4s
502        LD1R    {v6.4s}, [x11], 4       // right_post_shift
503        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
504        SQDMULH v17.4s, v17.4s, v5.4s
505        SQDMULH v18.4s, v18.4s, v5.4s
506        SQDMULH v19.4s, v19.4s, v5.4s
507        SQDMULH v20.4s, v20.4s, v5.4s
508        SQDMULH v21.4s, v21.4s, v5.4s
509        SQDMULH v22.4s, v22.4s, v5.4s
510        SQDMULH v23.4s, v23.4s, v5.4s
511        SQDMULH v24.4s, v24.4s, v5.4s
512        SQDMULH v25.4s, v25.4s, v5.4s
513        SQDMULH v26.4s, v26.4s, v5.4s
514        SQDMULH v27.4s, v27.4s, v5.4s
515        SQDMULH v28.4s, v28.4s, v5.4s
516        SQDMULH v29.4s, v29.4s, v5.4s
517        SQDMULH v30.4s, v30.4s, v5.4s
518        SQDMULH v31.4s, v31.4s, v5.4s
519        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
520        SRSHL   v17.4s, v17.4s, v6.4s
521        SRSHL   v18.4s, v18.4s, v6.4s
522        SRSHL   v19.4s, v19.4s, v6.4s
523        SRSHL   v20.4s, v20.4s, v6.4s
524        SRSHL   v21.4s, v21.4s, v6.4s
525        SRSHL   v22.4s, v22.4s, v6.4s
526        SRSHL   v23.4s, v23.4s, v6.4s
527        SRSHL   v24.4s, v24.4s, v6.4s
528        SRSHL   v25.4s, v25.4s, v6.4s
529        SRSHL   v26.4s, v26.4s, v6.4s
530        SRSHL   v27.4s, v27.4s, v6.4s
531        SRSHL   v28.4s, v28.4s, v6.4s
532        SRSHL   v29.4s, v29.4s, v6.4s
533        SRSHL   v30.4s, v30.4s, v6.4s
534        SRSHL   v31.4s, v31.4s, v6.4s
535
        // Saturating narrow 32 -> 16 bits.  Columns 0-3 land in the low half of
        // v16-v19 and columns 8-11 in the low half of v24-v27.
536        SQXTN   v16.4h, v16.4s
537        SQXTN   v17.4h, v17.4s
538        SQXTN   v18.4h, v18.4s
539        SQXTN   v19.4h, v19.4s
540        SQXTN   v24.4h, v24.4s
541        SQXTN   v25.4h, v25.4s
542        SQXTN   v26.4h, v26.4s
543        SQXTN   v27.4h, v27.4s
544        LD1R    {v6.8h}, [x11], 2       // output_zero_point, added by the SQADDs below
545
        // Columns 4-7 and 12-15 fill the high halves.
546        SQXTN2  v16.8h, v20.4s
547        SQXTN2  v17.8h, v21.4s
548        SQXTN2  v18.8h, v22.4s
549        SQXTN2  v19.8h, v23.4s
550        SQXTN2  v24.8h, v28.4s
551        SQXTN2  v25.8h, v29.4s
552        SQXTN2  v26.8h, v30.4s
553        SQXTN2  v27.8h, v31.4s
554
555        SQADD   v16.8h, v16.8h, v6.8h
556        SQADD   v17.8h, v17.8h, v6.8h
557        SQADD   v18.8h, v18.8h, v6.8h
558        SQADD   v19.8h, v19.8h, v6.8h
559        SQADD   v24.8h, v24.8h, v6.8h
560        SQADD   v25.8h, v25.8h, v6.8h
561        SQADD   v26.8h, v26.8h, v6.8h
562        SQADD   v27.8h, v27.8h, v6.8h
563        LD1R    {v4.16b}, [x11], 1      // clamp min value
564
        // Saturating narrow signed 16 -> unsigned 8 bits; v0-v3 hold the 16
        // output bytes of rows 0-3.
565        SQXTUN  v0.8b, v16.8h
566        SQXTUN  v1.8b, v17.8h
567        SQXTUN  v2.8b, v18.8h
568        SQXTUN  v3.8b, v19.8h
569        LD1R    {v5.16b}, [x11]         // clamp max value
570        SQXTUN2 v0.16b, v24.8h
571        SQXTUN2 v1.16b, v25.8h
572        SQXTUN2 v2.16b, v26.8h
573        SQXTUN2 v3.16b, v27.8h
574
575        SUB     x11, x11, 15             // rewind params pointer by the 4+4+4+2+1 bytes advanced above
576
577        UMAX    v0.16b, v0.16b, v4.16b
578        UMAX    v1.16b, v1.16b, v4.16b
579        UMAX    v2.16b, v2.16b, v4.16b
580        UMAX    v3.16b, v3.16b, v4.16b
581        SUBS    x1, x1, 16              // nc -= 16; flags feed both B.LO 6f and the later B.NE 0b
582        UMIN    v0.16b, v0.16b, v5.16b
583        UMIN    v1.16b, v1.16b, v5.16b
584        UMIN    v2.16b, v2.16b, v5.16b
585        UMIN    v3.16b, v3.16b, v5.16b
586        B.LO    6f                      // fewer than 16 columns were left: partial store
587
588        # Store full 4 x 16
589        ST1     {v0.16b}, [x6], x12     // each C pointer advances by cn_stride
590        SUB     x3,  x3, x2             // a0 -= kc
591        ST1     {v1.16b}, [x8], x12
592        SUB     x15, x15, x2            // a1 -= kc
593        ST1     {v2.16b}, [x9], x12
594        SUB     x13, x13, x2            // a2 -= kc
595        ST1     {v3.16b}, [x7], x12
596        SUB     x4,  x4, x2             // a3 -= kc
597        B.NE    0b                      // columns remain (nc != 0): next tile
598
599        # Restore d8-d15 from stack
600        LDP     d14, d15, [sp, 48]
601        LDP     d12, d13, [sp, 32]
602        LDP     d10, d11, [sp, 16]
603        LDP     d8, d9, [sp], 64        // post-increment releases the 64-byte save area
604        RET
605
        // Reached with x0 = kc - 16 (from the check after label 0) or
        // x0 = kc & 15 (after the epilogue).  kc is a multiple of 4 and bits 2-3
        // are the same in both forms: bit 3 selects the 8-byte step, bit 2 the
        // 4-byte step.
606        # Remainder- 4 to 12 bytes of A
607        .p2align 3
6084:
609        TBZ     x0, 3, 5f               // no 8-byte step: go to the 4-byte step
610
611        LDR     d0,  [x3], 8
612        LDP     q8,  q9,  [x5], 32
613        LDR     d1, [x15], 8
614        LDR     d2, [x13], 8
615        LDR     d3,  [x4], 8
616        LDP     q10, q11, [x5], 32
617        UDOT    v12.2s, v7.8b, v0.8b
618        UDOT    v13.2s, v7.8b, v1.8b
619        UDOT    v14.2s, v7.8b, v2.8b
620        UDOT    v15.2s, v7.8b, v3.8b
621        UDOT    v16.4s,  v8.16b, v0.4b[0]
622        UDOT    v17.4s,  v8.16b, v1.4b[0]
623        UDOT    v18.4s,  v8.16b, v2.4b[0]
624        UDOT    v19.4s,  v8.16b, v3.4b[0]
625        UDOT    v20.4s,  v9.16b, v0.4b[0]
626        UDOT    v21.4s,  v9.16b, v1.4b[0]
627        UDOT    v22.4s,  v9.16b, v2.4b[0]
628        UDOT    v23.4s,  v9.16b, v3.4b[0]
629        UDOT    v24.4s, v10.16b, v0.4b[0]
630        UDOT    v25.4s, v10.16b, v1.4b[0]
631        UDOT    v26.4s, v10.16b, v2.4b[0]
632        UDOT    v27.4s, v10.16b, v3.4b[0]
633        UDOT    v28.4s, v11.16b, v0.4b[0]
634        UDOT    v29.4s, v11.16b, v1.4b[0]
635        UDOT    v30.4s, v11.16b, v2.4b[0]
636        UDOT    v31.4s, v11.16b, v3.4b[0]
637        LDP     q8,  q9,  [x5], 32      // B for A bytes 4-7
638        LDP     q10, q11, [x5], 32
639        UDOT    v16.4s,  v8.16b, v0.4b[1]
640        UDOT    v17.4s,  v8.16b, v1.4b[1]
641        UDOT    v18.4s,  v8.16b, v2.4b[1]
642        UDOT    v19.4s,  v8.16b, v3.4b[1]
643        UDOT    v20.4s,  v9.16b, v0.4b[1]
644        UDOT    v21.4s,  v9.16b, v1.4b[1]
645        UDOT    v22.4s,  v9.16b, v2.4b[1]
646        UDOT    v23.4s,  v9.16b, v3.4b[1]
647        UDOT    v24.4s, v10.16b, v0.4b[1]
648        UDOT    v25.4s, v10.16b, v1.4b[1]
649        UDOT    v26.4s, v10.16b, v2.4b[1]
650        UDOT    v27.4s, v10.16b, v3.4b[1]
651        UDOT    v28.4s, v11.16b, v0.4b[1]
652        UDOT    v29.4s, v11.16b, v1.4b[1]
653        UDOT    v30.4s, v11.16b, v2.4b[1]
654        UDOT    v31.4s, v11.16b, v3.4b[1]
655        TBZ     x0, 2, 3b               // no 4-byte step: go requantize
        // 4-byte step.  "LDR sN" zeroes the rest of the register, so the second
        // lane of the zero-point UDOTs below adds 0.
6565:
657        LDR     s0,  [x3], 4
658        LDP     q8,  q9,  [x5], 32
659        LDR     s1, [x15], 4
660        LDR     s2, [x13], 4
661        LDR     s3,  [x4], 4
662        LDP     q10, q11, [x5], 32
663        UDOT    v12.2s, v7.8b, v0.8b
664        UDOT    v13.2s, v7.8b, v1.8b
665        UDOT    v14.2s, v7.8b, v2.8b
666        UDOT    v15.2s, v7.8b, v3.8b
667        UDOT    v16.4s,  v8.16b, v0.4b[0]
668        UDOT    v17.4s,  v8.16b, v1.4b[0]
669        UDOT    v18.4s,  v8.16b, v2.4b[0]
670        UDOT    v19.4s,  v8.16b, v3.4b[0]
671        UDOT    v20.4s,  v9.16b, v0.4b[0]
672        UDOT    v21.4s,  v9.16b, v1.4b[0]
673        UDOT    v22.4s,  v9.16b, v2.4b[0]
674        UDOT    v23.4s,  v9.16b, v3.4b[0]
675        UDOT    v24.4s, v10.16b, v0.4b[0]
676        UDOT    v25.4s, v10.16b, v1.4b[0]
677        UDOT    v26.4s, v10.16b, v2.4b[0]
678        UDOT    v27.4s, v10.16b, v3.4b[0]
679        UDOT    v28.4s, v11.16b, v0.4b[0]
680        UDOT    v29.4s, v11.16b, v1.4b[0]
681        UDOT    v30.4s, v11.16b, v2.4b[0]
682        UDOT    v31.4s, v11.16b, v3.4b[0]
683        B       3b
684
        // x1 = nc - 16 here; its low 4 bits equal those of the remaining column
        // count.  Bits 3..0 select 8-, 4-, 2- and 1-byte stores; after each
        // store the next unstored bytes are moved down to the bottom of v0-v3.
685        # Store odd width
686        .p2align 3
6876:
688        TBZ     x1, 3, 7f
689        STR     d0, [x6], 8
690        STR     d1, [x8], 8
691        DUP     d0, v0.d[1]
692        DUP     d1, v1.d[1]
693        STR     d2, [x9], 8
694        STR     d3, [x7], 8
695        DUP     d2, v2.d[1]
696        DUP     d3, v3.d[1]
6977:
698        TBZ     x1, 2, 8f
699        STR     s0, [x6], 4
700        STR     s1, [x8], 4
701        DUP     s0, v0.s[1]
702        DUP     s1, v1.s[1]
703        STR     s2, [x9], 4
704        STR     s3, [x7], 4
705        DUP     s2, v2.s[1]
706        DUP     s3, v3.s[1]
7078:
708        TBZ     x1, 1, 9f
709        STR     h0, [x6], 2
710        STR     h1, [x8], 2
711        DUP     h0, v0.h[1]
712        DUP     h1, v1.h[1]
713        STR     h2, [x9], 2
714        STR     h3, [x7], 2
715        DUP     h2, v2.h[1]
716        DUP     h3, v3.h[1]
7179:
718        TBZ     x1, 0, 10f
719        STR     b0, [x6]
720        STR     b1, [x8]
721        STR     b2, [x9]
722        STR     b3, [x7]
72310:
724        # Restore d8-d15 from stack
725        LDP     d14, d15, [sp, 48]
726        LDP     d12, d13, [sp, 32]
727        LDP     d10, d11, [sp, 16]
728        LDP     d8, d9, [sp], 64        // post-increment releases the 64-byte save area
729        RET
730
731END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55
732
733#ifdef __ELF__
734.section ".note.GNU-stack","",%progbits
735#endif
736