// xref: /aosp_15_r20/external/XNNPACK/src/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qu8-gemm/4x8c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qu8_conv_minmax_params)  [sp + 8] -> x11
#
# GEMM microkernel for unsigned 8-bit data.  Each pass of the outer loop
# produces one tile of up to 4 rows by 8 columns of C.  K is consumed 4 bytes
# at a time per UDOT lane ("c4"); kc is rounded up to a multiple of 4 on entry.
# Rows beyond mr alias the previous row (pointers are clamped), so the kernel
# always computes 4 rows and simply rewrites the same output row.
#
# x0 holds mr only until the pointer clamping is done; afterwards it is reused
# as the K loop counter.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Only d8 is used from that set; it is spilled to the stack on entry and
# reloaded before each RET.

# Register usage
# A0  x3  v0  v4
# A1 x15  v1  v5
# A2 x13  v2  v6
# A3  x4  v3  v7
# B   x5  v28  v29 v30 v31
# C0  x6 v16 v20
# C1  x8 v17 v21
# C2  x9 v18 v22
# C3  x7 v19 v23
# zero_point v24 v25 v26 v27 v8
# params pointer x11, cn_stride x12
# unused v9 v10 v11 v12 v13 v14 v15

# x14 temp for Cortex-A55 loads
# (the upper 64 bits of each B vector are loaded into x14 and then moved into
# the vector with INS, so a 128-bit load is issued as two 64-bit loads)

BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0
        BIC     x2, x2, 3

        # Stack arguments are read before sp is moved by the d8 spill below.
        LDP     x12, x11, [sp]          // cn_stride, params

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
        STR     d8, [sp, -16]!          // Save d8 on stack
                                        // if mr <= 2
        # The two CSELs below still use the flags of "CMP x0, 2" above; no
        # instruction in between modifies the condition flags.
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1

        # Loaded once, outside the column loop.  x11 is left pointing at the
        # next params field, which is where the per-tile rewind returns it to.
        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2

        # Outer loop: one iteration per tile of 8 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 16              // k = kc - 16
        # v24-v27 accumulate, per row, the dot product of the A bytes with v8
        # (kernel zero point); it is subtracted from the accumulators at 3:.
        MOVI    v24.16b, 0
        MOVI    v25.16b, 0
        MOVI    v26.16b, 0
        MOVI    v27.16b, 0

        # Is there at least 16 bytes for prologue/epilogue?
        B.LO    4f

        # prologue - read A and B values for block 0 and 1
        LDR     d0,  [x3], 8
        LDR     q28, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3,  [x4], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d29, [x5], 8
        LDR     x14, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    2f

        # Main loop - 16 bytes of A in 4 groups of 2 blocks
        # 4 row of 2 vectors wide = 8 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.
        #
        # The loads are software-pipelined: each group of 4 UDOTs consumes a B
        # vector completed in an earlier group while the loads interleaved with
        # it fetch B (and A) for later groups.  Instruction order matters here.

        .p2align 3
1:
        # BLOCK 0
        UDOT    v16.4s,  v28.16b, v0.4b[0]
        LDR     d30,  [x5], 8
        UDOT    v17.4s,  v28.16b, v1.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s,  v28.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v28.16b, v3.4b[0]
        LDR     d4,  [x3], 8

        # BLOCK 1
        UDOT    v20.4s,  v29.16b, v0.4b[0]
        LDR     d31,  [x5], 8
        UDOT    v21.4s,  v29.16b, v1.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s,  v29.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v29.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v0.4b[1]
        LDR     d28,  [x5], 8
        UDOT    v17.4s, v30.16b, v1.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v19.4s, v30.16b, v3.4b[1]
        LDR     d6, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v0.4b[1]
        LDR     d29,  [x5], 8
        UDOT    v21.4s, v31.16b, v1.4b[1]
        INS     v28.d[1], x14
        UDOT    v22.4s, v31.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v23.4s, v31.16b, v3.4b[1]
        LDR     d7,  [x4], 8

        # Zero point contribution of the first 8 bytes of A in each row.
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s,  v28.16b, v4.4b[0]
        LDR     d30,  [x5], 8
        UDOT    v17.4s,  v28.16b, v5.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s,  v28.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v28.16b, v7.4b[0]
        LDR     d0,  [x3], 8

        # BLOCK 1
        UDOT    v20.4s,  v29.16b, v4.4b[0]
        LDR     d31,  [x5], 8
        UDOT    v21.4s,  v29.16b, v5.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s,  v29.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v29.16b, v7.4b[0]
        LDR     d1, [x15], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v4.4b[1]
        LDR     d28,  [x5], 8
        UDOT    v17.4s, v30.16b, v5.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v19.4s, v30.16b, v7.4b[1]
        LDR     d2, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v4.4b[1]
        LDR     d29,  [x5], 8
        UDOT    v21.4s, v31.16b, v5.4b[1]
        INS     v28.d[1], x14
        UDOT    v22.4s, v31.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v23.4s, v31.16b, v7.4b[1]
        LDR     d3,  [x4], 8

        # Zero point contribution of the second 8 bytes of A in each row.
        # The loop counter update is interleaved here; UDOT leaves the flags
        # untouched, so B.HS below tests the result of this SUBS.
        UDOT    v24.2s, v8.8b, v4.8b
        UDOT    v25.2s, v8.8b, v5.8b
        SUBS    x0, x0, 16
        UDOT    v26.2s, v8.8b, v6.8b
        UDOT    v27.2s, v8.8b, v7.8b

        B.HS    1b

        # Epilogue.  Same as main loop but no preloads in final group
2:
        # BLOCK 0
        UDOT    v16.4s,  v28.16b, v0.4b[0]
        LDR     d30,  [x5], 8
        UDOT    v17.4s,  v28.16b, v1.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s,  v28.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v28.16b, v3.4b[0]
        LDR     d4,  [x3], 8

        # BLOCK 1
        UDOT    v20.4s,  v29.16b, v0.4b[0]
        LDR     d31,  [x5], 8
        UDOT    v21.4s,  v29.16b, v1.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s,  v29.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v29.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v0.4b[1]
        LDR     d28,  [x5], 8
        UDOT    v17.4s, v30.16b, v1.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v19.4s, v30.16b, v3.4b[1]
        LDR     d6, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v0.4b[1]
        LDR     d29,  [x5], 8
        UDOT    v21.4s, v31.16b, v1.4b[1]
        INS     v28.d[1], x14
        UDOT    v22.4s, v31.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v23.4s, v31.16b, v3.4b[1]
        LDR     d7,  [x4], 8

        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s,  v28.16b, v4.4b[0]
        LDR     d30,  [x5], 8
        UDOT    v17.4s,  v28.16b, v5.4b[0]
        INS     v29.d[1], x14
        UDOT    v18.4s,  v28.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v28.16b, v7.4b[0]

        # BLOCK 1
        UDOT    v20.4s,  v29.16b, v4.4b[0]
        LDR     d31,  [x5], 8
        UDOT    v21.4s,  v29.16b, v5.4b[0]
        INS     v30.d[1], x14
        UDOT    v22.4s,  v29.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v29.16b, v7.4b[0]

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v4.4b[1]
        UDOT    v17.4s, v30.16b, v5.4b[1]
        INS     v31.d[1], x14
        UDOT    v18.4s, v30.16b, v6.4b[1]
        UDOT    v19.4s, v30.16b, v7.4b[1]

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v4.4b[1]
        UDOT    v21.4s, v31.16b, v5.4b[1]
        UDOT    v22.4s, v31.16b, v6.4b[1]
        UDOT    v23.4s, v31.16b, v7.4b[1]

        AND     x0, x2, 15              // kc remainder 0 to 12

        UDOT    v24.2s, v8.8b, v4.8b
        UDOT    v25.2s, v8.8b, v5.8b
        UDOT    v26.2s, v8.8b, v6.8b
        UDOT    v27.2s, v8.8b, v7.8b

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 4f

        .p2align 3
3:
        # Reduce the two 32-bit partial zero point sums of each row to one
        # value and broadcast it: v24..v27 = per-row sum for rows 0..3.
        ADDP    v0.2s, v24.2s, v25.2s
        ADDP    v1.2s, v26.2s, v27.2s
        DUP     v24.4s, v0.s[0]
        DUP     v25.4s, v0.s[1]
        DUP     v26.4s, v1.s[0]
        DUP     v27.4s, v1.s[1]

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v24.4s
        SUB     v17.4s, v17.4s, v25.4s
        SUB     v18.4s, v18.4s, v26.4s
        SUB     v19.4s, v19.4s, v27.4s
        SUB     v20.4s, v20.4s, v24.4s
        SUB     v21.4s, v21.4s, v25.4s
        SUB     v22.4s, v22.4s, v26.4s
        SUB     v23.4s, v23.4s, v27.4s

        # Apply params - preshift, scale, postshift, bias and clamp
        # The params fields are read sequentially through x11 (4+4+4+2+1 bytes
        # of post-increment, then one load without increment).
        LD1R    {v4.4s}, [x11], 4
        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
        SSHL    v17.4s, v17.4s, v4.4s
        SSHL    v18.4s, v18.4s, v4.4s
        SSHL    v19.4s, v19.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SSHL    v20.4s, v20.4s, v4.4s
        SSHL    v21.4s, v21.4s, v4.4s
        SSHL    v22.4s, v22.4s, v4.4s
        SSHL    v23.4s, v23.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s

        # Narrow 32 -> 16 bits with signed saturation; columns 0-3 go to the
        # low half and columns 4-7 to the high half of each row register.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        # 16-bit value added to the narrowed results below (presumably the
        # output zero point - confirm against xnn_qu8_conv_minmax_params).
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Narrow 16 -> 8 bits with unsigned saturation and pack two rows per
        # register: v0 = row 0 | row 1, v1 = row 2 | row 3.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v18.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v17.8h
        SQXTUN2 v1.16b, v19.8h
        # 15 = 4 + 4 + 4 + 2 + 1: undoes the post-increments of this section,
        # leaving x11 just past kernel_zero_point for the next tile.
        SUB     x11, x11, 15             // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        SUBS    x1, x1, 8
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        B.LO    6f

        # Store full 4 x 8
        # A pointers are rewound by kc so the next column tile re-reads the
        # same rows.  B.NE still tests the flags of "SUBS x1, x1, 8" above.
        ST1     {v0.8b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v0.d}[1], [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v1.8b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v1.d}[1], [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        B.NE    0b

        # Restore d8 from stack
        LDR     d8, [sp], 16
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, it's safe to read 16 bytes.
        # (This path itself loads 8 and/or 4 bytes of A per row.)
        #
        # x0 is kc - 16 when entered directly from 0: (kc < 16) or kc & 15 when
        # entered from the epilogue.  kc is a multiple of 4, and in both cases
        # bits 3 and 2 of x0 equal bits 3 and 2 of the remaining byte count:
        # bit 3 selects the 8-byte step, bit 2 the 4-byte step.
        .p2align 3
4:
        TBZ     x0, 3, 5f

        LDR     d0,  [x3], 8
        LDR     q4, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3,  [x4], 8
        LDR     q5, [x5], 16
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        LDR     q6, [x5], 16
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        LDR     q4, [x5], 16
        UDOT    v16.4s, v6.16b, v0.4b[1]
        UDOT    v17.4s, v6.16b, v1.4b[1]
        UDOT    v18.4s, v6.16b, v2.4b[1]
        UDOT    v19.4s, v6.16b, v3.4b[1]
        UDOT    v20.4s, v4.16b, v0.4b[1]
        UDOT    v21.4s, v4.16b, v1.4b[1]
        UDOT    v22.4s, v4.16b, v2.4b[1]
        UDOT    v23.4s, v4.16b, v3.4b[1]
        TBZ     x0, 2, 3b
5:
        # Final 4 bytes of A.  LDR s<n> clears the rest of the vector, so the
        # second lane of the 2s zero point UDOTs below adds zero.
        LDR     s0,  [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3,  [x4], 4
        LDR     q5, [x5], 16
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        B       3b

        # Store odd width
        # Reached with fewer than 8 columns left.  Bits 2, 1 and 0 of x1
        # select 4-, 2- and 1-byte stores.  After each partial store, EXT
        # rotates both registers so the next unwritten columns sit at byte 0
        # (rows 0 and 2) and byte 8 (rows 1 and 3).
        .p2align 3
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x8], 4
        STR     s1, [x9], 4
        ST1     {v1.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4
        EXT     v1.16b, v1.16b, v1.16b, 4
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x8], 2
        STR     h1, [x9], 2
        ST1     {v1.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
        EXT     v1.16b, v1.16b, v1.16b, 2
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x8]
        STR     b1, [x9]
        ST1     {v1.b}[8], [x7]
9:
        # Restore d8 from stack
        LDR     d8, [sp], 16
        RET

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55

// On ELF targets, emit an empty .note.GNU-stack section so that this object
// file does not request an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
