xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x0)
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31
32# C pointers
33#  x6 c0
34# x16 c1
35# x17 c2
36# x14 c3
37
38# x4 temporary vector shadow register
39
40# Vector register usage
41# A0  v0     v3
42# A1  v0[1]  v3[1]
43# A2  v1     v4
44# A3  v1[1]  v4[1]
45
46# B   v12 v13 v14 v15 second set of B
47# B   v16 v17 v18 v19 first set
48# C   v20 v21
49# C   v22 v23
50# C   v24 v25
51# C   v26 v27
52# Clamp v6 v7
53
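54# Unused here: v8 v9 v10 v11, plus the row 4/5 entries below (a4, c4, c5, A4, A5, C v28-v31), which do not apply to this 4-row kernel (x7 holds cm_stride)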
55# x12 a4
56# x13 c4
57#  x7 c5
58# A4  v2     v5
59# A5  v2[1]  v5[1]
60# C   v28 v29
61# C   v30 v31
62
63BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53
        // Each pass through label 0 produces one tile of up to 4 rows x 8 columns:
        // the accumulators v20-v27 start from the 8 floats at w, gain
        // a[row][k] * b[k][0..7] for every k, are clamped against v6/v7 and stored
        // through c0-c3. The pass repeats while columns remain in nc (x1).
        //
        // Register reuse to keep in mind while reading:
        //   x0  mr on entry -> remaining-k byte counter (from label 0) -> cn_stride
        //       (from label 3).
        //   x4  a_stride during pointer setup only; afterwards a general-purpose
        //       staging register whose value is inserted into the upper 64 bits of
        //       a vector register (LDR x4 ... INS vN.d[1], x4).
        //   kc (x2) is treated as a byte count: it is compared against 16 for
        //       "4 floats" and subtracted from the A pointers to rewind them.
64
65        # Load params pointer
66        LDR     x8, [sp, 8]             // second stack argument; read before sp is moved below
67
68        # Clamp A and C pointers
69        CMP     x0, 2                   // if mr < 2
70        ADD     x9, x3, x4              // a1 = a0 + a_stride
71        ADD     x16, x6, x7             // c1 = c0 + cm_stride
72        CSEL    x9, x3, x9, LO          //   a1 = a0
73        CSEL    x16, x6, x16, LO        //   c1 = c0
74
        // The LS selects below still use the flags of CMP x0, 2 above:
        // ADD and CSEL do not modify the condition flags.
75        ADD     x10, x9, x4             // a2 = a1 + a_stride
76        ADD     x17, x16, x7            // c2 = c1 + cm_stride
77                                        // if mr <= 2
78        CSEL    x10, x9, x10, LS        //   a2 = a1
79        CSEL    x17, x16, x17, LS       //   c2 = c1
80
81        CMP     x0, 4                   // if mr < 4
82        ADD     x11, x10, x4            // a3 = a2 + a_stride
83        ADD     x14, x17, x7            // c3 = c2 + cm_stride
84        CSEL    x11, x10, x11, LO       //   a3 = a2
85        CSEL    x14, x17, x14, LO       //   c3 = c2
86
87        # Load min/max values
88        LD2R    {v6.4s, v7.4s}, [x8]    // v6 = 1st float at params, v7 = 2nd, each replicated to all lanes
89
90        # Save d12-d15 on stack
91        STP     d12, d13, [sp, -32]!    // sp -= 32, so the caller's [sp] (cn_stride) is now at [sp, 32]
92        STP     d14, d15, [sp, 16]
93
940:
        // Start of one 8-column tile; re-entered from the full-width store path
        // (B.HI 0b) while columns remain.
95        # Load initial bias from w into accumulators
96        LDP     q20, q21, [x5], 32      // row 0 accumulators = 8 floats at w; w += 32
97        MOV     v22.16b, v20.16b        // rows 1-3 start from the same 8 values
98        MOV     v23.16b, v21.16b
99        MOV     v24.16b, v20.16b
100        MOV     v25.16b, v21.16b
101        MOV     v26.16b, v20.16b
102        MOV     v27.16b, v21.16b
103
104        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
105        SUBS    x0, x2, 16              // k = kc - 16
106        B.LO    4f                      // kc < 16 (unsigned): only the remainder code at 4/5 runs
107
        // Lane layout used from here through the epilogue:
        //   v0 / v3 : lanes 0-1 = two consecutive floats of a0, lanes 2-3 = of a1
        //   v1 / v4 : lanes 0-1 = two consecutive floats of a2, lanes 2-3 = of a3
        //   v16,v17 and v18,v19 : 8 floats of b for each of two consecutive k
        //   v12,v13 and v14,v15 : the same, for the following two k
        // Even accumulators (v20,v22,v24,v26) take columns 0-3, odd ones columns 4-7.
108        # Prologue - First group loads, no FMA
109        LDR     d0, [x3], 8              // a0
110        LDP     q16, q17, [x5], 32         // b
111        LDR     d1, [x10], 8             // a2
112        LD1     {v0.d}[1],  [x9], 8       // a1
113        LD1     {v1.d}[1], [x11], 8       // a3
114        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.LO 2f below (loads leave them intact)
115        LDR     q18, [x5], 16
116        LDR     d19, [x5], 8            // low 64 bits of v19 only
117        LDR     x4, [x5], 8             // ins is in BLOCK 0
118
119        # Is there at least 4 floats (16 bytes) for main loop?
120        B.LO    2f                      // no: skip the loop and go straight to the epilogue
121
122        # Main loop - 4 floats of A (16 bytes)
123        # 32 FMA + 8 LD64 A + 8 LDR B
1241:
        // Each iteration consumes 16 bytes from every A row and 128 bytes of b.
        // The two halves alternate register sets: FMAs run on the set loaded by the
        // previous half while loads refill the other set. b is read through fixed
        // offsets from x5, which is advanced once at the end of the iteration.
        // NOTE(review): 128-bit operands are assembled from a 64-bit LDR d plus
        // LDR x4 / INS rather than a single LDR q; presumably this is scheduling
        // for the Cortex-A53 named in the function - confirm against the template.
125        # First group of 16 FMA, Second group loads
126        # BLOCK 0
127        LDR     d3, [x3], 8              // a0
128        INS     v19.d[1], x4               // b from second group
129        FMLA    v20.4s, v16.4s,  v0.s[0]
130        LDR     x4, [x9], 8               // a1
131        FMLA    v22.4s, v16.4s,  v0.s[2]
132        FMLA    v24.4s, v16.4s,  v1.s[0]
133
134        # BLOCK 1
135        LDR     d12, [x5]
136        INS     v3.d[1], x4                // a1 ins
137        FMLA    v26.4s, v16.4s,  v1.s[2]
138        LDR     x4, [x5, 8]             // b
139        FMLA    v21.4s, v17.4s,  v0.s[0]
140        FMLA    v23.4s, v17.4s,  v0.s[2]
141
142        # BLOCK 2
143        LDR     d4, [x10], 8             // a2
144        INS     v12.d[1], x4            // b  ins
145        FMLA    v25.4s, v17.4s,  v1.s[0]
146        LDR     x4, [x11], 8              // a3
147        FMLA    v27.4s, v17.4s,  v1.s[2]
148        FMLA    v20.4s, v18.4s,  v0.s[1]
149
150        # BLOCK 3
151        LDR     d13, [x5, 16]
152        INS     v4.d[1], x4                // a3 ins
153        FMLA    v22.4s, v18.4s,  v0.s[3]
154        LDR     x4, [x5, 24]
155        FMLA    v24.4s, v18.4s,  v1.s[1]
156        FMLA    v26.4s, v18.4s,  v1.s[3]
157
158        # BLOCK 4
159        LDR     d14, [x5, 32]
160        INS     v13.d[1], x4            // b
161        FMLA    v21.4s, v19.4s,  v0.s[1]
162        LDR     x4, [x5, 40]
163        FMLA    v23.4s, v19.4s,  v0.s[3]
164        FMLA    v25.4s, v19.4s,  v1.s[1]
165
166        # BLOCK 5
167        # NOPs to ensure 4 cycle LDR lands on next LDR
168        LDR     d15, [x5, 48]
169        INS     v14.d[1], x4            // b from previous
170        FMLA    v27.4s, v19.4s,  v1.s[3]
171        LDR     x4, [x5, 56]
172        NOP
173        NOP
174        NOP
175        NOP
176
177        # Second group of 16 FMA, First group of loads
178        # BLOCK 0
179        LDR     d0, [x3], 8              // a0
180        INS     v15.d[1], x4            // b from previous
181        FMLA    v20.4s, v12.4s,  v3.s[0]
182        LDR     x4, [x9], 8               // a1
183        FMLA    v22.4s, v12.4s,  v3.s[2]
184        FMLA    v24.4s, v12.4s,  v4.s[0]
185
186        # BLOCK 1
187        LDR     d16, [x5, 64]
188        INS     v0.d[1], x4                // a1 ins
189        FMLA    v26.4s, v12.4s,  v4.s[2]
190        LDR     x4, [x5, 72]            // b
191        FMLA    v21.4s, v13.4s,  v3.s[0]
192        FMLA    v23.4s, v13.4s,  v3.s[2]
193
194        # BLOCK 2
195        LDR     d1, [x10], 8             // a2
196        INS     v16.d[1], x4            // b
197        FMLA    v25.4s, v13.4s,  v4.s[0]
198        LDR     x4, [x11], 8              // a3
199        FMLA    v27.4s, v13.4s,  v4.s[2]
200        FMLA    v20.4s, v14.4s,  v3.s[1]
201
202        # BLOCK 3
203        LDR     d17, [x5, 80]
204        INS     v1.d[1], x4                // a3 ins
205        FMLA    v22.4s, v14.4s,  v3.s[3]
206        LDR     x4, [x5, 88]
207        FMLA    v24.4s, v14.4s,  v4.s[1]
208        FMLA    v26.4s, v14.4s,  v4.s[3]
209
210        # BLOCK 4
211        LDR     d18, [x5, 96]
212        INS     v17.d[1], x4            // b
213        FMLA    v21.4s, v15.4s,  v3.s[1]
214        LDR     x4, [x5, 104]
215        FMLA    v23.4s, v15.4s,  v3.s[3]
216        FMLA    v25.4s, v15.4s,  v4.s[1]
217
218        # BLOCK 5
219        # NOTE that block needs to be 4 cycles for LDR not to stall
220        LDR     d19, [x5, 112]
221        INS     v18.d[1], x4
222        FMLA    v27.4s, v15.4s,  v4.s[3]
223        LDR     x4, [x5, 120]           // upper half of v19; inserted by BLOCK 0 of the next iteration or of the epilogue
224        SUBS    x0, x0, 16              // k -= 16
225        ADD     x5, x5, 128             // step over the 128 bytes of b read above; ADD keeps the SUBS flags
226        B.HS    1b                      // repeat while the subtraction did not borrow
227
228        # Epilogue - 4 floats of A (16 bytes)
229        # 32 FMA + 8 LD64 A + 8 LDR B
2302:
        // Same first half as the main loop (FMAs on the already loaded set, loads
        // for the second set), followed by a second half that only issues FMAs,
        // so nothing is read beyond the final group of 4 floats.
231        # First group of 16 FMA, Second group loads
232        # BLOCK 0
233        LDR     d3, [x3], 8              // a0
234        INS     v19.d[1], x4               // b from second group
235        FMLA    v20.4s, v16.4s,  v0.s[0]
236        LDR     x4, [x9], 8               // a1
237        FMLA    v22.4s, v16.4s,  v0.s[2]
238        FMLA    v24.4s, v16.4s,  v1.s[0]
239
240        # BLOCK 1
241        LDR     d12, [x5]
242        INS     v3.d[1], x4                // a1 ins
243        FMLA    v26.4s, v16.4s,  v1.s[2]
244        LDR     x4, [x5, 8]             // b
245        FMLA    v21.4s, v17.4s,  v0.s[0]
246        FMLA    v23.4s, v17.4s,  v0.s[2]
247
248        # BLOCK 2
249        LDR     d4, [x10], 8             // a2
250        INS     v12.d[1], x4            // b  ins
251        FMLA    v25.4s, v17.4s,  v1.s[0]
252        LDR     x4, [x11], 8              // a3
253        FMLA    v27.4s, v17.4s,  v1.s[2]
254        FMLA    v20.4s, v18.4s,  v0.s[1]
255
256        # BLOCK 3
257        LDR     d13, [x5, 16]
258        INS     v4.d[1], x4                // a3 ins
259        FMLA    v22.4s, v18.4s,  v0.s[3]
260        LDR     x4, [x5, 24]
261        FMLA    v24.4s, v18.4s,  v1.s[1]
262        FMLA    v26.4s, v18.4s,  v1.s[3]
263
264        # BLOCK 4
265        LDR     d14, [x5, 32]
266        INS     v13.d[1], x4            // b
267        FMLA    v21.4s, v19.4s,  v0.s[1]
268        LDR     x4, [x5, 40]
269        FMLA    v23.4s, v19.4s,  v0.s[3]
270        FMLA    v25.4s, v19.4s,  v1.s[1]
271
272        # BLOCK 5
273        # NOPs to ensure 4 cycle LDR lands on next LDR
274        LDR     d15, [x5, 48]
275        INS     v14.d[1], x4
276        FMLA    v27.4s, v19.4s,  v1.s[3]
277        LDR     x4, [x5, 56]
278        NOP     // fma
279        NOP
280        NOP     // fma
281        NOP
282
283        # Second group of 16 FMA, no loads
284        # BLOCK 0
285        INS     v15.d[1], x4            // b from previous
286        FMLA    v20.4s, v12.4s,  v3.s[0]
287        FMLA    v22.4s, v12.4s,  v3.s[2]
288        FMLA    v24.4s, v12.4s,  v4.s[0]
289
290        # BLOCK 1
291        FMLA    v26.4s, v12.4s,  v4.s[2]
292        FMLA    v21.4s, v13.4s,  v3.s[0]
293        FMLA    v23.4s, v13.4s,  v3.s[2]
294
295        # BLOCK 2
296        FMLA    v25.4s, v13.4s,  v4.s[0]
297        FMLA    v27.4s, v13.4s,  v4.s[2]
298        FMLA    v20.4s, v14.4s,  v3.s[1]
299
300        # BLOCK 3
301        FMLA    v22.4s, v14.4s,  v3.s[3]
302        FMLA    v24.4s, v14.4s,  v4.s[1]
303        FMLA    v26.4s, v14.4s,  v4.s[3]
304        TST     x0, 15                  // only multiples of 16 were subtracted from kc, so these are kc's low 4 bits
305
306        # BLOCK 4
307        FMLA    v21.4s, v15.4s,  v3.s[1]
308        FMLA    v23.4s, v15.4s,  v3.s[3]
309        FMLA    v25.4s, v15.4s,  v4.s[1]
310        ADD     x5, x5, 64              // step over the 64 bytes of b read via offsets above; keeps the TST flags
311
312        # BLOCK 5
313        FMLA    v27.4s, v15.4s,  v4.s[3]
314
315        # Is there a remainder?- 2 floats of A (8 bytes) or less
316        B.NE    4f                      // uses the flags of TST x0, 15 above
317
3183:
        // Clamp to [v6, v7] and store. x0 stops being the k counter here and
        // becomes cn_stride.
319        # Clamp
320        FMAX    v20.4s, v20.4s, v6.4s   // FMAX against v6: v6 acts as the lower bound
321        # Load cn_stride
322        LDR     x0, [sp, 32]
323        FMAX    v21.4s, v21.4s, v6.4s
324        FMAX    v22.4s, v22.4s, v6.4s
325        FMAX    v23.4s, v23.4s, v6.4s
326        FMAX    v24.4s, v24.4s, v6.4s
327        FMAX    v25.4s, v25.4s, v6.4s
328        FMAX    v26.4s, v26.4s, v6.4s
329        FMAX    v27.4s, v27.4s, v6.4s
330        SUBS    x1, x1, 8               // nc -= 8; these flags drive both B.LO 6f and B.HI 0b below
331        FMIN    v20.4s, v20.4s, v7.4s   // FMIN against v7: v7 acts as the upper bound
332        FMIN    v21.4s, v21.4s, v7.4s
333        FMIN    v22.4s, v22.4s, v7.4s
334        FMIN    v23.4s, v23.4s, v7.4s
335        FMIN    v24.4s, v24.4s, v7.4s
336        FMIN    v25.4s, v25.4s, v7.4s
337        FMIN    v26.4s, v26.4s, v7.4s
338        FMIN    v27.4s, v27.4s, v7.4s
339
340        # Store full 4 x 8
341        B.LO    6f                      // fewer than 8 columns were left: partial-width store
342
343        ST1     {v20.16b, v21.16b},  [x6], x0   // c0 += cn_stride
344        SUB     x3,  x3, x2             // a0 -= kc
345        ST1     {v22.16b, v23.16b}, [x16], x0   // c1 += cn_stride
346        SUB     x9,  x9, x2             // a1 -= kc
347        ST1     {v24.16b, v25.16b}, [x17], x0   // c2 += cn_stride
348        SUB     x10, x10, x2            // a2 -= kc
349        ST1     {v26.16b, v27.16b}, [x14], x0   // c3 += cn_stride
350        SUB     x11, x11, x2            // a3 -= kc
351
352        B.HI    0b                      // columns remain after this tile: start the next one
353
354        # Restore d12-d15 from stack
355        LDP     d14, d15, [sp, 16]
356        LDP     d12, d13, [sp], 32      // sp += 32, undoing the pre-decrement of the save
357        RET
358
3594:
        // Reached when kc has fewer than 16 bytes in total, or when low bits of k
        // remain after the epilogue. Bits 3 and 2 of x0 select the 2-float and
        // 1-float steps.
360        # Is there a remainder?- 2 floats of A (8 bytes)
361        TBZ     x0, 3, 5f               // bit 3 clear: no 2-float step
362
363        # Remainder- 2 floats of A (8 bytes)
364        LDR     d0,  [x3], 8            // a0 -> v0 lanes 0-1
365        LDR     q16, [x5], 16
366        LD1     {v0.d}[1], [x9], 8      // a1 -> v0 lanes 2-3
367        LDR     d1, [x10], 8            // a2 -> v1 lanes 0-1
368        LD1     {v1.d}[1], [x11], 8     // a3 -> v1 lanes 2-3
369        LDR     q17, [x5], 16
370        LDR     q18, [x5], 16
371        LDR     q19, [x5], 16
372        FMLA    v20.4s, v16.4s,  v0.s[0]
373        FMLA    v22.4s, v16.4s,  v0.s[2]
374        FMLA    v24.4s, v16.4s,  v1.s[0]
375        FMLA    v26.4s, v16.4s,  v1.s[2]
376        FMLA    v21.4s, v17.4s,  v0.s[0]
377        FMLA    v23.4s, v17.4s,  v0.s[2]
378        FMLA    v25.4s, v17.4s,  v1.s[0]
379        FMLA    v27.4s, v17.4s,  v1.s[2]
380
381        FMLA    v20.4s, v18.4s,  v0.s[1]
382        FMLA    v22.4s, v18.4s,  v0.s[3]
383        FMLA    v24.4s, v18.4s,  v1.s[1]
384        FMLA    v26.4s, v18.4s,  v1.s[3]
385        FMLA    v21.4s, v19.4s,  v0.s[1]
386        FMLA    v23.4s, v19.4s,  v0.s[3]
387        FMLA    v25.4s, v19.4s,  v1.s[1]
388        FMLA    v27.4s, v19.4s,  v1.s[3]
389
390        # Is there a remainder?- 1 float of A (4 bytes)
391        TBZ     x0, 2, 3b               // bit 2 clear: nothing left, go clamp and store
392
3935:
394        # Remainder- 1 float of A (4 bytes)
395        LDR     s0,  [x3], 4            // a0 -> v0 lane 0
396        LDR     q16, [x5], 16
397        LD1     {v0.s}[2], [x9], 4      // a1 -> v0 lane 2
398        LDR     s1, [x10], 4            // a2 -> v1 lane 0
399        LD1     {v1.s}[2], [x11], 4     // a3 -> v1 lane 2
400        LDR     q17, [x5], 16
401
402        FMLA    v20.4s, v16.4s,  v0.s[0]
403        FMLA    v22.4s, v16.4s,  v0.s[2]
404        FMLA    v24.4s, v16.4s,  v1.s[0]
405        FMLA    v26.4s, v16.4s,  v1.s[2]
406        FMLA    v21.4s, v17.4s,  v0.s[0]
407        FMLA    v23.4s, v17.4s,  v0.s[2]
408        FMLA    v25.4s, v17.4s,  v1.s[0]
409        FMLA    v27.4s, v17.4s,  v1.s[2]
410        B       3b
411
412        # Store odd width
4136:
        // x1 holds nc - 8 here; subtracting 8 leaves bits 0-2 unchanged, so they
        // still encode how many of the 8 columns to write (4, then 2, then 1).
        // After each partial store the not-yet-written lanes are moved down to
        // lane 0 of the even accumulators.
414        TBZ     x1, 2, 7f               // bit 2 clear: no 4-column store
415        STR     q20,  [x6], 16
416        MOV     v20.16b, v21.16b
417        STR     q22, [x16], 16
418        MOV     v22.16b, v23.16b
419        STR     q24, [x17], 16
420        MOV     v24.16b, v25.16b
421        STR     q26, [x14], 16
422        MOV     v26.16b, v27.16b
423
4247:
425        TBZ     x1, 1, 8f               // bit 1 clear: no 2-column store
426        STR     d20,  [x6], 8
427        STR     d22, [x16], 8
428        DUP     d20, v20.d[1]           // bring the upper 64 bits down for the 1-column store
429        DUP     d22, v22.d[1]
430        STR     d24, [x17], 8
431        STR     d26, [x14], 8
432        DUP     d24, v24.d[1]
433        DUP     d26, v26.d[1]
434
4358:
436        TBZ     x1, 0, 9f               // bit 0 clear: no 1-column store
437        STR     s20,  [x6]              // last store of the row: no pointer update
438        STR     s22, [x16]
439        STR     s24, [x17]
440        STR     s26, [x14]
4419:
442        # Restore d12-d15 from stack
443        LDP     d14, d15, [sp, 16]
444        LDP     d12, d13, [sp], 32      // sp += 32, undoing the pre-decrement of the save
445        RET
446
447END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53
448
449#ifdef __ELF__
450.section ".note.GNU-stack","",%progbits
451#endif
452