xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x0)
22#     const float*restrict acc,  [sp + 8] -> x15
23#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# A pointers
28#  x3 a0
29#  x9 a1
30# x10 a2
31# x11 a3
32
33# C pointers
34#  x6 c0
35# x16 c1
36# x17 c2
37# x14 c3
38
39# x4 temporary vector shadow register
40
41# Vector register usage
42# A0  v0     v3
43# A1  v0[1]  v3[1]
44# A2  v1     v4
45# A3  v1[1]  v4[1]
46
47# B   v12 v13 v14 v15 second set of B
48# B   v16 v17 v18 v19 first set
49# C   v20 v21
50# C   v22 v23
51# C   v24 v25
52# C   v26 v27
53# Clamp v6 v7
54
55# Not used by this 4-row kernel: A v8 v9 v10 v11, plus the a4/c4/c5, A4/A5 and v28-v31 entries listed below
56# x12 a4
57# x13 c4
58#  x7 c5
59# A4  v2     v5
60# A5  v2[1]  v5[1]
61# C   v28 v29
62# C   v30 v31
63
64BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53
65
        # Overview: each pass over label 0 produces up to 4 rows x 8 columns
        # of f32 output. Accumulators v20-v27 are seeded from acc (x15),
        # updated by FMLA of the A rows (x3, x9, x10, x11) with the weights
        # at w (x5), clamped with v6/v7 and stored through c0-c3
        # (x6, x16, x17, x14). kc (x2) and the counter x0 are byte counts:
        # 16 bytes correspond to 4 floats of each A row.
        #
66        # Load acc, params pointer
        # (second and third stack arguments; cn_stride at [sp] is read
        # later, at label 3)
67        LDP     x15, x8, [sp, 8]
68
69        # Clamp A and C pointers
        # Rows at index >= mr reuse the pointers of the previous row.
70        CMP     x0, 2                   // if mr < 2
71        ADD     x9, x3, x4              // a1 = a0 + a_stride
72        ADD     x16, x6, x7             // c1 = c0 + cm_stride
73        CSEL    x9, x3, x9, LO          //   a1 = a0
74        CSEL    x16, x6, x16, LO        //   c1 = c0
75
        # The LS tests below still use the flags of CMP x0, 2 above:
        # ADD and CSEL do not modify the flags.
76        ADD     x10, x9, x4             // a2 = a1 + a_stride
77        ADD     x17, x16, x7            // c2 = c1 + cm_stride
78                                        // if mr <= 2
79        CSEL    x10, x9, x10, LS        //   a2 = a1
80        CSEL    x17, x16, x17, LS       //   c2 = c1
81
82        CMP     x0, 4                   // if mr < 4
83        ADD     x11, x10, x4            // a3 = a2 + a_stride
84        ADD     x14, x17, x7            // c3 = c2 + cm_stride
85        CSEL    x11, x10, x11, LO       //   a3 = a2
86        CSEL    x14, x17, x14, LO       //   c3 = c2
        # From here on x0 (mr) and x4 (a_stride) are no longer read as
        # arguments: x0 becomes the k counter and x4 a scratch register.
87
88        # Load min/max values
        # LD2R reads two consecutive floats at params and replicates the
        # first into every lane of v6 and the second into every lane of v7.
        # v6 is the FMAX operand (lower bound) and v7 the FMIN operand
        # (upper bound) in the clamp at label 3.
89        LD2R    {v6.4s, v7.4s}, [x8]
90
91        # Save d12-d15 on stack
        # v12-v15 hold the second set of B and d8-d15 must be preserved
        # (see the file header). The pre-decrement lowers sp by 32, so the
        # stack arguments are 32 bytes further from sp from here on.
92        STP     d12, d13, [sp, -32]!
93        STP     d14, d15, [sp, 16]
94
        # Label 0: start of one pass over 8 output columns. Re-entered from
        # the B.HI after the full-width store while columns remain.
950:
96        # Load initial accumulators
        # 4 rows x 8 floats (128 bytes) are consumed from acc per pass;
        # x15 is post-incremented and keeps advancing across passes.
97        LDP     q20, q21, [x15], 32
98        LDP     q22, q23, [x15], 32
99        LDP     q24, q25, [x15], 32
100        LDP     q26, q27, [x15], 32
101
102        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
103        SUBS    x0, x2, 16              // k = kc - 16
104        B.LO    4f
105
106        # Prologue - First group loads, no FMA
        # Resulting lane layout (2 floats per row):
        #   v0 = { a0[0], a0[1], a1[0], a1[1] }
        #   v1 = { a2[0], a2[1], a3[0], a3[1] }
        # so lanes s[0]/s[1] select rows 0 and 2, lanes s[2]/s[3] select
        # rows 1 and 3 in the FMLAs below.
        # B: v16, v17, v18 are loaded whole; the low half of v19 is loaded
        # directly and its high half is parked in x4 until the INS at the
        # top of label 1 or label 2.
107        LDR     d0, [x3], 8              // a0
108        LDP     q16, q17, [x5], 32         // b
109        LDR     d1, [x10], 8             // a2
110        LD1     {v0.d}[1],  [x9], 8       // a1
111        LD1     {v1.d}[1], [x11], 8       // a3
        # Flags from this SUBS are consumed by the B.LO 2f below; the loads
        # in between do not modify them.
112        SUBS    x0, x0, 16
113        LDR     q18, [x5], 16
114        LDR     d19, [x5], 8
115        LDR     x4, [x5], 8             // ins is in BLOCK 0
116
117        # Is there at least 4 floats (16 bytes) for main loop?
118        B.LO    2f
119
120        # Main loop - 4 floats of A (16 bytes)
121        # 32 FMA + 8 LD64 A + 8 LDR B
        # Each iteration reads 16 bytes from every A row and 128 bytes of B
        # at offsets 0..120 from x5; x5 is advanced once, at the bottom.
        # Upper 64-bit halves of vectors are loaded into x4 and moved in
        # with INS one block later.
        # NOTE(review): presumably the split loads and the block structure
        # are what lets loads issue alongside FMLA on Cortex-A53 - confirm
        # against the template before reordering anything here.
1221:
123        # First group of 16 FMA, Second group loads
        # v0/v1 and v16-v19 feed the FMLAs while v3/v4 (A) and v12-v15 (B)
        # are filled for the second group.
124        # BLOCK 0
125        LDR     d3, [x3], 8              // a0
126        INS     v19.d[1], x4               // b from second group
127        FMLA    v20.4s, v16.4s,  v0.s[0]
128        LDR     x4, [x9], 8               // a1
129        FMLA    v22.4s, v16.4s,  v0.s[2]
130        FMLA    v24.4s, v16.4s,  v1.s[0]
131
132        # BLOCK 1
133        LDR     d12, [x5]
134        INS     v3.d[1], x4                // a1 ins
135        FMLA    v26.4s, v16.4s,  v1.s[2]
136        LDR     x4, [x5, 8]             // b
137        FMLA    v21.4s, v17.4s,  v0.s[0]
138        FMLA    v23.4s, v17.4s,  v0.s[2]
139
140        # BLOCK 2
141        LDR     d4, [x10], 8             // a2
142        INS     v12.d[1], x4            // b  ins
143        FMLA    v25.4s, v17.4s,  v1.s[0]
144        LDR     x4, [x11], 8              // a3
145        FMLA    v27.4s, v17.4s,  v1.s[2]
146        FMLA    v20.4s, v18.4s,  v0.s[1]
147
148        # BLOCK 3
149        LDR     d13, [x5, 16]
150        INS     v4.d[1], x4                // a3 ins
151        FMLA    v22.4s, v18.4s,  v0.s[3]
152        LDR     x4, [x5, 24]
153        FMLA    v24.4s, v18.4s,  v1.s[1]
154        FMLA    v26.4s, v18.4s,  v1.s[3]
155
156        # BLOCK 4
157        LDR     d14, [x5, 32]
158        INS     v13.d[1], x4            // b
159        FMLA    v21.4s, v19.4s,  v0.s[1]
160        LDR     x4, [x5, 40]
161        FMLA    v23.4s, v19.4s,  v0.s[3]
162        FMLA    v25.4s, v19.4s,  v1.s[1]
163
164        # BLOCK 5
165        # NOPs to ensure 4 cycle LDR lands on next LDR
166        LDR     d15, [x5, 48]
167        INS     v14.d[1], x4            // b from previous
168        FMLA    v27.4s, v19.4s,  v1.s[3]
169        LDR     x4, [x5, 56]
170        NOP
171        NOP
172        NOP
173        NOP
174
175        # Second group of 16 FMA, First group of loads
        # Roles swap: v3/v4 and v12-v15 feed the FMLAs while v0/v1 and
        # v16-v19 are refilled for the next iteration or for the epilogue.
176        # BLOCK 0
177        LDR     d0, [x3], 8              // a0
178        INS     v15.d[1], x4            // b from previous
179        FMLA    v20.4s, v12.4s,  v3.s[0]
180        LDR     x4, [x9], 8               // a1
181        FMLA    v22.4s, v12.4s,  v3.s[2]
182        FMLA    v24.4s, v12.4s,  v4.s[0]
183
184        # BLOCK 1
185        LDR     d16, [x5, 64]
186        INS     v0.d[1], x4                // a1 ins
187        FMLA    v26.4s, v12.4s,  v4.s[2]
188        LDR     x4, [x5, 72]            // b
189        FMLA    v21.4s, v13.4s,  v3.s[0]
190        FMLA    v23.4s, v13.4s,  v3.s[2]
191
192        # BLOCK 2
193        LDR     d1, [x10], 8             // a2
194        INS     v16.d[1], x4            // b
195        FMLA    v25.4s, v13.4s,  v4.s[0]
196        LDR     x4, [x11], 8              // a3
197        FMLA    v27.4s, v13.4s,  v4.s[2]
198        FMLA    v20.4s, v14.4s,  v3.s[1]
199
200        # BLOCK 3
201        LDR     d17, [x5, 80]
202        INS     v1.d[1], x4                // a3 ins
203        FMLA    v22.4s, v14.4s,  v3.s[3]
204        LDR     x4, [x5, 88]
205        FMLA    v24.4s, v14.4s,  v4.s[1]
206        FMLA    v26.4s, v14.4s,  v4.s[3]
207
208        # BLOCK 4
209        LDR     d18, [x5, 96]
210        INS     v17.d[1], x4            // b
211        FMLA    v21.4s, v15.4s,  v3.s[1]
212        LDR     x4, [x5, 104]
213        FMLA    v23.4s, v15.4s,  v3.s[3]
214        FMLA    v25.4s, v15.4s,  v4.s[1]
215
216        # BLOCK 5
217        # NOTE that block needs to be 4 cycles for LDR not to stall
        # x4 leaves this block holding the high half of v19; it is inserted
        # by the first INS of label 1 or label 2. SUBS sets the flags for
        # B.HS and the following ADD leaves them untouched.
218        LDR     d19, [x5, 112]
219        INS     v18.d[1], x4
220        FMLA    v27.4s, v15.4s,  v4.s[3]
221        LDR     x4, [x5, 120]
222        SUBS    x0, x0, 16
223        ADD     x5, x5, 128
224        B.HS    1b
225
226        # Epilogue - 4 floats of A (16 bytes)
227        # 32 FMA + 8 LD64 A + 8 LDR B
        # Same first group as the main loop, but the second group issues no
        # loads, so only 64 bytes of B are read here (see ADD x5, x5, 64).
2282:
229        # First group of 16 FMA, Second group loads
230        # BLOCK 0
231        LDR     d3, [x3], 8              // a0
232        INS     v19.d[1], x4               // b from second group
233        FMLA    v20.4s, v16.4s,  v0.s[0]
234        LDR     x4, [x9], 8               // a1
235        FMLA    v22.4s, v16.4s,  v0.s[2]
236        FMLA    v24.4s, v16.4s,  v1.s[0]
237
238        # BLOCK 1
239        LDR     d12, [x5]
240        INS     v3.d[1], x4                // a1 ins
241        FMLA    v26.4s, v16.4s,  v1.s[2]
242        LDR     x4, [x5, 8]             // b
243        FMLA    v21.4s, v17.4s,  v0.s[0]
244        FMLA    v23.4s, v17.4s,  v0.s[2]
245
246        # BLOCK 2
247        LDR     d4, [x10], 8             // a2
248        INS     v12.d[1], x4            // b  ins
249        FMLA    v25.4s, v17.4s,  v1.s[0]
250        LDR     x4, [x11], 8              // a3
251        FMLA    v27.4s, v17.4s,  v1.s[2]
252        FMLA    v20.4s, v18.4s,  v0.s[1]
253
254        # BLOCK 3
255        LDR     d13, [x5, 16]
256        INS     v4.d[1], x4                // a3 ins
257        FMLA    v22.4s, v18.4s,  v0.s[3]
258        LDR     x4, [x5, 24]
259        FMLA    v24.4s, v18.4s,  v1.s[1]
260        FMLA    v26.4s, v18.4s,  v1.s[3]
261
262        # BLOCK 4
263        LDR     d14, [x5, 32]
264        INS     v13.d[1], x4            // b
265        FMLA    v21.4s, v19.4s,  v0.s[1]
266        LDR     x4, [x5, 40]
267        FMLA    v23.4s, v19.4s,  v0.s[3]
268        FMLA    v25.4s, v19.4s,  v1.s[1]
269
270        # BLOCK 5
271        # NOPs to ensure 4 cycle LDR lands on next LDR
272        LDR     d15, [x5, 48]
273        INS     v14.d[1], x4
274        FMLA    v27.4s, v19.4s,  v1.s[3]
275        LDR     x4, [x5, 56]
276        NOP     // fma
277        NOP
278        NOP     // fma
279        NOP
280
281        # Second group of 16 FMA, no loads
282        # BLOCK 0
283        INS     v15.d[1], x4            // b from previous
284        FMLA    v20.4s, v12.4s,  v3.s[0]
285        FMLA    v22.4s, v12.4s,  v3.s[2]
286        FMLA    v24.4s, v12.4s,  v4.s[0]
287
288        # BLOCK 1
289        FMLA    v26.4s, v12.4s,  v4.s[2]
290        FMLA    v21.4s, v13.4s,  v3.s[0]
291        FMLA    v23.4s, v13.4s,  v3.s[2]
292
293        # BLOCK 2
294        FMLA    v25.4s, v13.4s,  v4.s[0]
295        FMLA    v27.4s, v13.4s,  v4.s[2]
296        FMLA    v20.4s, v14.4s,  v3.s[1]
297
298        # BLOCK 3
299        FMLA    v22.4s, v14.4s,  v3.s[3]
300        FMLA    v24.4s, v14.4s,  v4.s[1]
301        FMLA    v26.4s, v14.4s,  v4.s[3]
        # Only multiples of 16 were subtracted from x0, so its low 4 bits
        # equal those of kc; nonzero means fewer than 16 bytes of A remain.
        # The flags are consumed by B.NE 4f below; the FMLA and ADD in
        # between do not modify them.
302        TST     x0, 15
303
304        # BLOCK 4
305        FMLA    v21.4s, v15.4s,  v3.s[1]
306        FMLA    v23.4s, v15.4s,  v3.s[3]
307        FMLA    v25.4s, v15.4s,  v4.s[1]
308        ADD     x5, x5, 64
309
310        # BLOCK 5
311        FMLA    v27.4s, v15.4s,  v4.s[3]
312
313        # Is there a remainder?- 2 floats of A (8 bytes) or less
314        B.NE    4f
315
        # Label 3: clamp and store. Reached by fall-through and from the
        # remainder code at labels 4 and 5.
3163:
317        # Clamp
318        FMAX    v20.4s, v20.4s, v6.4s
319        # Load cn_stride
        # Offset 32 skips the d12-d15 save area to reach the first stack
        # argument. x0 is free here because the k loop has finished.
320        LDR     x0, [sp, 32]
321        FMAX    v21.4s, v21.4s, v6.4s
322        FMAX    v22.4s, v22.4s, v6.4s
323        FMAX    v23.4s, v23.4s, v6.4s
324        FMAX    v24.4s, v24.4s, v6.4s
325        FMAX    v25.4s, v25.4s, v6.4s
326        FMAX    v26.4s, v26.4s, v6.4s
327        FMAX    v27.4s, v27.4s, v6.4s
        # nc -= 8. The flags drive both B.LO 6f and B.HI 0b below; none of
        # the FMIN, ST1 or SUB instructions in between modify them.
328        SUBS    x1, x1, 8
329        FMIN    v20.4s, v20.4s, v7.4s
330        FMIN    v21.4s, v21.4s, v7.4s
331        FMIN    v22.4s, v22.4s, v7.4s
332        FMIN    v23.4s, v23.4s, v7.4s
333        FMIN    v24.4s, v24.4s, v7.4s
334        FMIN    v25.4s, v25.4s, v7.4s
335        FMIN    v26.4s, v26.4s, v7.4s
336        FMIN    v27.4s, v27.4s, v7.4s
337
338        # Store full 4 x 8
        # (taken only when at least 8 columns remained; otherwise branch
        # to the odd-width store at label 6)
339        B.LO    6f
340
        # Rows are written from c3 down to c0. Every C pointer advances by
        # cn_stride and every A pointer is rewound by kc for the next pass.
341        ST1     {v26.16b, v27.16b}, [x14], x0
342        SUB     x3,  x3, x2             // a0 -= kc
343        ST1     {v24.16b, v25.16b}, [x17], x0
344        SUB     x9,  x9, x2             // a1 -= kc
345        ST1     {v22.16b, v23.16b}, [x16], x0
346        SUB     x10, x10, x2            // a2 -= kc
347        ST1     {v20.16b, v21.16b},  [x6], x0
348        SUB     x11, x11, x2            // a3 -= kc
349
        # Loop while columns remain after this pass (nc was > 8).
350        B.HI    0b
351
352        # Restore d12-d15 from stack
353        LDP     d14, d15, [sp, 16]
354        LDP     d12, d13, [sp], 32
355        RET
356
        # Label 4: k remainder. Reached from label 0 when kc < 16 bytes, or
        # after the epilogue when TST found leftover bytes. Bits 3 and 2 of
        # x0 match those of kc (only multiples of 16 were subtracted).
3574:
358        # Is there a remainder?- 2 floats of A (8 bytes)
359        TBZ     x0, 3, 5f
360
361        # Remainder- 2 floats of A (8 bytes)
        # Same lane layout as the prologue; reads 64 bytes of B.
362        LDR     d0,  [x3], 8
363        LDR     q16, [x5], 16
364        LD1     {v0.d}[1], [x9], 8
365        LDR     d1, [x10], 8
366        LD1     {v1.d}[1], [x11], 8
367        LDR     q17, [x5], 16
368        LDR     q18, [x5], 16
369        LDR     q19, [x5], 16
370        FMLA    v20.4s, v16.4s,  v0.s[0]
371        FMLA    v22.4s, v16.4s,  v0.s[2]
372        FMLA    v24.4s, v16.4s,  v1.s[0]
373        FMLA    v26.4s, v16.4s,  v1.s[2]
374        FMLA    v21.4s, v17.4s,  v0.s[0]
375        FMLA    v23.4s, v17.4s,  v0.s[2]
376        FMLA    v25.4s, v17.4s,  v1.s[0]
377        FMLA    v27.4s, v17.4s,  v1.s[2]
378
379        FMLA    v20.4s, v18.4s,  v0.s[1]
380        FMLA    v22.4s, v18.4s,  v0.s[3]
381        FMLA    v24.4s, v18.4s,  v1.s[1]
382        FMLA    v26.4s, v18.4s,  v1.s[3]
383        FMLA    v21.4s, v19.4s,  v0.s[1]
384        FMLA    v23.4s, v19.4s,  v0.s[3]
385        FMLA    v25.4s, v19.4s,  v1.s[1]
386        FMLA    v27.4s, v19.4s,  v1.s[3]
387
388        # Is there a remainder?- 1 float of A (4 bytes)
389        TBZ     x0, 2, 3b
390
3915:
392        # Remainder- 1 float of A (4 bytes)
        # Rows 1 and 3 are loaded into lane 2 so the same lane indices
        # (s[0] and s[2]) select the rows as in the wider paths. Only
        # 32 bytes of B (v16, v17) are read.
393        LDR     s0,  [x3], 4
394        LDR     q16, [x5], 16
395        LD1     {v0.s}[2], [x9], 4
396        LDR     s1, [x10], 4
397        LD1     {v1.s}[2], [x11], 4
398        LDR     q17, [x5], 16
399
400        FMLA    v20.4s, v16.4s,  v0.s[0]
401        FMLA    v22.4s, v16.4s,  v0.s[2]
402        FMLA    v24.4s, v16.4s,  v1.s[0]
403        FMLA    v26.4s, v16.4s,  v1.s[2]
404        FMLA    v21.4s, v17.4s,  v0.s[0]
405        FMLA    v23.4s, v17.4s,  v0.s[2]
406        FMLA    v25.4s, v17.4s,  v1.s[0]
407        FMLA    v27.4s, v17.4s,  v1.s[2]
408        B       3b
409
410        # Store odd width
        # Here x1 holds (remaining nc - 8) after a borrow. Subtracting 8
        # leaves bits 0-2 unchanged, so bits 2, 1 and 0 of x1 say whether
        # 4, 2 and 1 columns are still to be written.
4116:
        # 4 columns: store the low vector of each row, then move the high
        # vector down so the narrower stores below read the next columns.
412        TBZ     x1, 2, 7f
413        STR     q26, [x14], 16
414        MOV     v26.16b, v27.16b
415        STR     q24, [x17], 16
416        MOV     v24.16b, v25.16b
417        STR     q22, [x16], 16
418        MOV     v22.16b, v23.16b
419        STR     q20,  [x6], 16
420        MOV     v20.16b, v21.16b
421
4227:
        # 2 columns: store the low 64 bits, then bring the high 64 bits
        # down into the low half for the single-column store.
423        TBZ     x1, 1, 8f
424        STR     d26, [x14], 8
425        STR     d24, [x17], 8
426        DUP     d26, v26.d[1]
427        DUP     d24, v24.d[1]
428        STR     d22, [x16], 8
429        STR     d20,  [x6], 8
430        DUP     d22, v22.d[1]
431        DUP     d20, v20.d[1]
432
4338:
        # 1 column: store the lowest float of each row.
434        TBZ     x1, 0, 9f
435        STR     s26, [x14]
436        STR     s24, [x17]
437        STR     s22, [x16]
438        STR     s20,  [x6]
4399:
440        # Restore d12-d15 from stack
441        LDP     d14, d15, [sp, 16]
442        LDP     d12, d13, [sp], 32
443        RET
444
445END_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53
446
// On ELF targets only, emit an empty .note.GNU-stack section. GNU toolchains
// read an empty flags string here as: this object does not need an
// executable stack.
447#ifdef __ELF__
448.section ".note.GNU-stack","",%progbits
449#endif
450