// Source: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

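# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
#
# Right column: register holding the argument. [sp + n] -> xN is a stack
# argument that the kernel loads into xN. xA / xB is an argument passed in xA
# whose working copy (the loop counter) lives in xB. (x8) holds the params
# pointer only until the clamp bounds are loaded; x8 is then reused as a3.
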
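# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel uses x19 and d12-d15 and saves/restores them on the stack.

# A pointers
# x13 a0
# x14 a1
# x15 a2
#  x8 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3

# x19 temporary vector shadow register: receives the upper 64 bits of a vector
#     load through a GPR load, then INS merges it into the vector register.
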
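# Vector register usage (vN[1] denotes the upper 64-bit half of vN)
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7 (v6 = lower bound used by FMAX, v7 = upper bound used by FMIN)

# The entries below are not used by this 4x8 kernel: v2, v5, v8-v11 and
# v28-v31 are never referenced, and x12, x4, x13, x7 hold zero, a, a0 and c3
# here. They appear to be left over from a 6-row variant of the template -
# TODO confirm.
# unused A   v8 v9 v10 v11
# x12 a4
#  x4 a5
# x13 c4
#  x7 c5
# A4  v2     v5
# A5  v2[1]  v5[1]
# C   v28 v29
# C   v30 v31
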
// xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53
//
// Indirect f32 GEMM micro-kernel: produces tiles of up to 4 rows x 8 columns,
// clamped to the range given by params, until nc columns have been written.
//
// Register roles:
//   x0   mr on entry; afterwards k, the remaining bytes of the current kc pass
//   x1   nc, remaining output columns
//   x2   kc in bytes (consumed 16 bytes = 4 floats per row at a time)
//   x3   ks, byte size of the A pointer list used per tile; x9 is its loop copy
//   x4   a, cursor into the list of A row pointers
//   x5   w, cursor into the weights (8 bias floats, then 8 floats per k step)
//   x6, x16, x17, x7    c0..c3 output row pointers
//   x13, x14, x15, x8   a0..a3 input row pointers (x8 first carries params)
//   x10  cn_stride, x11 a_offset, x12 zero
//   x19  GPR staging for the upper 64 bits of vector loads
//   v6 / v7   clamp lower / upper bound, v20-v27 accumulators
//
// Clobbers the condition flags. x19 and d12-d15 are callee-saved and are
// saved to / restored from a 48-byte stack frame.
//
// The instruction order inside the prologue, main loop and epilogue is
// scheduled for Cortex-A53 (see the BLOCK notes); do not reorder it casually.
BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53

        // Clamp C pointers: a row index >= mr reuses the previous row pointer.
        // Rows are stored from c3 down to c0, so when pointers alias, the
        // lowest row is written last.
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        // Load cn_stride, a_offset (stack arguments, read before sp is moved)
        LDP     x10, x11, [sp]

        // Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        // Load min/max values: first float broadcast to v6, second to v7
        LD2R    {v6.4s, v7.4s}, [x8]

        // Save x19, d12-d15 on stack
        STP     d12, d13, [sp, -48]!
        STP     d14, d15, [sp, 16]
        STR     x19,      [sp, 32]

0:
        // nc loop entry: one pass per tile of 8 output columns.
        // Load initial bias from w into accumulators (same bias for all rows)
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b

        MOV     x9, x3                  // p = ks

1:
        // ks loop entry.
        // Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x8, [x4], 16

        // A pointer equal to zero is used as is; any other gets a_offset added.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x11           // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x11           // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x11           // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x8, x12                 // if a3 == zero
        ADD     x8, x8, x11             // a3 += a_offset
        CSEL    x8, x12, x8, EQ         //   a3 = zero, else a3 + a_offset

        // Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        // Prologue - First group loads, no FMA
        LDR     d0, [x13], 8            // a0
        LDP     q16, q17, [x5], 32      // b
        LDR     d1, [x15], 8            // a2
        LD1     {v0.d}[1],  [x14], 8    // a1
        LD1     {v1.d}[1], [x8], 8      // a3
        SUBS    x0, x0, 16
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x19, [x5], 8            // ins is in BLOCK 0

        // Are there at least 4 floats (16 bytes) for main loop?
        // (flags come from the SUBS above; the loads in between leave them alone)
        B.LO    3f

        // Main loop - 4 floats of A (16 bytes)
        // 32 FMA + 8 LD64 A + 8 LDR B
2:
        // First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR     d3, [x13], 8            // a0
        INS     v19.d[1], x19           // b from second group
        FMLA    v20.4s, v16.4s,  v0.s[0]
        LDR     x19, [x14], 8           // a1
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x19            // a1 ins
        FMLA    v26.4s, v16.4s,  v1.s[2]
        LDR     x19, [x5, 8]            // b
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR     d4, [x15], 8            // a2
        INS     v12.d[1], x19           // b  ins
        FMLA    v25.4s, v17.4s,  v1.s[0]
        LDR     x19, [x8], 8            // a3
        FMLA    v27.4s, v17.4s,  v1.s[2]
        FMLA    v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR     d13, [x5, 16]
        INS     v4.d[1], x19            // a3 ins
        FMLA    v22.4s, v18.4s,  v0.s[3]
        LDR     x19, [x5, 24]
        FMLA    v24.4s, v18.4s,  v1.s[1]
        FMLA    v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR     d14, [x5, 32]
        INS     v13.d[1], x19           // b
        FMLA    v21.4s, v19.4s,  v0.s[1]
        LDR     x19, [x5, 40]
        FMLA    v23.4s, v19.4s,  v0.s[3]
        FMLA    v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR     d15, [x5, 48]
        INS     v14.d[1], x19           // b from previous
        FMLA    v27.4s, v19.4s,  v1.s[3]
        LDR     x19, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        // Second group of 16 FMA, First group of loads
        // BLOCK 0
        LDR     d0, [x13], 8            // a0
        INS     v15.d[1], x19           // b from previous
        FMLA    v20.4s, v12.4s,  v3.s[0]
        LDR     x19, [x14], 8           // a1
        FMLA    v22.4s, v12.4s,  v3.s[2]
        FMLA    v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x19            // a1 ins
        FMLA    v26.4s, v12.4s,  v4.s[2]
        LDR     x19, [x5, 72]           // b
        FMLA    v21.4s, v13.4s,  v3.s[0]
        FMLA    v23.4s, v13.4s,  v3.s[2]

        // BLOCK 2
        LDR     d1, [x15], 8            // a2
        INS     v16.d[1], x19           // b
        FMLA    v25.4s, v13.4s,  v4.s[0]
        LDR     x19, [x8], 8            // a3
        FMLA    v27.4s, v13.4s,  v4.s[2]
        FMLA    v20.4s, v14.4s,  v3.s[1]

        // BLOCK 3
        LDR     d17, [x5, 80]
        INS     v1.d[1], x19            // a3 ins
        FMLA    v22.4s, v14.4s,  v3.s[3]
        LDR     x19, [x5, 88]
        FMLA    v24.4s, v14.4s,  v4.s[1]
        FMLA    v26.4s, v14.4s,  v4.s[3]

        // BLOCK 4
        LDR     d18, [x5, 96]
        INS     v17.d[1], x19           // b
        FMLA    v21.4s, v15.4s,  v3.s[1]
        LDR     x19, [x5, 104]
        FMLA    v23.4s, v15.4s,  v3.s[3]
        FMLA    v25.4s, v15.4s,  v4.s[1]

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR     d19, [x5, 112]
        INS     v18.d[1], x19
        FMLA    v27.4s, v15.4s,  v4.s[3]
        LDR     x19, [x5, 120]
        SUBS    x0, x0, 16
        ADD     x5, x5, 128             // 128 bytes of B consumed per iteration
        B.HS    2b

        // Epilogue - 4 floats of A (16 bytes)
        // 32 FMA + 8 LD64 A + 8 LDR B
3:
        // First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR     d3, [x13], 8            // a0
        INS     v19.d[1], x19           // b from second group
        FMLA    v20.4s, v16.4s,  v0.s[0]
        LDR     x19, [x14], 8           // a1
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x19            // a1 ins
        FMLA    v26.4s, v16.4s,  v1.s[2]
        LDR     x19, [x5, 8]            // b
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR     d4, [x15], 8            // a2
        INS     v12.d[1], x19           // b  ins
        FMLA    v25.4s, v17.4s,  v1.s[0]
        LDR     x19, [x8], 8            // a3
        FMLA    v27.4s, v17.4s,  v1.s[2]
        FMLA    v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR     d13, [x5, 16]
        INS     v4.d[1], x19            // a3 ins
        FMLA    v22.4s, v18.4s,  v0.s[3]
        LDR     x19, [x5, 24]
        FMLA    v24.4s, v18.4s,  v1.s[1]
        FMLA    v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR     d14, [x5, 32]
        INS     v13.d[1], x19           // b
        FMLA    v21.4s, v19.4s,  v0.s[1]
        LDR     x19, [x5, 40]
        FMLA    v23.4s, v19.4s,  v0.s[3]
        FMLA    v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR     d15, [x5, 48]
        INS     v14.d[1], x19
        FMLA    v27.4s, v19.4s,  v1.s[3]
        LDR     x19, [x5, 56]
        NOP     // fma
        NOP
        NOP     // fma
        NOP

        // Second group of 16 FMA, no loads
        // BLOCK 0
        INS     v15.d[1], x19           // b from previous
        FMLA    v20.4s, v12.4s,  v3.s[0]
        FMLA    v22.4s, v12.4s,  v3.s[2]
        FMLA    v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA    v26.4s, v12.4s,  v4.s[2]
        FMLA    v21.4s, v13.4s,  v3.s[0]
        FMLA    v23.4s, v13.4s,  v3.s[2]

        // BLOCK 2
        FMLA    v25.4s, v13.4s,  v4.s[0]
        FMLA    v27.4s, v13.4s,  v4.s[2]
        FMLA    v20.4s, v14.4s,  v3.s[1]

        // BLOCK 3
        FMLA    v22.4s, v14.4s,  v3.s[3]
        FMLA    v24.4s, v14.4s,  v4.s[1]
        FMLA    v26.4s, v14.4s,  v4.s[3]

        // BLOCK 4
        FMLA    v21.4s, v15.4s,  v3.s[1]
        FMLA    v23.4s, v15.4s,  v3.s[3]
        FMLA    v25.4s, v15.4s,  v4.s[1]
        ADD     x5, x5, 64              // skip the 64 bytes of B read at [x5, 0..56]

        // BLOCK 5
        FMLA    v27.4s, v15.4s,  v4.s[3]

4:
        // Only multiples of 16 were subtracted from kc to get x0, so the low
        // 4 bits of x0 still equal those of kc.
        // Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ    x0, 3, 6f
        // Is there a remainder?- 1 float of A (4 bytes)
        TBNZ    x0, 2, 7f
5:
        // ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
        B.HI    1b

        // Clamp: raise to the lower bound in v6, then cap at the upper bound in v7
        FMAX    v20.4s, v20.4s, v6.4s
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s

        // Store full 4 x 8
        SUBS    x1, x1, 8
        B.LO    8f

        // Highest row first, c0 last; each row pointer advances by cn_stride.
        STP     q26, q27, [x7]
        ADD     x7, x7, x10
        STP     q24, q25, [x17]
        ADD     x17, x17, x10
        STP     q22, q23, [x16]
        ADD     x16, x16, x10
        STP     q20, q21,  [x6]
        ADD     x6,  x6, x10

        SUB     x4, x4, x3              // a -= ks

        // nc loop
        // (flags are still those of SUBS x1, x1, 8: STP, ADD and SUB do not
        // set them)
        B.HI    0b

        // Restore x19, d12-d15 from stack
        LDR     x19,      [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 48
        RET

        // Remainder - 2 floats of A (8 bytes)
        // 16 FMA + 4 LD64 A + 2 LDP B
6:
        LDR     d0,  [x13], 8
        LDP     q16,  q17, [x5], 32
        LD1     {v0.d}[1], [x14], 8
        LDR     d1, [x15], 8
        LD1     {v1.d}[1], [x8], 8
        LDP     q18,  q19, [x5], 32
        FMLA    v20.4s, v16.4s,  v0.s[0]
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]
        FMLA    v26.4s, v16.4s,  v1.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]
        FMLA    v25.4s, v17.4s,  v1.s[0]
        FMLA    v27.4s, v17.4s,  v1.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[1]
        FMLA    v22.4s, v18.4s,  v0.s[3]
        FMLA    v24.4s, v18.4s,  v1.s[1]
        FMLA    v26.4s, v18.4s,  v1.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[1]
        FMLA    v23.4s, v19.4s,  v0.s[3]
        FMLA    v25.4s, v19.4s,  v1.s[1]
        FMLA    v27.4s, v19.4s,  v1.s[3]

        // Is there a remainder?- 1 float of A (4 bytes)
        TBZ     x0, 2, 5b

7:
        // Remainder- 1 float of A (4 bytes)
        // a0/a2 go to lane 0 and a1/a3 to lane 2, matching the lane indices
        // used by the FMLAs below.
        LDR     s0,  [x13], 4
        LDP     q16,  q17, [x5], 32
        LD1     {v0.s}[2], [x14], 4
        LDR     s1, [x15], 4
        LD1     {v1.s}[2], [x8], 4

        FMLA    v20.4s, v16.4s,  v0.s[0]
        FMLA    v22.4s, v16.4s,  v0.s[2]
        FMLA    v24.4s, v16.4s,  v1.s[0]
        FMLA    v26.4s, v16.4s,  v1.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[0]
        FMLA    v23.4s, v17.4s,  v0.s[2]
        FMLA    v25.4s, v17.4s,  v1.s[0]
        FMLA    v27.4s, v17.4s,  v1.s[2]
        B       5b

        // Store odd width (fewer than 8 columns left).
        // x1 = nc - 8 here, so its low 3 bits equal those of nc: bit 2 selects
        // a 4-float store, bit 1 a 2-float store, bit 0 a 1-float store. After
        // each partial store the not yet written lanes are moved down.
8:
        TBZ     x1, 2, 9f
        STR     q26,  [x7], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
9:
        TBZ     x1, 1, 10f
        STR     d26,  [x7], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s26,  [x7]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20,  [x6]
11:
        // Restore x19, d12-d15 from stack
        LDR     x19,      [sp, 32]
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not treat this object as requiring an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif

475