xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53(
13#     size_t mr,                         x0
14#     size_t nc,                         x1
15#     size_t kc,                         x2 / x0
16#     size_t ks,                         x3 / x9
17#     const float**restrict a,           x4
18#     const void*restrict w,             x5
19#     uint8_t*restrict c,                x6
20#     size_t cm_stride,                  x7
21#     size_t cn_stride,                  [sp] -> x10
22#     size_t a_offset,                   [sp + 8] -> x11
23#     const float* zero,                 [sp + 16] -> x12
24#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
25
26# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
27
28# A pointers
29# x13 a0
30# x14 a1
31# x15 a2
32#  x8 a3
33
34# C pointers
35#  x6 c0
36# x16 c1
37# x17 c2
38#  x7 c3
39
40# x19 temporary vector shadow register
41
42# Vector register usage
43# A0  v0     v3
44# A1  v0[1]  v3[1]
45# A2  v1     v4
46# A3  v1[1]  v4[1]
47
48# B   v12 v13 v14 v15 second set of B
49# B   v16 v17 v18 v19 first set
50# C   v20 v21
51# C   v22 v23
52# C   v24 v25
53# C   v26 v27
54# Clamp v6 v7
55
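56# unused A   v8 v9 v10 v11
57# x12 a4   (unused row; x12 holds the zero pointer in this kernel)
58#  x4 a5   (unused row; x4 is the A pointer array in this kernel)
59# x13 c4   (unused row; x13 is a0 in this kernel)
60#  x7 c5   (unused row; x7 is c3 in this kernel)
61# A4  v2     v5       (unused)
62# A5  v2[1]  v5[1]    (unused)
63# C   v28 v29         (unused)
64# C   v30 v31         (unused)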
65
66BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53
        // Overview
        //   Accumulates a tile of up to 4 rows x 8 f32 columns in v20-v27
        //   (row r lives in v(20+2r) for columns 0-3 and v(21+2r) for columns 4-7),
        //   clamps it with v6/v7 and stores it through c0-c3.
        //
        //   Loop nest:
        //     label 0  - one pass per group of 8 output columns (nc loop);
        //                accumulators are seeded from 32 bytes read at w.
        //     label 1  - one pass per group of 4 A row pointers (ks loop);
        //                x9 counts ks down in steps of 32 bytes.
        //     label 2  - main loop over kc, 16 bytes (4 floats) of every A row
        //                per pass; label 3 is its epilogue, labels 6 and 7 handle
        //                the 8-byte and 4-byte kc remainders.
        //
        //   x0 arrives as mr and is only needed for the C pointer clamping
        //   below; afterwards it is reused as the kc byte counter.
67
68        # Clamp C pointers
        // Rows at or beyond mr alias the previous row's pointer, so their
        // stores land on memory that a valid row also writes.
        // ADD does not modify flags, so the flags of "CMP x0, 2" feed both the
        // LO and the LS select below.
69        CMP     x0, 2                   // if mr < 2
70        ADD     x16, x6, x7             // c1 = c0 + cm_stride
71        CSEL    x16, x6, x16, LO        //   c1 = c0
72
73        ADD     x17, x16, x7            // c2 = c1 + cm_stride
74                                        // if mr <= 2
75        CSEL    x17, x16, x17, LS       //   c2 = c1
76
77        CMP     x0, 4                   // if mr < 4
78        ADD     x7, x17, x7             // c3 = c2 + cm_stride
79        CSEL    x7, x17, x7, LO         //   c3 = c2
80
        // The stack arguments are read here, before sp is moved by the
        // register save below, so the offsets are relative to the incoming sp.
81        # Load cn_stride, a_offset
82        LDP     x10, x11, [sp]
83
84        # Load zero, params pointer
85        LDP     x12, x8, [sp, 16]
86
        // LD2R reads two consecutive floats at [x8]: the first is replicated
        // into every lane of v6, the second into every lane of v7. v6 is later
        // the FMAX operand (lower bound), v7 the FMIN operand (upper bound).
87        # Load min/max values
88        LD2R    {v6.4s, v7.4s}, [x8]
89
        // 48-byte frame: d12/d13 at [sp], d14/d15 at [sp + 16], x19 at [sp + 32].
        // v12-v15 carry the second set of B and x19 is the 64-bit shadow
        // register, so these callee-saved registers must be preserved.
90        # Save x19, d12-d15 on stack
91        STP     d12, d13, [sp, -48]!
92        STP     d14, d15, [sp, 16]
93        STR     x19,      [sp, 32]
94
950:
        // nc loop head. Every row starts from the same 8 bias values, so
        // v20/v21 are loaded once and copied into the other three rows.
        // NOTE(review): on the first pass x13-x15 have not been written yet
        // and x8 still holds the params pointer, so the A prefetches below use
        // whatever those registers contain; on later passes they use the
        // pointers left over from the previous tile. Presumably harmless
        // because PRFM is only a hint - confirm.
96        # Load initial bias from w into accumulators
97        LDP     q20, q21, [x5], 32
98        MOV     v22.16b, v20.16b
99        PRFM    PLDL1KEEP,  [x13,  0]   // Prefetch A
100        PRFM    PLDL1KEEP,  [x13, 64]
101        MOV     v23.16b, v21.16b
102        PRFM    PLDL1KEEP,  [x14,  0]
103        PRFM    PLDL1KEEP,  [x14, 64]
104        MOV     v24.16b, v20.16b
105        PRFM    PLDL1KEEP, [x15,  0]
106        PRFM    PLDL1KEEP, [x15, 64]
107        MOV     v25.16b, v21.16b
108        PRFM    PLDL1KEEP, [x8,  0]
109        PRFM    PLDL1KEEP, [x8, 64]
110        MOV     v26.16b, v20.16b
111        PRFM    PLDL1KEEP, [x5,   0]    // Prefetch B
112        PRFM    PLDL1KEEP, [x5,  64]
113        MOV     v27.16b, v21.16b
114        PRFM    PLDL1KEEP, [x5, 128]
115        PRFM    PLDL1KEEP, [x5, 192]
116
117        MOV     x9, x3                  // p = ks
118
1191:
        // ks loop head. Reads four row pointers from the indirection array
        // and advances x4 by 32 bytes in total.
120        # Load next 4 A pointers
121        LDP     x13, x14, [x4], 16
122        LDP     x15, x8, [x4], 16
123
        // Each CMP tests the pointer as loaded, before a_offset is added; the
        // CSEL then keeps the unmodified zero pointer when they matched, so
        // a_offset is never applied to the zero buffer.
124        CMP     x13, x12                // if a0 == zero
125        ADD     x13, x13, x11           // a0 += a_offset
126        CSEL    x13, x12, x13, EQ       //   a0 = zero if it matched, else a0 + a_offset
127        CMP     x14, x12                // if a1 == zero
128        ADD     x14, x14, x11           // a1 += a_offset
129        CSEL    x14, x12, x14, EQ       //   a1 = zero if it matched, else a1 + a_offset
130        CMP     x15, x12                // if a2 == zero
131        ADD     x15, x15, x11           // a2 += a_offset
132        CSEL    x15, x12, x15, EQ       //   a2 = zero if it matched, else a2 + a_offset
133        CMP     x8, x12                 // if a3 == zero
134        ADD     x8, x8, x11             // a3 += a_offset
135        CSEL    x8, x12, x8, EQ         //   a3 = zero if it matched, else a3 + a_offset
136
        // x0 (formerly mr) becomes the kc byte counter. Only multiples of 16
        // are ever subtracted from it, so bits 3 and 2 of x0 always equal
        // those of kc; the remainder tests at label 4 rely on that.
137        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
138        SUBS    x0, x2, 16              // k = kc - 16
139        B.LO    4f
140
        // Register packing used from here on:
        //   v0 = { a0[k], a0[k+1], a1[k], a1[k+1] }   v1 = same for a2 / a3
        //   v16/v17 = B columns 0-3 / 4-7 for k, v18/v19 = same for k+1
        // so lanes s[0]/s[1] address rows 0 and 2, lanes s[2]/s[3] rows 1 and 3.
        // The upper half of v19 is parked in x19 and inserted by the first INS
        // of the main loop or of the epilogue.
        // The flags set by the SUBS below are consumed by "B.LO 3f"; the loads
        // in between do not modify flags.
141        # Prologue - First group loads, no FMA
142        LDR     d0, [x13], 8            // a0
143        LDP     q16, q17, [x5], 32        // b
144        LDR     d1, [x15], 8            // a2
145        LD1     {v0.d}[1],  [x14], 8     // a1
146        LD1     {v1.d}[1], [x8], 8       // a3
147        SUBS    x0, x0, 16
148        LDR     q18, [x5], 16
149        LDR     d19, [x5], 8
150        LDR     x19, [x5], 8            // ins is in BLOCK 0
151
152        # Is there at least 4 floats (16 bytes) for main loop?
153        B.LO    3f
154
        // Inside the loop every 128-bit operand is assembled from two 64-bit
        // loads: an LDR into the low half of the vector, an LDR into x19, and
        // an INS that copies x19 into the upper half one block later.
        // The first half multiplies with v0/v1 and v16-v19 while loading
        // v3/v4 and v12-v15 from [x5 + 0..56]; the second half multiplies with
        // v3/v4 and v12-v15 while reloading v0/v1 and v16-v19 from
        // [x5 + 64..120]. x5 advances by 128 once per pass.
155        # Main loop - 4 floats of A (16 bytes)
156        # 32 FMA + 8 LD64 A + 8 LDR B
1572:
158        # First group of 16 FMA, Second group loads
159        # BLOCK 0
160        LDR     d3, [x13], 8              // a0
161        INS     v19.d[1], x19               // b from second group
162        FMLA    v20.4s, v16.4s,  v0.s[0]
163        LDR     x19, [x14], 8              // a1
164        FMLA    v22.4s, v16.4s,  v0.s[2]
165        FMLA    v24.4s, v16.4s,  v1.s[0]
166
167        # BLOCK 1
168        LDR     d12, [x5]
169        INS     v3.d[1], x19                // a1 ins
170        FMLA    v26.4s, v16.4s,  v1.s[2]
171        LDR     x19, [x5, 8]            // b
172        FMLA    v21.4s, v17.4s,  v0.s[0]
173        FMLA    v23.4s, v17.4s,  v0.s[2]
174
175        # BLOCK 2
176        LDR     d4, [x15], 8              // a2
177        INS     v12.d[1], x19           // b  ins
178        FMLA    v25.4s, v17.4s,  v1.s[0]
179        LDR     x19, [x8], 8               // a3
180        FMLA    v27.4s, v17.4s,  v1.s[2]
181        FMLA    v20.4s, v18.4s,  v0.s[1]
182
183        # BLOCK 3
184        LDR     d13, [x5, 16]
185        INS     v4.d[1], x19                // a3 ins
186        FMLA    v22.4s, v18.4s,  v0.s[3]
187        LDR     x19, [x5, 24]
188        FMLA    v24.4s, v18.4s,  v1.s[1]
189        FMLA    v26.4s, v18.4s,  v1.s[3]
190
191        # BLOCK 4
192        LDR     d14, [x5, 32]
193        INS     v13.d[1], x19           // b
194        FMLA    v21.4s, v19.4s,  v0.s[1]
195        LDR     x19, [x5, 40]
196        FMLA    v23.4s, v19.4s,  v0.s[3]
197        FMLA    v25.4s, v19.4s,  v1.s[1]
198
199        # BLOCK 5
200        # NOPs to ensure 4 cycle LDR lands on next LDR
201        LDR     d15, [x5, 48]
202        INS     v14.d[1], x19           // b from previous
203        FMLA    v27.4s, v19.4s,  v1.s[3]
204        LDR     x19, [x5, 56]
205        NOP
206        NOP
207        NOP
208        NOP
209
210        # Second group of 16 FMA, First group of loads
211        # BLOCK 0
212        LDR     d0, [x13], 8              // a0
213        INS     v15.d[1], x19           // b from previous
214        FMLA    v20.4s, v12.4s,  v3.s[0]
215        LDR     x19, [x14], 8              // a1
216        FMLA    v22.4s, v12.4s,  v3.s[2]
217        FMLA    v24.4s, v12.4s,  v4.s[0]
218        PRFM    PLDL1KEEP, [x13, 128]      // Prefetch A0
219
220        # BLOCK 1
221        LDR     d16, [x5, 64]
222        INS     v0.d[1], x19               // a1 ins
223        FMLA    v26.4s, v12.4s,  v4.s[2]
224        LDR     x19, [x5, 72]           // b
225        FMLA    v21.4s, v13.4s,  v3.s[0]
226        FMLA    v23.4s, v13.4s,  v3.s[2]
227        PRFM    PLDL1KEEP, [x14, 128]      // Prefetch A1
228
229        # BLOCK 2
230        LDR     d1, [x15], 8             // a2
231        INS     v16.d[1], x19           // b
232        FMLA    v25.4s, v13.4s,  v4.s[0]
233        LDR     x19, [x8], 8             // a3
234        FMLA    v27.4s, v13.4s,  v4.s[2]
235        FMLA    v20.4s, v14.4s,  v3.s[1]
236        PRFM    PLDL1KEEP, [x15, 128]     // Prefetch A2
237
238        # BLOCK 3
239        LDR     d17, [x5, 80]
240        INS     v1.d[1], x19               // a3 ins
241        FMLA    v22.4s, v14.4s,  v3.s[3]
242        LDR     x19, [x5, 88]
243        FMLA    v24.4s, v14.4s,  v4.s[1]
244        FMLA    v26.4s, v14.4s,  v4.s[3]
245        PRFM    PLDL1KEEP, [x8, 128]     // Prefetch A3
246
247        # BLOCK 4
248        LDR     d18, [x5, 96]
249        INS     v17.d[1], x19           // b
250        FMLA    v21.4s, v15.4s,  v3.s[1]
251        LDR     x19, [x5, 104]
252        FMLA    v23.4s, v15.4s,  v3.s[3]
253        FMLA    v25.4s, v15.4s,  v4.s[1]
254        PRFM    PLDL1KEEP, [x5, 192]      // Prefetch B
255
        // The x19 loaded from [x5, 120] below is the upper half of v19; it is
        // inserted by BLOCK 0 of the next pass or of the epilogue.
        // PRFM and ADD leave the flags alone, so B.HS tests the SUBS result.
256        # BLOCK 5
257        # NOTE that block needs to be 4 cycles for LDR not to stall
258        LDR     d19, [x5, 112]
259        INS     v18.d[1], x19
260        FMLA    v27.4s, v15.4s,  v4.s[3]
261        LDR     x19, [x5, 120]
262        SUBS    x0, x0, 16
263        PRFM    PLDL1KEEP, [x5, 256]      // Prefetch B
264        ADD     x5, x5, 128
265        B.HS    2b
266
        // Same instruction pattern as the first half of the main loop,
        // followed by the second group of FMAs with no further loads.
        // Only [x5 + 0..56] is read, so x5 advances by 64 rather than 128.
267        # Epilogue - 4 floats of A (16 bytes)
268        # 32 FMA + 8 LD64 A + 8 LDR B
2693:
270        # First group of 16 FMA, Second group loads
271        # BLOCK 0
272        LDR     d3, [x13], 8              // a0
273        INS     v19.d[1], x19              // b from second group
274        FMLA    v20.4s, v16.4s,  v0.s[0]
275        LDR     x19, [x14], 8              // a1
276        FMLA    v22.4s, v16.4s,  v0.s[2]
277        FMLA    v24.4s, v16.4s,  v1.s[0]
278
279        # BLOCK 1
280        LDR     d12, [x5]
281        INS     v3.d[1], x19               // a1 ins
282        FMLA    v26.4s, v16.4s,  v1.s[2]
283        LDR     x19, [x5, 8]            // b
284        FMLA    v21.4s, v17.4s,  v0.s[0]
285        FMLA    v23.4s, v17.4s,  v0.s[2]
286
287        # BLOCK 2
288        LDR     d4, [x15], 8             // a2
289        INS     v12.d[1], x19           // b  ins
290        FMLA    v25.4s, v17.4s,  v1.s[0]
291        LDR     x19, [x8], 8             // a3
292        FMLA    v27.4s, v17.4s,  v1.s[2]
293        FMLA    v20.4s, v18.4s,  v0.s[1]
294
295        # BLOCK 3
296        LDR     d13, [x5, 16]
297        INS     v4.d[1], x19               // a3 ins
298        FMLA    v22.4s, v18.4s,  v0.s[3]
299        LDR     x19, [x5, 24]
300        FMLA    v24.4s, v18.4s,  v1.s[1]
301        FMLA    v26.4s, v18.4s,  v1.s[3]
302
303        # BLOCK 4
304        LDR     d14, [x5, 32]
305        INS     v13.d[1], x19           // b
306        FMLA    v21.4s, v19.4s,  v0.s[1]
307        LDR     x19, [x5, 40]
308        FMLA    v23.4s, v19.4s,  v0.s[3]
309        FMLA    v25.4s, v19.4s,  v1.s[1]
310
311        # BLOCK 5
312        # NOPs to ensure 4 cycle LDR lands on next LDR
313        LDR     d15, [x5, 48]
314        INS     v14.d[1], x19
315        FMLA    v27.4s, v19.4s,  v1.s[3]
316        LDR     x19, [x5, 56]
317        NOP     // fma
318        NOP
319        NOP     // fma
320        NOP
321
322        # Second group of 16 FMA, no loads
323        # BLOCK 0
324        INS     v15.d[1], x19           // b from previous
325        FMLA    v20.4s, v12.4s,  v3.s[0]
326        FMLA    v22.4s, v12.4s,  v3.s[2]
327        FMLA    v24.4s, v12.4s,  v4.s[0]
328
329        # BLOCK 1
330        FMLA    v26.4s, v12.4s,  v4.s[2]
331        FMLA    v21.4s, v13.4s,  v3.s[0]
332        FMLA    v23.4s, v13.4s,  v3.s[2]
333
334        # BLOCK 2
335        FMLA    v25.4s, v13.4s,  v4.s[0]
336        FMLA    v27.4s, v13.4s,  v4.s[2]
337        FMLA    v20.4s, v14.4s,  v3.s[1]
338
339        # BLOCK 3
340        FMLA    v22.4s, v14.4s,  v3.s[3]
341        FMLA    v24.4s, v14.4s,  v4.s[1]
342        FMLA    v26.4s, v14.4s,  v4.s[3]
343
344        # BLOCK 4
345        FMLA    v21.4s, v15.4s,  v3.s[1]
346        FMLA    v23.4s, v15.4s,  v3.s[3]
347        FMLA    v25.4s, v15.4s,  v4.s[1]
348        ADD     x5, x5, 64
349
350        # BLOCK 5
351        FMLA    v27.4s, v15.4s,  v4.s[3]
352
3534:
        // Reached by fall-through from the epilogue or directly from the
        // "kc < 16" test. Bit 3 of x0 set means 8 bytes of kc remain, bit 2
        // set means 4 bytes remain.
354        # Is there a remainder?- 2 floats of A (8 bytes)
355        TBNZ    x0, 3, 6f
356        # Is there a remainder?- 1 float of A (4 bytes)
357        TBNZ    x0, 2, 7f
3585:
359        # ks loop
360        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
361        B.HI    1b
362
        // v6 is the FMAX operand (lower bound) and v7 the FMIN operand (upper
        // bound), as loaded by the LD2R at function entry.
363        # Clamp
364        FMAX    v20.4s, v20.4s, v6.4s
365        FMAX    v21.4s, v21.4s, v6.4s
366        FMAX    v22.4s, v22.4s, v6.4s
367        FMAX    v23.4s, v23.4s, v6.4s
368        FMAX    v24.4s, v24.4s, v6.4s
369        FMAX    v25.4s, v25.4s, v6.4s
370        FMAX    v26.4s, v26.4s, v6.4s
371        FMAX    v27.4s, v27.4s, v6.4s
372        FMIN    v20.4s, v20.4s, v7.4s
373        FMIN    v21.4s, v21.4s, v7.4s
374        FMIN    v22.4s, v22.4s, v7.4s
375        FMIN    v23.4s, v23.4s, v7.4s
376        FMIN    v24.4s, v24.4s, v7.4s
377        FMIN    v25.4s, v25.4s, v7.4s
378        FMIN    v26.4s, v26.4s, v7.4s
379        FMIN    v27.4s, v27.4s, v7.4s
380
        // Fewer than 8 columns left goes to the partial store at label 8.
        // Rows are written from c3 down to c0, and each C pointer then
        // advances by cn_stride. None of the STP/ADD/SUB instructions below
        // modify flags, so the final B.HI still tests "SUBS x1, x1, 8":
        // it loops while columns remain.
381        # Store full 4 x 8
382        SUBS    x1, x1, 8
383        B.LO    8f
384
385        STP     q26, q27, [x7]
386        ADD     x7, x7, x10
387        STP     q24, q25, [x17]
388        ADD     x17, x17, x10
389        STP     q22, q23, [x16]
390        ADD     x16, x16, x10
391        STP     q20, q21,  [x6]
392        ADD     x6,  x6, x10
393
        // Rewinds the indirection pointer that the ks loop advanced, so the
        // next column group starts from the same A pointers.
394        SUB     x4, x4, x3              // a -= ks
395
396        # nc loop
397        B.HI    0b
398
399        # Restore x19, d12-d15 from stack
400        LDR     x19,      [sp, 32]
401        LDP     d14, d15, [sp, 16]
402        LDP     d12, d13, [sp], 48
403        RET
404
        // Same v0/v1 and v16-v19 packing as the prologue, for two k steps,
        // but loaded with plain LDP and without the x19 shadow register.
405        # Remainder - 2 floats of A (8 bytes)
406        # 16 FMA + 4 LD64 A + 2 LDP B
4076:
408        LDR     d0,  [x13], 8
409        LDP     q16,  q17, [x5], 32
410        LD1     {v0.d}[1], [x14], 8
411        LDR     d1, [x15], 8
412        LD1     {v1.d}[1], [x8], 8
413        LDP     q18,  q19, [x5], 32
414        FMLA    v20.4s, v16.4s,  v0.s[0]
415        FMLA    v22.4s, v16.4s,  v0.s[2]
416        FMLA    v24.4s, v16.4s,  v1.s[0]
417        FMLA    v26.4s, v16.4s,  v1.s[2]
418        FMLA    v21.4s, v17.4s,  v0.s[0]
419        FMLA    v23.4s, v17.4s,  v0.s[2]
420        FMLA    v25.4s, v17.4s,  v1.s[0]
421        FMLA    v27.4s, v17.4s,  v1.s[2]
422
423        FMLA    v20.4s, v18.4s,  v0.s[1]
424        FMLA    v22.4s, v18.4s,  v0.s[3]
425        FMLA    v24.4s, v18.4s,  v1.s[1]
426        FMLA    v26.4s, v18.4s,  v1.s[3]
427        FMLA    v21.4s, v19.4s,  v0.s[1]
428        FMLA    v23.4s, v19.4s,  v0.s[3]
429        FMLA    v25.4s, v19.4s,  v1.s[1]
430        FMLA    v27.4s, v19.4s,  v1.s[3]
431
432        # Is there a remainder?- 1 float of A (4 bytes)
433        TBZ     x0, 2, 5b
434
4357:
        // Single k step. The a1/a3 values are placed in lane 2 so the FMLAs
        // can use the same s[0]/s[2] lane indices as the wider paths.
436        # Remainder- 1 float of A (4 bytes)
437        LDR     s0,  [x13], 4
438        LDP     q16,  q17, [x5], 32
439        LD1     {v0.s}[2], [x14], 4
440        LDR     s1, [x15], 4
441        LD1     {v1.s}[2], [x8], 4
442
443        FMLA    v20.4s, v16.4s,  v0.s[0]
444        FMLA    v22.4s, v16.4s,  v0.s[2]
445        FMLA    v24.4s, v16.4s,  v1.s[0]
446        FMLA    v26.4s, v16.4s,  v1.s[2]
447        FMLA    v21.4s, v17.4s,  v0.s[0]
448        FMLA    v23.4s, v17.4s,  v0.s[2]
449        FMLA    v25.4s, v17.4s,  v1.s[0]
450        FMLA    v27.4s, v17.4s,  v1.s[2]
451        B       5b
452
        // x1 has just had 8 subtracted, which leaves its low three bits equal
        // to those of the remaining column count. Bits 2, 1 and 0 select a
        // 4-, 2- and 1-column store in turn; after each partial store the
        // not-yet-written columns are moved down into the low lanes.
453        # Store odd width
4548:
455        TBZ     x1, 2, 9f
456        STR     q26,  [x7], 16
457        MOV     v26.16b, v27.16b
458        STR     q24, [x17], 16
459        MOV     v24.16b, v25.16b
460        STR     q22, [x16], 16
461        MOV     v22.16b, v23.16b
462        STR     q20,  [x6], 16
463        MOV     v20.16b, v21.16b
4649:
465        TBZ     x1, 1, 10f
466        STR     d26,  [x7], 8
467        STR     d24, [x17], 8
468        DUP     d26, v26.d[1]
469        DUP     d24, v24.d[1]
470        STR     d22, [x16], 8
471        STR     d20,  [x6], 8
472        DUP     d22, v22.d[1]
473        DUP     d20, v20.d[1]
474
47510:
476        TBZ     x1, 0, 11f
477        STR     s26,  [x7]
478        STR     s24, [x17]
479        STR     s22, [x16]
480        STR     s20,  [x6]
48111:
482        # Restore x19, d12-d15 from stack
483        LDR     x19,      [sp, 32]
484        LDP     d14, d15, [sp, 16]
485        LDP     d12, d13, [sp], 48
486        RET
487
488END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53
489
490#ifdef __ELF__
491.section ".note.GNU-stack","",%progbits
492#endif
493