xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/1x12-minmax-aarch64-neonfma-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53(
9#     size_t mr,                         (x0) - unused.  mr = 1
10#     size_t nc,                         x1
11#     size_t kc,                         x2 / x0
12#     size_t ks,                         x3 / x9
13#     const float**restrict a,           x4
14#     const float*restrict w,            x5
15#     float*restrict c,                  x6
16#     size_t cm_stride,                  (x7) - unused
17#     size_t cn_stride,                  [sp] -> x10
18#     size_t a_offset,                   [sp + 8] -> x11
19#     const float* zero,                 [sp + 16] -> x12
20#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
21
22# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
23
24# A pointer
25# x8  a0
26
27# C pointer
28# x6  c0
29
30# Vector register usage and GPR shadows
31# a0  v0           first set of A
32# a0  v1           second set of A
33# B   v2  v3  v4   x14 x15 x16  first set of B
34# B   v5  v6  v7   x17 x13 x7
35# B  v23 v24 v25   x14 x15 x16  second set of B (same x as first set)
36# B  v17 v18 v19   x17 x13 x7
37# C  v20 v21 v22
38
39BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53
        # Overview (derived from the code below):
        #   label 0 (nc loop): v20-v22 = 12 floats read from w (initial accumulators)
        #   label 1 (ks loop): fetch one A pointer from a (8 bytes per pointer), then
        #                      v20-v22 += a0[k] * (12 floats of w) for every float in kc bytes
        #   after the ks loop: clamp v20-v22 with v30 (FMAX) and v31 (FMIN), store to c0
        # Only caller-saved registers (x0-x17, v0-v7, v16-v31) are written and the
        # stack is only read, so nothing is saved or restored.
40
41        # Load cn_stride (x10) and a_offset (x11) from the stack arguments
42        LDP     x10, x11, [sp]
43
44        # Load zero (x12) and params pointer (x8).  x8 is reused as a0 after the LD2R below.
45        LDP     x12, x8, [sp, 16]
46
47        # Load min/max values: first float at [x8] is broadcast to v30 (FMAX bound),
        # the second float to v31 (FMIN bound)
48        LD2R    {v30.4s, v31.4s}, [x8]
49
500:
51        # nc loop head: load initial bias (12 floats, 48 bytes) from w into accumulators, w += 48
52        LD1     {v20.16b, v21.16b, v22.16b}, [x5], 48
53
54        PRFM    PLDL1KEEP, [x5]         // prefetch w ahead, six addresses 64 bytes apart
55        PRFM    PLDL1KEEP, [x5, 64]
56        PRFM    PLDL1KEEP, [x5, 128]
57        PRFM    PLDL1KEEP, [x5, 192]
58        PRFM    PLDL1KEEP, [x5, 256]
59        PRFM    PLDL1KEEP, [x5, 320]
60
61        MOV     x9, x3                  // p = ks
62
631:
64        # ks loop head: load next A pointer from a, a += 8
65        LDR     x8, [x4], 8
66
67        CMP     x8, x12                 // if a0 == zero
68        ADD     x8, x8, x11             // a0 += a_offset
69        CSEL    x8, x12, x8, EQ         //   a0 = (a0 was zero) ? zero : a0 + a_offset
70
71        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
72        SUBS    x0, x2, 16              // k = kc - 16
73        B.LO    5f                      // fewer than 4 floats: remainder code only
74
75        # Prologue - loads for first group of 6 fma
76
77        # Read first 2 floats (8 bytes) of the single A row.
78        LDR     d0, [x8], 8             // a0
79
        # Each 16-byte B vector is fetched as two 8-byte loads: the low half goes straight
        # into the d register, the high half into a GPR that a later INS merges into d[1].
        # NOTE(review): presumably chosen to suit Cortex-A53 load/issue behaviour - confirm.
80        LDR     d2, [x5]                // vb0x0123
81        LDR     x14, [x5, 8]
82
83        LDR     d3, [x5, 16]            // vb0x4567
84        LDR     x15, [x5, 24]
85
86        LDR     d4, [x5, 32]            // vb0x89AB
87        LDR     x16, [x5, 40]
88
89        LDR     d5, [x5, 48]            // vb1x0123
90        LDR     x17, [x5, 56]
91
92        LDR     d6, [x5, 64]            // vb1x4567
93        LDR     x13, [x5, 72]
94
95        LDR     d7, [x5, 80]            // vb1x89AB
96        LDR     x7, [x5, 88]
97        INS     v2.d[1], x14            // complete vb0x0123; v3-v7 are completed in the loop/epilogue
98        ADD     x5, x5, 96              // w += 2 k steps x 12 floats
99
100        # Is there at least 4 floats (16 bytes) for main loop?
101        SUBS    x0, x0, 16              // 4 floats for main loop
102        B.LO    3f                      // no: go straight to the epilogue
103
104        # Main loop - 4 floats of A (16 bytes) and 192 bytes of w per iteration
1052:
106        # First group of 6 fma.
107        # A is loaded for 2nd group into v1
108
109        # BLOCK 0
110        LDR     d1, [x8], 8             // a0
111        INS     v3.d[1], x15
112        FMLA    v20.4s, v2.4s, v0.s[0]
113        PRFM    PLDL1KEEP, [x5, 192]
114
115        # BLOCK 1
116        INS     v4.d[1], x16
117        FMLA    v21.4s, v3.4s, v0.s[0]
118        PRFM    PLDL1KEEP, [x5, 256]
119
120        # BLOCK 2
121        LDR     d23, [x5]               // vb0x0123
122        INS     v5.d[1], x17
123        LDR     x14, [x5, 8]
124        PRFM    PLDL1KEEP, [x5, 320]
125        FMLA    v22.4s, v4.4s, v0.s[0]
126
127        # BLOCK 3
128        LDR     d24, [x5, 16]           // vb0x4567
129        INS     v6.d[1], x13
130        LDR     x15, [x5, 24]
131
132        # BLOCK 4
133        LDR     d25, [x5, 32]           // vb0x89AB
134        INS     v7.d[1], x7
135        FMLA    v20.4s, v5.4s, v0.s[1]
136        LDR     x16, [x5, 40]
137
138        # BLOCK 5
139        LDR     d17, [x5, 48]           // vb1x0123
140        LDR     x17, [x5, 56]
141        FMLA    v21.4s, v6.4s, v0.s[1]
142
143        # BLOCK 6
144        LDR     d18, [x5, 64]           // vb1x4567
145        LDR     x13, [x5, 72]
146        FMLA    v22.4s, v7.4s, v0.s[1]
147
148        # BLOCK 7
149        LDR     d19, [x5, 80]           // vb1x89AB
150        INS     v23.d[1], x14           // v23 was loaded in block 2
151        LDR     x7, [x5, 88]
152
153        # Second group of 6 fma.
154        # A is loaded for 1st group into v0
155
156        # BLOCK 0
157        LDR     d0, [x8], 8             // a0
158        INS     v24.d[1], x15
159        FMLA    v20.4s, v23.4s, v1.s[0]
160
161        # BLOCK 1
162        INS     v25.d[1], x16
163        FMLA    v21.4s, v24.4s, v1.s[0]
164
165        # BLOCK 2
166        LDR     d2, [x5, 96]            // vb0x0123
167        INS     v17.d[1], x17
168        LDR     x14, [x5, 104]
169        FMLA    v22.4s, v25.4s, v1.s[0]
170
171        # BLOCK 3
172        LDR     d3, [x5, 112]           // vb0x4567
173        INS     v18.d[1], x13
174        LDR     x15, [x5, 120]
175
176        # BLOCK 4
177        LDR     d4, [x5, 128]           // vb0x89AB
178        INS     v19.d[1], x7
179        FMLA    v20.4s, v17.4s, v1.s[1]
180        LDR     x16, [x5, 136]
181
182        # BLOCK 5
183        LDR     d5, [x5, 144]           // vb1x0123
184        LDR     x17, [x5, 152]
185        FMLA    v21.4s, v18.4s, v1.s[1]
186
187        # BLOCK 6
188        LDR     d6, [x5, 160]           // vb1x4567
189        LDR     x13, [x5, 168]
190        SUBS    x0, x0, 16              // k -= 16; these flags are consumed by B.HS below
191        FMLA    v22.4s, v19.4s, v1.s[1]
192
193        # BLOCK 7
194        LDR     d7, [x5, 176]           // vb1x89AB
195        INS     v2.d[1], x14
196        LDR     x7, [x5, 184]
197        ADD     x5, x5, 192             // w += 4 k steps x 12 floats (ADD leaves the flags intact)
198        B.HS    2b                      // loop while the SUBS above did not borrow
199
200        # Epilogue
201        # First block same as main loop.  Second block has no loads.
2023:
203        # BLOCK 0
204        LDR     d1, [x8], 8             // a0
205        INS     v3.d[1], x15
206        FMLA    v20.4s, v2.4s, v0.s[0]
207        PRFM    PLDL1KEEP, [x5, 192]
208
209        # BLOCK 1
210        INS     v4.d[1], x16
211        FMLA    v21.4s, v3.4s, v0.s[0]
212        PRFM    PLDL1KEEP, [x5, 256]
213
214        # BLOCK 2
215        LDR     d23, [x5]               // vb0x0123
216        INS     v5.d[1], x17
217        LDR     x14, [x5, 8]
218        PRFM    PLDL1KEEP, [x5, 320]
219        FMLA    v22.4s, v4.4s, v0.s[0]
220
221        # BLOCK 3
222        LDR     d24, [x5, 16]           // vb0x4567
223        INS     v6.d[1], x13
224        LDR     x15, [x5, 24]
225
226        # BLOCK 4
227        LDR     d25, [x5, 32]           // vb0x89AB
228        INS     v7.d[1], x7
229        FMLA    v20.4s, v5.4s, v0.s[1]
230        LDR     x16, [x5, 40]
231
232        # BLOCK 5
233        LDR     d17, [x5, 48]           // vb1x0123
234        LDR     x17, [x5, 56]
235        FMLA    v21.4s, v6.4s, v0.s[1]
236
237        # BLOCK 6
238        LDR     d18, [x5, 64]           // vb1x4567
239        LDR     x13, [x5, 72]
240        FMLA    v22.4s, v7.4s, v0.s[1]
241
242        # BLOCK 7
243        LDR     d19, [x5, 80]           // vb1x89AB
244        INS     v23.d[1], x14           // v23 was loaded in block 2
245        LDR     x7, [x5, 88]
246        ADD     x5, x5, 96              // w += 2 k steps x 12 floats (last loads of this pass)
247
248        # Second group of 6 fma.  8 blocks of 4 cycles.
249        # Epilogue version does no loads
250
251        # BLOCK 0
252        INS     v24.d[1], x15
253        FMLA    v20.4s, v23.4s, v1.s[0]
254
255        # BLOCK 1
256        INS     v25.d[1], x16
257        FMLA    v21.4s, v24.4s, v1.s[0]
258
259        # BLOCK 2
260        INS     v17.d[1], x17
261        FMLA    v22.4s, v25.4s, v1.s[0]
262
263        # BLOCK 3
264        INS     v18.d[1], x13
265
266        # BLOCK 4
267        INS     v19.d[1], x7
268        FMLA    v20.4s, v17.4s, v1.s[1]
269        TST     x0, 15                  // x0 = kc - 16*n, so its low 4 bits equal those of kc; flags used by B.NE below
270
271        # BLOCK 5
272        FMLA    v21.4s, v18.4s, v1.s[1]
273
274        # BLOCK 6
275        FMLA    v22.4s, v19.4s, v1.s[1]
276
277        # BLOCK 7
278        # Is there a remainder? - low 4 bits of k nonzero, handled at label 5 as 2 floats and/or 1 float of A
279        B.NE    5f
280
2814:
282        # ks loop: continue with the next A pointer while p has bytes left
283        SUBS    x9, x9, 8               // ks -= MR * sizeof(void*)
284        B.HI    1b
285
286        # Clamp: lower bound v30 first, then upper bound v31
287        FMAX    v20.4s, v20.4s, v30.4s
288        FMAX    v21.4s, v21.4s, v30.4s
289        FMAX    v22.4s, v22.4s, v30.4s
290        FMIN    v20.4s, v20.4s, v31.4s
291        FMIN    v21.4s, v21.4s, v31.4s
292        FMIN    v22.4s, v22.4s, v31.4s
293
294        # Store full 1 x 12
295        SUBS    x1, x1, 12              // nc -= 12; flags are used by B.LO here and by B.HI below
296        B.LO    7f                      // fewer than 12 floats left: partial store
297
298        ST1     {v20.16b, v21.16b, v22.16b}, [x6], x10   // store 12 floats, c0 += cn_stride
299        SUB     x4, x4, x3              // a -= ks
300
301        # nc loop (ST1 and SUB above do not modify the flags set by SUBS x1)
302        B.HI    0b
303        RET
304
3055:
306        # Is there a remainder? - 2 floats of A (8 bytes): bit 3 of k
        # NOTE(review): the bit tests assume kc is a multiple of 4 bytes - confirm against callers.
307        TBZ     x0, 3, 6f
308
309        # Remainder- 2 floats of A (8 bytes) against 2 k steps x 12 floats of w
310        LDR     d0, [x8], 8             // a0
311        LD1     {v2.16b, v3.16b, v4.16b}, [x5], 48
312        LD1     {v5.16b, v6.16b, v7.16b}, [x5], 48
313
314        # First block of 3 B
315        FMLA    v20.4s, v2.4s, v0.s[0]
316        FMLA    v21.4s, v3.4s, v0.s[0]
317        FMLA    v22.4s, v4.4s, v0.s[0]
318
319        # Second block of 3 B
320        FMLA    v20.4s, v5.4s, v0.s[1]
321        FMLA    v21.4s, v6.4s, v0.s[1]
322        FMLA    v22.4s, v7.4s, v0.s[1]
323
324        TBZ     x0, 2, 4b               // bit 2 of k clear: no single float left, back to ks loop
3256:
326        # Remainder - 1 float of A (4 bytes) against 12 floats of w
327        LDR     s0, [x8], 4             // a0
328        LD1     {v2.16b, v3.16b, v4.16b}, [x5], 48
329
330        FMLA    v20.4s, v2.4s, v0.s[0]
331        FMLA    v21.4s, v3.4s, v0.s[0]
332        FMLA    v22.4s, v4.4s, v0.s[0]
333        B       4b
334
3357:
336        ADD     x1, x1, 12              // undo the SUBS: x1 = floats left to store (less than 12)
337        # Store odd channels: test bits 3, 2, 1, 0 of x1 and store 8, 4, 2, 1 floats.
        # After each store the next unstored data is moved into the low part of v20.
        # x1 is below 12 here, so bits 3 and 2 are never both set.
338        TBZ     x1, 3, 8f
339        STP     q20, q21, [x6]          // 8 floats
340        ADD     x6, x6, 32
341        MOV     v20.16b, v22.16b        // remaining floats come from v22
342
3438:
344        TBZ     x1, 2, 9f
345        STR     q20, [x6], 16           // 4 floats
346        MOV     v20.16b, v21.16b        // remaining floats come from v21
347
3489:
349        TBZ     x1, 1, 10f
350        STR     d20, [x6], 8            // 2 floats
351        DUP     d20, v20.d[1]           // move the upper 2 floats down for the last store
352
35310:
354        TBZ     x1, 0, 11f
355        STR     s20, [x6]               // 1 float
35611:
357        RET
358
359END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53
360
361#ifdef __ELF__
362.section ".note.GNU-stack","",%progbits
363#endif
364