xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/4x2-aarch64-neonfma-cortex-a75.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x2__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75(
9#     size_t mr,                x0
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0 (x0 is reused as the k byte counter once mr has been consumed)
12#     const uint8_t*restrict a, x3
13#     size_t a_stride,          x4
14#     const void*restrict w,    x5
15#     uint8_t*restrict c,       x6
16#     size_t cm_stride,         x7
17#     size_t cn_stride,         [sp] -> x14
18$if INC:
19  #     const float*restrict acc,  [sp + 8] -> x15
20  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
21$else:
22  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS. This kernel uses none of them (only x0-x12, x14, x15, v0-v7 and v16-v31).
25
26# Register usage
27# A0  x3 v0  v4
28# A1 x11 v1  v5
29# A2 x12 v2  v6
30# A3  x4 v3  v7
31
32# B   x5 v16 v17 v18 v19 v20 v21 v22 v23
33
34# C0  x6 v24 v25
35# C1  x9 v26 v27
36# C2 x10 v28 v29
37# C3  x7 v30 v31
38
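39# Clamp v4 (min) v5 (max). v4/v5 are also loaded with A data in the main loop and epilogue, so they are reloaded from params before clamping.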
40
41BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x2__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
        # Overview: each pass of the loop at label 0 produces one 4-row x 2-column
        # tile of C. Every row keeps two 2-float partial sums (v24/v25, v26/v27,
        # v28/v29, v30/v31) that are added together at label 3, clamped against
        # v4/v5 and stored. x0 holds mr on entry and is reused afterwards as the
        # byte counter for kc.
42
43        $if INC:
44          # Load cn_stride, acc
45          LDP     x14, x15, [sp]          // x14 = cn_stride, x15 = acc
46          # Load params pointer
47          LDR     x8, [sp, 16]            // x8 = params
48        $else:
49          # Load cn_stride, params pointer
50          LDP     x14, x8, [sp]           // x14 = cn_stride, x8 = params
51
52        # Load min/max values
53        LD2R    {v4.2s, v5.2s}, [x8]    // first float at x8 -> both lanes of v4, second -> both lanes of v5
54
55        # Clamp A and C pointers
56        CMP     x0, 2                   // if mr < 2
57        ADD     x11, x3, x4             // a1 = a0 + a_stride
58        ADD     x9, x6, x7              // c1 = c0 + cm_stride
59        CSEL    x11, x3, x11, LO        //   a1 = a0
60        CSEL    x9, x6, x9, LO          //   c1 = c0
61
62        ADD     x12, x11, x4            // a2 = a1 + a_stride
63        ADD     x10, x9, x7             // c2 = c1 + cm_stride
64                                        // if mr <= 2 (flags still come from CMP x0, 2 above)
65        CSEL    x12, x11, x12, LS       //   a2 = a1
66        CSEL    x10, x9, x10, LS        //   c2 = c1
67
68        CMP     x0, 4                   // if mr < 4
69        ADD     x4, x12, x4             // a3 = a2 + a_stride
70        ADD     x7, x10, x7             // c3 = c2 + cm_stride
71        CSEL    x4, x12, x4, LO         //   a3 = a2
72        CSEL    x7, x10, x7, LO         //   c3 = c2
73
740:                                      // outer loop over nc: one pass per 2 output columns
75        $if INC:
76          # Load initial accumulators
77          LDR     d24, [x15], 8           // row 0; acc pointer advances 8 bytes per row
78          LDR     d26, [x15], 8           // row 1
79          LDR     d28, [x15], 8           // row 2
80          LDR     d30, [x15], 8           // row 3
81        $else:
82          # Load initial bias from w into accumulators
83          LDR     d24, [x5], 8            // 2 bias floats, copied to the other 3 rows below
84          MOV     v26.8b, v24.8b
85          MOV     v30.8b, v24.8b
86          MOV     v28.8b, v24.8b
87        MOVI    v25.2s, 0               // clear the second partial sum of each row (v25/v27/v29/v31)
88        $if PREFETCH:
89          PRFM    PLDL1KEEP, [x5, 64]     // prefetch ahead of the w pointer
90        MOVI    v27.2s, 0
91        $if PREFETCH:
92          PRFM    PLDL1KEEP, [x5, 128]
93        MOVI    v29.2s, 0
94        $if PREFETCH:
95          PRFM    PLDL1KEEP, [x5, 192]
96        MOVI    v31.2s, 0
97        $if PREFETCH:
98          PRFM    PLDL1KEEP, [x5, 256]
99
100        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
101        SUBS    x0, x2, 32              // k = kc - 32
102        B.LO    4f                      // kc < 32 bytes: only the remainder code runs
103
104        # Prologue
105        # Read first block of 4 A and B.
106        LDR     q0,  [x3], 16           // 4 floats of A0
107        LDP     d20, d21, [x5], 16      // B vectors paired with A lanes 0 and 1
108        LDR     q1, [x11], 16           // 4 floats of A1
109        LDR     q2, [x12], 16           // 4 floats of A2
110        LDR     q3,  [x4], 16           // 4 floats of A3
111        LDP     d22, d23, [x5], 16      // B vectors paired with A lanes 2 and 3
112
113        # Are there at least 32 more bytes? If so, run the main loop
114        SUBS    x0, x0, 32              // x0 = kc - 64
115        B.LO    2f                      // fewer than 64 bytes in total: go straight to the epilogue
116
117        # Main loop - 8 floats of A (32 bytes)
1181:
119        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
120        FMLA    v24.2s, v20.2s, v0.s[0]
121        LDR     q4, [x3], 16            // A0, second block (overwrites the clamp value in v4)
122        FMLA    v26.2s, v20.2s, v1.s[0]
123        FMLA    v28.2s, v20.2s, v2.s[0]
124        LDR     d16, [x5, 0]            // B for second block; x5 is advanced once, at the end of the iteration
125        FMLA    v30.2s, v20.2s, v3.s[0]
126        FMLA    v25.2s, v21.2s, v0.s[1]
127        LDR     q5, [x11], 16           // A1, second block (overwrites the clamp value in v5)
128        FMLA    v27.2s, v21.2s, v1.s[1]
129        FMLA    v29.2s, v21.2s, v2.s[1]
130        LDR     q6, [x12], 16           // A2, second block
131        FMLA    v31.2s, v21.2s, v3.s[1]
132        FMLA    v24.2s, v22.2s, v0.s[2]
133        LDR     q7, [x4], 16            // A3, second block
134        FMLA    v26.2s, v22.2s, v1.s[2]
135        FMLA    v28.2s, v22.2s, v2.s[2]
136        LDR     d17, [x5, 8]
137        FMLA    v30.2s, v22.2s, v3.s[2]
138        FMLA    v25.2s, v23.2s, v0.s[3]
139        LDR     d18, [x5, 16]
140        FMLA    v27.2s, v23.2s, v1.s[3]
141        FMLA    v29.2s, v23.2s, v2.s[3]
142        LDR     d19, [x5, 24]
143        FMLA    v31.2s, v23.2s, v3.s[3]
144        $if PREFETCH:
145          PRFM    PLDL1KEEP, [x5, 320]
146
147        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
148        FMLA    v24.2s, v16.2s, v4.s[0]
149        LDR     q0, [x3], 16            // A0, first block of the next iteration (or of the epilogue)
150        FMLA    v26.2s, v16.2s, v5.s[0]
151        FMLA    v28.2s, v16.2s, v6.s[0]
152        LDR     d20, [x5, 32]           // B for the next first block
153        FMLA    v30.2s, v16.2s, v7.s[0]
154        FMLA    v25.2s, v17.2s, v4.s[1]
155        LDR     q1, [x11], 16
156        FMLA    v27.2s, v17.2s, v5.s[1]
157        FMLA    v29.2s, v17.2s, v6.s[1]
158        LDR     q2, [x12], 16
159        FMLA    v31.2s, v17.2s, v7.s[1]
160        FMLA    v24.2s, v18.2s, v4.s[2]
161        LDR     q3, [x4], 16
162        FMLA    v26.2s, v18.2s, v5.s[2]
163        FMLA    v28.2s, v18.2s, v6.s[2]
164        LDR     d21, [x5, 40]
165        FMLA    v30.2s, v18.2s, v7.s[2]
166        SUBS    x0, x0, 32              // k -= 32; these flags are consumed by B.HS below
167        FMLA    v25.2s, v19.2s, v4.s[3]
168        LDR     d22, [x5, 48]
169        FMLA    v27.2s, v19.2s, v5.s[3]
170        LDR     d23, [x5, 56]
171        FMLA    v29.2s, v19.2s, v6.s[3]
172        ADD     x5, x5, 64              // step w past the 8 B vectors (64 bytes) read at offsets 0..56
173        FMLA    v31.2s, v19.2s, v7.s[3]
174        B.HS    1b                      // no borrow from the SUBS: at least 32 more bytes, iterate again
175
1762:
177        # Epilogue
178        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
179        FMLA    v24.2s, v20.2s, v0.s[0]
180        LDR     q4, [x3], 16            // A0, second block (overwrites the clamp value in v4)
181        FMLA    v26.2s, v20.2s, v1.s[0]
182        FMLA    v28.2s, v20.2s, v2.s[0]
183        LDR     d16, [x5, 0]
184        FMLA    v30.2s, v20.2s, v3.s[0]
185        FMLA    v25.2s, v21.2s, v0.s[1]
186        LDR     q5, [x11], 16           // A1, second block (overwrites the clamp value in v5)
187        FMLA    v27.2s, v21.2s, v1.s[1]
188        FMLA    v29.2s, v21.2s, v2.s[1]
189        LDR     q6, [x12], 16
190        FMLA    v31.2s, v21.2s, v3.s[1]
191        FMLA    v24.2s, v22.2s, v0.s[2]
192        LDR     q7, [x4], 16
193        FMLA    v26.2s, v22.2s, v1.s[2]
194        FMLA    v28.2s, v22.2s, v2.s[2]
195        LDR     d17, [x5, 8]
196        FMLA    v30.2s, v22.2s, v3.s[2]
197        FMLA    v25.2s, v23.2s, v0.s[3]
198        LDR     d18, [x5, 16]
199        FMLA    v27.2s, v23.2s, v1.s[3]
200        FMLA    v29.2s, v23.2s, v2.s[3]
201        LDR     d19, [x5, 24]
202        FMLA    v31.2s, v23.2s, v3.s[3]
203        $if PREFETCH:
204          PRFM    PLDL1KEEP, [x5, 320]
205
206        # Second block of 4.  FMA for second 4, no loads
207        FMLA    v24.2s, v16.2s, v4.s[0]
208        FMLA    v26.2s, v16.2s, v5.s[0]
209        FMLA    v28.2s, v16.2s, v6.s[0]
210        FMLA    v30.2s, v16.2s, v7.s[0]
211        FMLA    v25.2s, v17.2s, v4.s[1]
212        FMLA    v27.2s, v17.2s, v5.s[1]
213        FMLA    v29.2s, v17.2s, v6.s[1]
214        FMLA    v31.2s, v17.2s, v7.s[1]
215        FMLA    v24.2s, v18.2s, v4.s[2]
216        FMLA    v26.2s, v18.2s, v5.s[2]
217        FMLA    v28.2s, v18.2s, v6.s[2]
218        ADDS    x0, x0, 32              // undo the last subtraction: x0 = bytes of k left; flags consumed by B.NE below
219        FMLA    v30.2s, v18.2s, v7.s[2]
220        FMLA    v25.2s, v19.2s, v4.s[3]
221        ADD     x5, x5, 32              // step w past d16-d19 (32 bytes)
222        FMLA    v27.2s, v19.2s, v5.s[3] // last read of v5 as A data, before the reload below
223        FMLA    v29.2s, v19.2s, v6.s[3]
224        LD2R    {v4.2s, v5.2s}, [x8]     // Load min/max values again (v4/v5 held A data above)
225        FMLA    v31.2s, v19.2s, v7.s[3]
226
227        # Is there a remainder? The code at label 4 handles at most 7 floats (28 bytes)
228        B.NE    4f
229
2303:                                     // reduce, clamp and store the 4x2 tile
231        FADD    v24.2s, v24.2s, v25.2s  // merge the two partial sums of each row
232        FADD    v26.2s, v26.2s, v27.2s
233        FADD    v28.2s, v28.2s, v29.2s
234        FADD    v30.2s, v30.2s, v31.2s
235
236        # Clamp
237        FMAX    v24.2s, v24.2s, v4.2s   // apply lower bound held in v4
238        FMAX    v26.2s, v26.2s, v4.2s
239        FMAX    v28.2s, v28.2s, v4.2s
240        FMAX    v30.2s, v30.2s, v4.2s
241        SUBS    x1, x1, 2               // nc -= 2; flags consumed by B.LO and B.HI below
242        FMIN    v24.2s, v24.2s, v5.2s   // apply upper bound held in v5
243        FMIN    v26.2s, v26.2s, v5.2s
244        FMIN    v28.2s, v28.2s, v5.2s
245        FMIN    v30.2s, v30.2s, v5.2s
246
247        # Store full 4 x 2
248        B.LO    7f                      // fewer than 2 columns were left: store a single column instead
249
250        $if INC:
251          STR     d30, [x7]               // this variant stores rows in the order 3, 2, 1, 0
252          SUB     x3,  x3, x2             // a0 -= kc
253          ADD     x7,  x7, x14            // c3 += cn_stride
254          STR     d28, [x10]
255          SUB     x11, x11, x2            // a1 -= kc
256          ADD     x10, x10, x14           // c2 += cn_stride
257          STR     d26, [x9]
258          SUB     x12, x12, x2            // a2 -= kc
259          ADD     x9,  x9, x14            // c1 += cn_stride
260          STR     d24, [x6]
261          SUB     x4,  x4, x2             // a3 -= kc
262          ADD     x6,  x6, x14            // c0 += cn_stride
263        $else:
264          STR     d24, [x6]
265          SUB     x3,  x3, x2             // a0 -= kc
266          ADD     x6,  x6, x14            // c0 += cn_stride
267          STR     d26, [x9]
268          SUB     x11, x11, x2            // a1 -= kc
269          ADD     x9,  x9, x14            // c1 += cn_stride
270          STR     d28, [x10]
271          SUB     x12, x12, x2            // a2 -= kc
272          ADD     x10, x10, x14           // c2 += cn_stride
273          STR     d30, [x7]
274          SUB     x4,  x4, x2             // a3 -= kc
275          ADD     x7,  x7, x14            // c3 += cn_stride
276
277        B.HI    0b                      // nc is still above zero: compute the next pair of columns
278        RET
279
2804:
        # Bits 4, 3 and 2 of x0 select the 16-, 8- and 4-byte pieces below. On both
        # entry paths x0 differs from kc only by a multiple of 32, so these bits
        # match the corresponding bits of kc.
281        # Remainder- 4 floats of A (16 bytes)
282        TBZ     x0, 4, 5f               // bit 4 clear: no 16-byte piece
283
284        LDR     q0,  [x3], 16
285        LDP     d20, d21, [x5], 16
286        LDR     q1, [x11], 16
287        LDR     q2, [x12], 16
288        LDR     q3,  [x4], 16
289        LDP     d22, d23, [x5], 16
290        FMLA    v24.2s, v20.2s, v0.s[0]
291        FMLA    v26.2s, v20.2s, v1.s[0]
292        FMLA    v28.2s, v20.2s, v2.s[0]
293        FMLA    v30.2s, v20.2s, v3.s[0]
294        FMLA    v25.2s, v21.2s, v0.s[1]
295        FMLA    v27.2s, v21.2s, v1.s[1]
296        FMLA    v29.2s, v21.2s, v2.s[1]
297        FMLA    v31.2s, v21.2s, v3.s[1]
298        FMLA    v24.2s, v22.2s, v0.s[2]
299        FMLA    v26.2s, v22.2s, v1.s[2]
300        FMLA    v28.2s, v22.2s, v2.s[2]
301        FMLA    v30.2s, v22.2s, v3.s[2]
302        FMLA    v25.2s, v23.2s, v0.s[3]
303        FMLA    v27.2s, v23.2s, v1.s[3]
304        FMLA    v29.2s, v23.2s, v2.s[3]
305        FMLA    v31.2s, v23.2s, v3.s[3]
306
3075:
308        # Remainder- 2 floats of A (8 bytes)
309        TBZ     x0, 3, 6f               // bit 3 clear: no 8-byte piece
310
311        LDR     d0,  [x3], 8
312        LDP     d20, d21, [x5], 16      // 2 B vectors, one per A float
313        LDR     d1, [x11], 8
314        LDR     d2, [x12], 8
315        LDR     d3,  [x4], 8
316        FMLA    v24.2s, v20.2s, v0.s[0]
317        FMLA    v26.2s, v20.2s, v1.s[0]
318        FMLA    v28.2s, v20.2s, v2.s[0]
319        FMLA    v30.2s, v20.2s, v3.s[0]
320        FMLA    v25.2s, v21.2s, v0.s[1]
321        FMLA    v27.2s, v21.2s, v1.s[1]
322        FMLA    v29.2s, v21.2s, v2.s[1]
323        FMLA    v31.2s, v21.2s, v3.s[1]
324
3256:
326        # Remainder- 1 float of A (4 bytes)
327        TBZ     x0, 2, 3b               // bit 2 clear: nothing left, go reduce and store
328
329        LDR     s0,  [x3], 4
330        LDR     d20, [x5], 8            // 1 B vector (2 floats)
331        LDR     s1, [x11], 4
332        LDR     s2, [x12], 4
333        LDR     s3,  [x4], 4
334        FMLA    v24.2s, v20.2s, v0.s[0]
335        FMLA    v26.2s, v20.2s, v1.s[0]
336        FMLA    v28.2s, v20.2s, v2.s[0]
337        FMLA    v30.2s, v20.2s, v3.s[0]
338        B       3b
339
340        # Store odd width
3417:                                     // reached when fewer than 2 columns were left: one float per row
342        $if INC:
343          STR     s30,  [x7]
344          STR     s28, [x10]
345          STR     s26,  [x9]
346          STR     s24,  [x6]
347        $else:
348          STR     s24,  [x6]
349          STR     s26,  [x9]
350          STR     s28, [x10]
351          STR     s30,  [x7]
35210:                                    // NOTE(review): no branch in this function targets label 10
353        RET
354
355
356END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x2__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
357
// On ELF targets only, emit an empty .note.GNU-stack section (no flags).
// By GNU toolchain convention this marks the object as not requiring an
// executable stack.
358#ifdef __ELF__
359.section ".note.GNU-stack","",%progbits
360#endif
361