xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/4x2-minmax-aarch64-neonfma-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x2-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0 (x0 is reused as the remaining-k byte counter once mr has been consumed)
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> x14
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# Register usage
27# A0  x3 v0  v4
28# A1 x11 v1  v5
29# A2 x12 v2  v6
30# A3  x4 v3  v7
31
32# B   x5 v16 v17 v18 v19 v20 v21 v22 v23
33
34# C0  x6 v24 v25
35# C1  x9 v26 v27
36# C2 x10 v28 v29
37# C3  x7 v30 v31
38
39# Clamp v4 v5 (also used for A0/A1 data in the main loop and epilogue; reloaded from params before clamping)
40
41BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75
42
43        # Load the stack-passed arguments: cn_stride -> x14, params pointer -> x8
44        LDP     x14, x8, [sp]
45
46        # Load min/max values: first float at params replicated into v4, second into v5
47        LD2R    {v4.2s, v5.2s}, [x8]
48
49        # Clamp A and C pointers: a row index at or beyond mr reuses the previous row pointers
50        CMP     x0, 2                   // if mr < 2
51        ADD     x11, x3, x4             // a1 = a0 + a_stride
52        ADD     x9, x6, x7              // c1 = c0 + cm_stride
53        CSEL    x11, x3, x11, LO        //   a1 = a0
54        CSEL    x9, x6, x9, LO          //   c1 = c0
55
56        ADD     x12, x11, x4            // a2 = a1 + a_stride
57        ADD     x10, x9, x7             // c2 = c1 + cm_stride
58                                        // if mr <= 2 (flags still come from CMP x0, 2; ADD leaves them untouched)
59        CSEL    x12, x11, x12, LS       //   a2 = a1
60        CSEL    x10, x9, x10, LS        //   c2 = c1
61
62        CMP     x0, 4                   // if mr < 4
63        ADD     x4, x12, x4             // a3 = a2 + a_stride
64        ADD     x7, x10, x7             // c3 = c2 + cm_stride
65        CSEL    x4, x12, x4, LO         //   a3 = a2
66        CSEL    x7, x10, x7, LO         //   c3 = c2
67
680:                                      // Outer loop: one pass per 2 output columns (re-entered via B.HI 0b)
69        # Load initial bias from w into accumulators (even accumulators get the bias, odd ones start at zero)
70        LDR     d24, [x5], 8            // row 0 = two bias floats; w += 8
71        MOV     v26.8b, v24.8b          // row 1 = bias
72        MOV     v30.8b, v24.8b          // row 3 = bias
73        MOV     v28.8b, v24.8b          // row 2 = bias
74        MOVI    v25.2s, 0               // second accumulator of each row starts at 0;
75        MOVI    v27.2s, 0               //   it collects the s[1]/s[3] products and is
76        MOVI    v29.2s, 0               //   added to the first accumulator at label 3
77        MOVI    v31.2s, 0
78
79        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
80        SUBS    x0, x2, 32              // k = kc - 32
81        B.LO    4f                      // kc < 32 bytes: only the remainder code runs
82
83        # Prologue
84        # Read first block of 4 floats of each A row (q0-q3) and the matching 4x2 block of B (d20-d23).
85        LDR     q0,  [x3], 16
86        LDP     d20, d21, [x5], 16
87        LDR     q1, [x11], 16
88        LDR     q2, [x12], 16
89        LDR     q3,  [x4], 16
90        LDP     d22, d23, [x5], 16
91
92        # Are there at least 32 more bytes of k (kc >= 64)?  If so run the main loop, otherwise go to the epilogue.
93        SUBS    x0, x0, 32
94        B.LO    2f
95
96        # Main loop - 8 floats of A (32 bytes) per row per iteration
971:
98        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
99        FMLA    v24.2s, v20.2s, v0.s[0]
100        LDR     q4, [x3], 16            // overwrites v4 (clamp value); reloaded at the end of the epilogue
101        FMLA    v26.2s, v20.2s, v1.s[0]
102        FMLA    v28.2s, v20.2s, v2.s[0]
103        LDR     d16, [x5, 0]
104        FMLA    v30.2s, v20.2s, v3.s[0]
105        FMLA    v25.2s, v21.2s, v0.s[1]
106        LDR     q5, [x11], 16           // overwrites v5 (clamp value); reloaded at the end of the epilogue
107        FMLA    v27.2s, v21.2s, v1.s[1]
108        FMLA    v29.2s, v21.2s, v2.s[1]
109        LDR     q6, [x12], 16
110        FMLA    v31.2s, v21.2s, v3.s[1]
111        FMLA    v24.2s, v22.2s, v0.s[2]
112        LDR     q7, [x4], 16
113        FMLA    v26.2s, v22.2s, v1.s[2]
114        FMLA    v28.2s, v22.2s, v2.s[2]
115        LDR     d17, [x5, 8]
116        FMLA    v30.2s, v22.2s, v3.s[2]
117        FMLA    v25.2s, v23.2s, v0.s[3]
118        LDR     d18, [x5, 16]
119        FMLA    v27.2s, v23.2s, v1.s[3]
120        FMLA    v29.2s, v23.2s, v2.s[3]
121        LDR     d19, [x5, 24]
122        FMLA    v31.2s, v23.2s, v3.s[3]
123
124        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
125        FMLA    v24.2s, v16.2s, v4.s[0]
126        LDR     q0, [x3], 16
127        FMLA    v26.2s, v16.2s, v5.s[0]
128        FMLA    v28.2s, v16.2s, v6.s[0]
129        LDR     d20, [x5, 32]
130        FMLA    v30.2s, v16.2s, v7.s[0]
131        FMLA    v25.2s, v17.2s, v4.s[1]
132        LDR     q1, [x11], 16
133        FMLA    v27.2s, v17.2s, v5.s[1]
134        FMLA    v29.2s, v17.2s, v6.s[1]
135        LDR     q2, [x12], 16
136        FMLA    v31.2s, v17.2s, v7.s[1]
137        FMLA    v24.2s, v18.2s, v4.s[2]
138        LDR     q3, [x4], 16
139        FMLA    v26.2s, v18.2s, v5.s[2]
140        FMLA    v28.2s, v18.2s, v6.s[2]
141        LDR     d21, [x5, 40]
142        FMLA    v30.2s, v18.2s, v7.s[2]
143        SUBS    x0, x0, 32              // k -= 32; these flags are the ones tested by B.HS below
144        FMLA    v25.2s, v19.2s, v4.s[3]
145        LDR     d22, [x5, 48]
146        FMLA    v27.2s, v19.2s, v5.s[3]
147        LDR     d23, [x5, 56]
148        FMLA    v29.2s, v19.2s, v6.s[3]
149        ADD     x5, x5, 64              // w += 64 bytes (d16-d23 loaded via offsets 0..56 this iteration)
150        FMLA    v31.2s, v19.2s, v7.s[3]
151        B.HS    1b                      // repeat while the SUBS above did not borrow
152
1532:
154        # Epilogue - same as one main loop iteration, but without loading a further first block
155        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
156        FMLA    v24.2s, v20.2s, v0.s[0]
157        LDR     q4, [x3], 16            // overwrites v4 (clamp value); reloaded below
158        FMLA    v26.2s, v20.2s, v1.s[0]
159        FMLA    v28.2s, v20.2s, v2.s[0]
160        LDR     d16, [x5, 0]
161        FMLA    v30.2s, v20.2s, v3.s[0]
162        FMLA    v25.2s, v21.2s, v0.s[1]
163        LDR     q5, [x11], 16           // overwrites v5 (clamp value); reloaded below
164        FMLA    v27.2s, v21.2s, v1.s[1]
165        FMLA    v29.2s, v21.2s, v2.s[1]
166        LDR     q6, [x12], 16
167        FMLA    v31.2s, v21.2s, v3.s[1]
168        FMLA    v24.2s, v22.2s, v0.s[2]
169        LDR     q7, [x4], 16
170        FMLA    v26.2s, v22.2s, v1.s[2]
171        FMLA    v28.2s, v22.2s, v2.s[2]
172        LDR     d17, [x5, 8]
173        FMLA    v30.2s, v22.2s, v3.s[2]
174        FMLA    v25.2s, v23.2s, v0.s[3]
175        LDR     d18, [x5, 16]
176        FMLA    v27.2s, v23.2s, v1.s[3]
177        FMLA    v29.2s, v23.2s, v2.s[3]
178        LDR     d19, [x5, 24]
179        FMLA    v31.2s, v23.2s, v3.s[3]
180
181        # Second block of 4.  FMA for second 4, no loads
182        FMLA    v24.2s, v16.2s, v4.s[0]
183        FMLA    v26.2s, v16.2s, v5.s[0]
184        FMLA    v28.2s, v16.2s, v6.s[0]
185        FMLA    v30.2s, v16.2s, v7.s[0]
186        FMLA    v25.2s, v17.2s, v4.s[1]
187        FMLA    v27.2s, v17.2s, v5.s[1]
188        FMLA    v29.2s, v17.2s, v6.s[1]
189        FMLA    v31.2s, v17.2s, v7.s[1]
190        FMLA    v24.2s, v18.2s, v4.s[2]
191        FMLA    v26.2s, v18.2s, v5.s[2]
192        FMLA    v28.2s, v18.2s, v6.s[2]
193        ADDS    x0, x0, 32              // undo the last subtract: x0 = leftover k bytes; Z flag is tested by B.NE below
194        FMLA    v30.2s, v18.2s, v7.s[2]
195        FMLA    v25.2s, v19.2s, v4.s[3]
196        ADD     x5, x5, 32              // w += 32 bytes (d16-d19 loaded via offsets 0..24 above)
197        FMLA    v27.2s, v19.2s, v5.s[3]
198        FMLA    v29.2s, v19.2s, v6.s[3]
199        LD2R    {v4.2s, v5.2s}, [x8]     // Reload min/max values (v4/v5 held A data above; last reads of them precede this line)
200        FMLA    v31.2s, v19.2s, v7.s[3]
201
202        # Is there a remainder? up to 8 floats (32 bytes)
203        B.NE    4f
204
2053:                                     // Reduce, clamp and store one 4x2 tile
206        FADD    v24.2s, v24.2s, v25.2s  // merge the two partial accumulators of each row
207        FADD    v26.2s, v26.2s, v27.2s
208        FADD    v28.2s, v28.2s, v29.2s
209        FADD    v30.2s, v30.2s, v31.2s
210
211        # Clamp: FMAX against v4 applies the lower bound, FMIN against v5 the upper bound
212        FMAX    v24.2s, v24.2s, v4.2s
213        FMAX    v26.2s, v26.2s, v4.2s
214        FMAX    v28.2s, v28.2s, v4.2s
215        FMAX    v30.2s, v30.2s, v4.2s
216        SUBS    x1, x1, 2               // nc -= 2; these flags feed both B.LO 7f and B.HI 0b below
217        FMIN    v24.2s, v24.2s, v5.2s
218        FMIN    v26.2s, v26.2s, v5.2s
219        FMIN    v28.2s, v28.2s, v5.2s
220        FMIN    v30.2s, v30.2s, v5.2s
221
222        # Store full 4 x 2
223        B.LO    7f                      // fewer than 2 columns were left: take the odd-width store
224
225        STR     d24, [x6]
226        SUB     x3,  x3, x2             // a0 -= kc
227        ADD     x6,  x6, x14            // c0 += cn_stride
228        STR     d26, [x9]
229        SUB     x11, x11, x2            // a1 -= kc
230        ADD     x9,  x9, x14            // c1 += cn_stride
231        STR     d28, [x10]
232        SUB     x12, x12, x2            // a2 -= kc
233        ADD     x10, x10, x14           // c2 += cn_stride
234        STR     d30, [x7]
235        SUB     x4,  x4, x2             // a3 -= kc
236        ADD     x7,  x7, x14            // c3 += cn_stride
237
238        B.HI    0b                      // columns remain (nc was > 2 before the SUBS): next tile
239        RET
240
2414:
242        # Remainder- 4 floats of A (16 bytes).  Only bit 4 of x0 is tested, which is the same for kc - 32 and for the leftover count.
243        TBZ     x0, 4, 5f
244
245        LDR     q0,  [x3], 16
246        LDP     d20, d21, [x5], 16
247        LDR     q1, [x11], 16
248        LDR     q2, [x12], 16
249        LDR     q3,  [x4], 16
250        LDP     d22, d23, [x5], 16
251        FMLA    v24.2s, v20.2s, v0.s[0]
252        FMLA    v26.2s, v20.2s, v1.s[0]
253        FMLA    v28.2s, v20.2s, v2.s[0]
254        FMLA    v30.2s, v20.2s, v3.s[0]
255        FMLA    v25.2s, v21.2s, v0.s[1]
256        FMLA    v27.2s, v21.2s, v1.s[1]
257        FMLA    v29.2s, v21.2s, v2.s[1]
258        FMLA    v31.2s, v21.2s, v3.s[1]
259        FMLA    v24.2s, v22.2s, v0.s[2]
260        FMLA    v26.2s, v22.2s, v1.s[2]
261        FMLA    v28.2s, v22.2s, v2.s[2]
262        FMLA    v30.2s, v22.2s, v3.s[2]
263        FMLA    v25.2s, v23.2s, v0.s[3]
264        FMLA    v27.2s, v23.2s, v1.s[3]
265        FMLA    v29.2s, v23.2s, v2.s[3]
266        FMLA    v31.2s, v23.2s, v3.s[3]
267
2685:
269        # Remainder- 2 floats of A (8 bytes), taken when bit 3 of x0 is set
270        TBZ     x0, 3, 6f
271
272        LDR     d0,  [x3], 8
273        LDP     d20, d21, [x5], 16
274        LDR     d1, [x11], 8
275        LDR     d2, [x12], 8
276        LDR     d3,  [x4], 8
277        FMLA    v24.2s, v20.2s, v0.s[0]
278        FMLA    v26.2s, v20.2s, v1.s[0]
279        FMLA    v28.2s, v20.2s, v2.s[0]
280        FMLA    v30.2s, v20.2s, v3.s[0]
281        FMLA    v25.2s, v21.2s, v0.s[1]
282        FMLA    v27.2s, v21.2s, v1.s[1]
283        FMLA    v29.2s, v21.2s, v2.s[1]
284        FMLA    v31.2s, v21.2s, v3.s[1]
285
2866:
287        # Remainder- 1 float of A (4 bytes), taken when bit 2 of x0 is set; otherwise go straight to the reduce/clamp at label 3
288        TBZ     x0, 2, 3b
289
290        LDR     s0,  [x3], 4
291        LDR     d20, [x5], 8
292        LDR     s1, [x11], 4
293        LDR     s2, [x12], 4
294        LDR     s3,  [x4], 4
295        FMLA    v24.2s, v20.2s, v0.s[0]
296        FMLA    v26.2s, v20.2s, v1.s[0]
297        FMLA    v28.2s, v20.2s, v2.s[0]
298        FMLA    v30.2s, v20.2s, v3.s[0]
299        B       3b
300
301        # Store odd width: only the first float (lane 0) of each row is written, then the function returns
3027:
303        STR     s24,  [x6]
304        STR     s26,  [x9]
305        STR     s28, [x10]
306        STR     s30,  [x7]
30710:                                    // no branch in this function targets this label
308        RET
309
310
311END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75
312
313#ifdef __ELF__
314.section ".note.GNU-stack","",%progbits  // ELF targets only: emit an empty .note.GNU-stack section (by GNU toolchain convention this marks the object as not needing an executable stack)
315#endif
316