// xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Computes up to 4 rows by 8 columns of C per pass:
#   C[m][0..7] = min(max(bias[0..7] + sum_k A[m][k] * B[k][0..7], lo), hi)
# and repeats for successive 8-column panels until nc is exhausted.
#
# Units, as used by the code below:
#   kc, a_stride, cm_stride and cn_stride are byte counts (they are added to
#       or subtracted from byte pointers). Bits 0-1 of kc are never examined,
#       so kc is presumably a multiple of sizeof(float) - confirm with caller.
#   nc counts float columns (8 are retired per pass).
#   w is consumed as a stream per 8-column panel: 8 bias floats, then 8 B
#       floats for every k step.
#   params supplies two consecutive floats: lower bound, then upper bound.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Only d8-d15 are written here (v8-v15 hold B); they are spilled to a 64-byte
# stack frame. No register in x19-x30 is written.

# Scalar registers
# x0  mr on entry, then the k byte counter (mr is dead after pointer setup)
# x1  nc remaining
# x2  kc
# x5  w, post-incremented by every load
# x8  params
# x14 cn_stride

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0  v0  v4
# A1  v1  v5
# A2  v2  v6
# A3  v3  v7
# B   v8  v9 v10 v11
# B  v12 v13 v14 v15
# B  v16 v17 v18 v19
# B  v20 v21 v22 v23
# C  v24 v25
# C  v26 v27
# C  v28 v29
# C  v30 v31
# Clamp v4 v5

# Local labels
# label 0   start of an 8-column panel
# label 1   main loop, 8 k steps per iteration
# label 2   epilogue, last 8 k steps of the software-pipelined section
# label 3   remainder of 4 k steps
# label 4   remainder of 2 k steps
# label 5   remainder of 1 k step
# label 6   clamp and full-width store
# label 7-9 partial-width store of 4, 2 and 1 columns
# label 10  restore d8-d15 and return

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75

        # Load cn_stride, params pointer
        # (both are stack arguments; read them before sp is moved by the spill)
        LDP     x14, x8, [sp]

        # Load min/max values
        # v4 = lower bound, v5 = upper bound, each replicated to all 4 lanes
        LD2R    {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack
        STP     d8,  d9, [sp, -64]!
        STP     d10, d11, [sp, 16]
        STP     d12, d13, [sp, 32]
        STP     d14, d15, [sp, 48]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row, so their loads and
        # stores only touch memory that belongs to a valid row.
        CMP     x0, 2                   // if mr < 2
        ADD     x11, x3, x4             // a1 = a0 + a_stride
        ADD     x9, x6, x7              // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO        //   a1 = a0
        CSEL    x9, x6, x9, LO          //   c1 = c0

        # Flags below are still those of CMP x0, 2 (ADD does not set flags)
        ADD     x12, x11, x4            // a2 = a1 + a_stride
        ADD     x10, x9, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x12, x11, x12, LS       //   a2 = a1
        CSEL    x10, x9, x10, LS        //   c2 = c1

        # x4 and x7 stop being strides here and become the row 3 pointers
        CMP     x0, 4                   // if mr < 4
        ADD     x4, x12, x4             // a3 = a2 + a_stride
        ADD     x7, x10, x7             // c3 = c2 + cm_stride
        CSEL    x4, x12, x4, LO         //   a3 = a2
        CSEL    x7, x10, x7, LO         //   c3 = c2

0:
        # Load initial bias from w into accumulators
        # The same 8 bias floats seed all four rows.
        LDP     q24, q25, [x5], 32
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v25.16b
        MOV     v28.16b, v24.16b
        MOV     v29.16b, v25.16b
        MOV     v30.16b, v24.16b
        MOV     v31.16b, v25.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 (mr) is dead by now and is reused as the k byte counter.
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    3f

        # 16 prologue
        # Read first block of 4 A and B.
        LDR     q0,  [x3], 16
        LDP     q16, q17, [x5], 32
        LDR     q1, [x11], 16
        LDR     q2, [x12], 16
        LDR     q3,  [x4], 16
        LDP     q18, q19, [x5], 32
        LDP     q20, q21, [x5], 32
        LDP     q22, q23, [x5], 32

        # Is there at least 32.  yes do main loop
        SUBS    x0, x0, 32
        B.LO    2f

        # Main loop - 8 floats of A (32 bytes)
        # On entry to every iteration v0-v3 and v16-v23 already hold the
        # first block of 4 k steps. The second block of A lands in v4-v7,
        # which overwrites the clamp bounds held in v4/v5.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v24.4s, v16.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        LDR     q4, [x3], 16
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        LDR     q5, [x11], 16
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        LDR     q6, [x12], 16
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        LDR     q7, [x4], 16
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        PRFM    PLDL1KEEP, [x5, 128]    // prefetch w 128..320 bytes ahead
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        PRFM    PLDL1KEEP, [x5, 192]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        PRFM    PLDL1KEEP, [x5, 320]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA    v24.4s, v8.4s, v4.s[0]
        LDP     q16, q17, [x5], 32
        FMLA    v25.4s, v9.4s, v4.s[0]
        FMLA    v26.4s, v8.4s, v5.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v27.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v31.4s, v9.4s, v7.s[0]
        FMLA    v24.4s, v10.4s, v4.s[1]
        LDR     q0, [x3], 16
        FMLA    v25.4s, v11.4s, v4.s[1]
        FMLA    v26.4s, v10.4s, v5.s[1]
        LDR     q1, [x11], 16
        FMLA    v27.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        LDR     q2, [x12], 16
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        LDR     q3, [x4], 16
        FMLA    v31.4s, v11.4s, v7.s[1]
        FMLA    v24.4s, v12.4s, v4.s[2]
        FMLA    v25.4s, v13.4s, v4.s[2]
        FMLA    v26.4s, v12.4s, v5.s[2]
        FMLA    v27.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]
        FMLA    v24.4s, v14.4s, v4.s[3]
        FMLA    v25.4s, v15.4s, v4.s[3]
        FMLA    v26.4s, v14.4s, v5.s[3]
        FMLA    v27.4s, v15.4s, v5.s[3]
        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        SUBS    x0, x0, 32              // flags for B.HS; FMLA leaves them alone
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]
        B.HS    1b

2:
        # Epilogue
        # Same arithmetic as one main loop iteration, but it does not load a
        # following first block.
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v24.4s, v16.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        LDR     q4, [x3], 16
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        LDR     q5, [x11], 16
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        LDR     q6, [x12], 16
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        LDR     q7, [x4], 16
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, noloads
        FMLA    v24.4s, v8.4s, v4.s[0]
        FMLA    v25.4s, v9.4s, v4.s[0]
        FMLA    v26.4s, v8.4s, v5.s[0]
        FMLA    v27.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        FMLA    v31.4s, v9.4s, v7.s[0]

        FMLA    v24.4s, v10.4s, v4.s[1]
        FMLA    v25.4s, v11.4s, v4.s[1]
        FMLA    v26.4s, v10.4s, v5.s[1]
        FMLA    v27.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        FMLA    v31.4s, v11.4s, v7.s[1]

        FMLA    v24.4s, v12.4s, v4.s[2]
        FMLA    v25.4s, v13.4s, v4.s[2]
        FMLA    v26.4s, v12.4s, v5.s[2]
        FMLA    v27.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]

        FMLA    v24.4s, v14.4s, v4.s[3]
        FMLA    v25.4s, v15.4s, v4.s[3]
        FMLA    v26.4s, v14.4s, v5.s[3]
        FMLA    v27.4s, v15.4s, v5.s[3]

        # Load min/max values
        # v4/v5 served as A0/A1 above and the four FMLAs just before this
        # point were their last A use, so the clamp bounds are restored here.
        # The remaining FMLAs read only v6/v7.
        LD2R    {v4.4s, v5.4s}, [x8]

        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]

3:
        # Remainder- 4 floats of A (16 bytes)
        # x0 differs from kc by a multiple of 32 here, so bits 4, 3 and 2 of
        # x0 equal those of kc and select the 4, 2 and 1 float remainders.
        # None of the remainder blocks writes v4/v5.
        TBZ     x0, 4, 4f

        LDR     q0,  [x3], 16
        LDP     q16, q17, [x5], 32
        LDR     q1, [x11], 16
        LDR     q2, [x12], 16
        LDR     q3,  [x4], 16
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ     x0, 3, 5f

        LDR     d0,  [x3], 8
        LDP     q16, q17, [x5], 32
        LDR     d1, [x11], 8
        LDR     d2, [x12], 8
        LDR     d3,  [x4], 8
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        FMLA    v31.4s, v19.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ     x0, 2, 6f

        LDR     s0,  [x3], 4
        LDP     q16, q17, [x5], 32
        LDR     s1, [x11], 4
        LDR     s2, [x12], 4
        LDR     s3,  [x4], 4
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]

6:
        # Clamp
        # The SUBS below produces the flags read by both B.LO and B.HI
        # further down; no instruction in between writes the flags.
        FMAX    v24.4s, v24.4s, v4.4s
        SUBS    x1, x1, 8               // nc -= 8
        FMAX    v25.4s, v25.4s, v4.4s
        FMAX    v26.4s, v26.4s, v4.4s
        FMAX    v27.4s, v27.4s, v4.4s
        FMAX    v28.4s, v28.4s, v4.4s
        FMAX    v29.4s, v29.4s, v4.4s
        FMAX    v30.4s, v30.4s, v4.4s
        FMAX    v31.4s, v31.4s, v4.4s
        FMIN    v24.4s, v24.4s, v5.4s
        FMIN    v25.4s, v25.4s, v5.4s
        FMIN    v26.4s, v26.4s, v5.4s
        FMIN    v27.4s, v27.4s, v5.4s
        FMIN    v28.4s, v28.4s, v5.4s
        FMIN    v29.4s, v29.4s, v5.4s
        FMIN    v30.4s, v30.4s, v5.4s
        FMIN    v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        B.LO    7f                      // fewer than 8 columns were left

        # Each A pointer has advanced by exactly kc bytes, so it is rewound
        # for the next panel; each C pointer steps by cn_stride.
        STP     q24, q25,  [x6]
        SUB     x3,  x3, x2             // a0 -= kc
        ADD     x6,  x6, x14
        STP     q26, q27,  [x9]
        SUB     x11, x11, x2            // a1 -= kc
        ADD     x9,  x9, x14
        STP     q28, q29, [x10]
        SUB     x12, x12, x2            // a2 -= kc
        ADD     x10, x10, x14
        STP     q30, q31,  [x7]
        SUB     x4,  x4, x2             // a3 -= kc
        ADD     x7,  x7, x14

        B.HI    0b                      // columns remain after this panel

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 64
        RET

        # Store odd width
        # x1 is (columns left - 8) here; its low three bits equal those of
        # the column count and select stores of 4, 2 and 1 floats. After
        # each partial store the unstored lanes are moved down to lane 0.
7:
        TBZ     x1, 2, 8f
        STR     q24, [x6], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x9], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x10], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b

8:
        TBZ     x1, 1, 9f
        STR     d24, [x6], 8
        STR     d26, [x9], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x10], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s24,  [x6]
        STR     s26,  [x9]
        STR     s28, [x10]
        STR     s30,  [x7]
10:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable on account of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
