xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
# LINT.IfChange
# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8
#
# Computes a tile of up to 4 rows x 8 columns of C.  For every group of 8
# columns the accumulators are seeded from w, then for every group of 4 row
# pointers read from a they are updated with FMLA over kc bytes of each row,
# and finally clamped to [min, max] and stored through c0..c3.
# kc and ks are consumed as byte counts: 32 bytes of kc are 8 floats, and
# 32 bytes of ks are the 4 A pointers of one step.
# x0 holds mr only until the C pointers are clamped; afterwards it is reused
# as the running k counter.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0  v0  v4
# A1  v1  v5
# A2  v2  v6
# A3  v3  v7
# B   v8  v9 v10 v11
# B  v12 v13 v14 v15
# B  v16 v17 v18 v19
# B  v20 v21 v22 v23
# C  v24 v25
# C  v26 v27
# C  v28 v29
# C  v30 v31
# Clamp v4 v5
#
# v4/v5 are shared between the second A block and the clamp values, so the
# min/max pair is reloaded after the last A use of v4/v5 in the epilogue.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75

        # Load cn_stride, a_offset
        # (stack arguments are read before sp is moved below)
        LDP     x10, x11, [sp]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Load min/max values: v4 = min (used by FMAX), v5 = max (used by FMIN)
        LD2R    {v4.4s, v5.4s}, [x8]

        # Save x20 on stack
        # Allocates an 80 byte frame: x20 at [sp], d8-d15 at [sp + 16 .. sp + 79]
        STR     x20, [sp, -80]!

        # Save d8-d15 on stack
        STP     d8,  d9, [sp, 16]
        STP     d10, d11, [sp, 32]
        STP     d12, d13, [sp, 48]
        STP     d14, d15, [sp, 64]

        # Clamp C pointers
        # Rows at or beyond mr alias the previous row instead of advancing.
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

0:
        # nc loop entry: one pass per group of 8 output columns.
        # Load initial bias from w into accumulators
        # The same 8 bias floats seed all 4 rows.
        LDP     q24, q25, [x5], 32
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v25.16b
        MOV     v28.16b, v24.16b
        MOV     v29.16b, v25.16b
        MOV     v30.16b, v24.16b
        MOV     v31.16b, v25.16b

        MOV     x9, x3                  // p = ks

1:
        # ks loop entry: one pass per group of 4 A pointers.
        # Load next 4 A pointers
        LDP     x20, x13, [x4], 16
        LDP     x14, x15, [x4], 16

        # A pointer equal to zero is used unmodified; any other pointer gets
        # a_offset added.
        CMP     x20, x12                // if a0 == zero
        ADD     x20, x20, x11           // a0 += a_offset
        CSEL    x20, x12, x20, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x13, x12                // if a1 == zero
        ADD     x13, x13, x11           // a1 += a_offset
        CSEL    x13, x12, x13, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x14, x12                // if a2 == zero
        ADD     x14, x14, x11           // a2 += a_offset
        CSEL    x14, x12, x14, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x15, x12                // if a3 == zero
        ADD     x15, x15, x11           // a3 += a_offset
        CSEL    x15, x12, x15, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # Otherwise go straight to the remainder handling.
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    4f

        # Prologue
        # Read first block of 4 A and B: 16 bytes (4 floats) of each A row
        # and 4 k-steps x 8 columns of B.
        LDR     q0, [x20], 16
        LDP     q16, q17, [x5], 32
        LDR     q1, [x13], 16
        LDR     q2, [x14], 16
        LDR     q3, [x15], 16
        LDP     q18, q19, [x5], 32
        LDP     q20, q21, [x5], 32
        LDP     q22, q23, [x5], 32

        # Are there at least 32 more bytes (8 floats)?  Yes: run the main loop.
        # No: the preloaded block is finished by the epilogue.
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v24.4s, v16.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        LDR     q4, [x20], 16
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        LDR     q5, [x13], 16
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        LDR     q6, [x14], 16
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        LDR     q7, [x15], 16
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 128]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 192]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 320]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA    v24.4s, v8.4s, v4.s[0]
        LDP     q16, q17, [x5], 32
        FMLA    v25.4s, v9.4s, v4.s[0]
        FMLA    v26.4s, v8.4s, v5.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v27.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v31.4s, v9.4s, v7.s[0]
        FMLA    v24.4s, v10.4s, v4.s[1]
        LDR     q0, [x20], 16
        FMLA    v25.4s, v11.4s, v4.s[1]
        FMLA    v26.4s, v10.4s, v5.s[1]
        LDR     q1, [x13], 16
        FMLA    v27.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        LDR     q2, [x14], 16
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        LDR     q3, [x15], 16
        FMLA    v31.4s, v11.4s, v7.s[1]
        FMLA    v24.4s, v12.4s, v4.s[2]
        FMLA    v25.4s, v13.4s, v4.s[2]
        FMLA    v26.4s, v12.4s, v5.s[2]
        FMLA    v27.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]
        FMLA    v24.4s, v14.4s, v4.s[3]
        FMLA    v25.4s, v15.4s, v4.s[3]
        FMLA    v26.4s, v14.4s, v5.s[3]
        FMLA    v27.4s, v15.4s, v5.s[3]
        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        SUBS    x0, x0, 32              // k -= 32; flags consumed by B.HS below
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]

        B.HS    2b

3:
        # Epilogue
        # Same arithmetic as one main loop pass, but nothing is preloaded for
        # a following pass.
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v24.4s, v16.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        LDR     q4, [x20], 16
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        LDR     q5, [x13], 16
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        LDR     q6, [x14], 16
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        LDR     q7, [x15], 16
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA    v24.4s, v8.4s, v4.s[0]
        FMLA    v25.4s, v9.4s, v4.s[0]
        FMLA    v26.4s, v8.4s, v5.s[0]
        FMLA    v27.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        FMLA    v31.4s, v9.4s, v7.s[0]
        FMLA    v24.4s, v10.4s, v4.s[1]
        FMLA    v25.4s, v11.4s, v4.s[1]
        FMLA    v26.4s, v10.4s, v5.s[1]
        FMLA    v27.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        FMLA    v31.4s, v11.4s, v7.s[1]
        FMLA    v24.4s, v12.4s, v4.s[2]
        FMLA    v25.4s, v13.4s, v4.s[2]
        FMLA    v26.4s, v12.4s, v5.s[2]
        FMLA    v27.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]

        FMLA    v24.4s, v14.4s, v4.s[3]
        FMLA    v25.4s, v15.4s, v4.s[3]
        FMLA    v26.4s, v14.4s, v5.s[3]
        FMLA    v27.4s, v15.4s, v5.s[3]

        # Load min/max values
        # v4/v5 were last read as A operands just above, so they can hold the
        # clamp values again from here on.
        LD2R    {v4.4s, v5.4s}, [x8]

        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]

4:
        # Remainder - 4 floats of A
        # Only multiples of 32 were subtracted from k, so bits 4, 3 and 2 of
        # x0 still equal those of kc: 16, 8 and 4 leftover bytes.
        TBZ     x0, 4, 5f

        LDR     q0, [x20], 16
        LDP     q16, q17, [x5], 32
        LDR     q1, [x13], 16
        LDR     q2, [x14], 16
        LDR     q3, [x15], 16
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

5:
        # Remainder - 2 floats of A
        TBZ     x0, 3, 6f

        LDR     d0, [x20], 8
        LDP     q16, q17, [x5], 32
        LDR     d1, [x13], 8
        LDR     d2, [x14], 8
        LDR     d3, [x15], 8
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        FMLA    v31.4s, v19.4s, v3.s[1]

6:
        # Remainder - 1 float of A
        TBZ     x0, 2, 7f

        LDR     s0, [x20], 4
        LDP     q16, q17, [x5], 32
        LDR     s1, [x13], 4
        LDR     s2, [x14], 4
        LDR     s3, [x15], 4
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]

7:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        # v4 holds min (lower bound via FMAX), v5 holds max (upper bound via FMIN)
        FMAX    v24.4s, v24.4s, v4.4s
        FMAX    v25.4s, v25.4s, v4.4s
        FMAX    v26.4s, v26.4s, v4.4s
        FMAX    v27.4s, v27.4s, v4.4s
        FMAX    v28.4s, v28.4s, v4.4s
        FMAX    v29.4s, v29.4s, v4.4s
        FMAX    v30.4s, v30.4s, v4.4s
        FMAX    v31.4s, v31.4s, v4.4s
        FMIN    v24.4s, v24.4s, v5.4s
        FMIN    v25.4s, v25.4s, v5.4s
        FMIN    v26.4s, v26.4s, v5.4s
        FMIN    v27.4s, v27.4s, v5.4s
        FMIN    v28.4s, v28.4s, v5.4s
        FMIN    v29.4s, v29.4s, v5.4s
        FMIN    v30.4s, v30.4s, v5.4s
        FMIN    v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # Fewer than 8 columns left: take the partial store path instead.
        SUBS    x1, x1, 8
        B.LO    8f

        # Rows are written c3 first and c0 last.
        # NOTE(review): presumably so that, when row pointers alias for
        # mr < 4, the lowest row is the one stored last - confirm.
        STP     q30, q31,  [x7]
        ADD     x7,  x7, x10
        STP     q28, q29, [x17]
        ADD     x17, x17, x10
        STP     q26, q27, [x16]
        ADD     x16, x16, x10
        STP     q24, q25,  [x6]
        ADD     x6,  x6, x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Flags are still those of SUBS x1 above; the STP, ADD and SUB in
        # between do not set flags.
        B.HI    0b

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 64]
        LDP     d12, d13, [sp, 48]
        LDP     d10, d11, [sp, 32]
        LDP     d8,  d9, [sp, 16]

        # Restore x20 from stack
        LDR     x20, [sp], 80
        RET

        # Store odd width
        # Bits 2, 1 and 0 of x1 select stores of 4, 2 and 1 columns; after
        # each store the not yet written columns are moved to the low lanes.
8:
        TBZ     x1, 2, 9f
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x17], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x16], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x6], 16
        MOV     v24.16b, v25.16b

9:
        TBZ     x1, 1, 10f
        STR     d30, [x7], 8
        STR     d28, [x17], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x16], 8
        STR     d24, [x6], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s30,  [x7]
        STR     s28, [x17]
        STR     s26, [x16]
        STR     s24,  [x6]
11:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 64]
        LDP     d12, d13, [sp, 48]
        LDP     d10, d11, [sp, 32]
        LDP     d8,  d9, [sp, 16]

        # Restore x20 from stack
        LDR     x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
# LINT.ThenChange(4x8-aarch64-neonfma-cortex-a75.cc)
497
// Emit an empty .note.GNU-stack section when building for ELF, the usual
// way of telling the linker this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
501