xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75(
13#     size_t mr,                         x0
14#     size_t nc,                         x1
15#     size_t kc,                         x2 / x0
16#     size_t ks,                         x3 / x9
17#     const float**restrict a,           x4
18#     const float*restrict w,            x5
19#     float*restrict c,                  x6
20#     size_t cm_stride,                  x7
21#     size_t cn_stride,                  [sp] -> x10
22#     size_t a_offset,                   [sp + 8] -> x11
23#     const float* zero,                 [sp + 16] -> x12
24#     const xnn_f32_minmax_params params [sp + 24] -> x8
25
26# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
27
28# A pointers
29# x20 a0
30# x13 a1
31# x14 a2
32# x15 a3
33
34# C pointers
35# x6  c0
36# x16 c1
37# x17 c2
38# x7  c3 / cm_stride
39
40# Vector register usage
41# A0  v0  v4
42# A1  v1  v5
43# A2  v2  v6
44# A3  v3  v7
45# B   v8  v9 v10 v11
46# B  v12 v13 v14 v15
47# B  v16 v17 v18 v19
48# B  v20 v21 v22 v23
49# C  v24 v25
50# C  v26 v27
51# C  v28 v29
52# C  v30 v31
53# Clamp v4 v5 (v4 = min, v5 = max; these alias the A0/A1 second-block registers above)
54
55BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75
        // Overview: produces tiles of up to 4 rows x 8 columns of float output.
        // For each 8-column tile the accumulators v24-v31 start from 8 floats
        // read from w.  Then, for every group of 4 A pointers read from the
        // indirection buffer a, each A element is multiplied by 8 floats of w
        // and accumulated with FMLA.  The results are clamped with FMAX/FMIN
        // and stored through c0..c3.
        //
        // Local labels:
        //   0      nc loop (one 8-column tile per iteration)
        //   1      ks loop (4 A pointers per iteration)
        //   2      main loop, 8 floats of each A row per iteration
        //   3      epilogue, last 8 floats with no further loads
        //   4/5/6  remainders of 4 / 2 / 1 floats of A
        //   7      ks loop tail, clamp and full-width store
        //   8-10   partial-width stores of 4 / 2 / 1 columns
        //   11     restore registers and return
56
57        # Load cn_stride, a_offset
        // Stack arguments are read here, before sp is moved below.
58        LDP     x10, x11, [sp]
59
60        # Load zero, params pointer
61        LDP     x12, x8, [sp, 16]
62
63        # Load min/max values
        // v4 = first float at [x8] broadcast to all lanes, v5 = second float.
        // This copy is the one used by the clamp when kc < 32 bytes, because
        // that path never overwrites v4/v5.
64        LD2R    {v4.4s, v5.4s}, [x8]
65
66        # Save x20 on stack
        // Allocates an 80-byte frame: x20 at [sp], d8-d15 at [sp + 16 .. sp + 79].
67        STR     x20, [sp, -80]!
68
69        # Save d8-d15 on stack
70        STP     d8,  d9, [sp, 16]
71        STP     d10, d11, [sp, 32]
72        STP     d12, d13, [sp, 48]
73        STP     d14, d15, [sp, 64]
74
75        # Clamp C pointers
        // Rows at or beyond mr alias the previous row's pointer, so their
        // stores land on a row that is stored again afterwards.
76        CMP     x0, 2                   // if mr < 2
77        ADD     x16, x6, x7             // c1 = c0 + cm_stride
78        CSEL    x16, x6, x16, LO        //   c1 = c0
79
80        ADD     x17, x16, x7            // c2 = c1 + cm_stride
81                                        // if mr <= 2 (flags still from CMP x0, 2)
82        CSEL    x17, x16, x17, LS       //   c2 = c1
83
84        CMP     x0, 4                   // if mr < 4
85        ADD     x7, x17, x7             // c3 = c2 + cm_stride
86        CSEL    x7, x17, x7, LO         //   c3 = c2
87
880:
89        # Load initial bias from w into accumulators
        // All 4 rows start from the same 8 bias floats.
90        LDP     q24, q25, [x5], 32
91        MOV     v26.16b, v24.16b
92        MOV     v27.16b, v25.16b
93        MOV     v28.16b, v24.16b
94        MOV     v29.16b, v25.16b
95        MOV     v30.16b, v24.16b
96        MOV     v31.16b, v25.16b
97
98        MOV     x9, x3                  // p = ks
99
1001:
101        # Load next 4 A pointers
102        LDP     x20, x13, [x4], 16
103        LDP     x14, x15, [x4], 16
104
        // Each pointer is compared with zero before a_offset is added.  ADD
        // does not change the flags, so CSEL still sees the CMP result and
        // keeps the zero pointer without the offset.
105        CMP     x20, x12                // if a0 == zero
106        ADD     x20, x20, x11           // a0 += a_offset
107        CSEL    x20, x12, x20, EQ       //   a0 = zero, else a0 + a_offset
108        CMP     x13, x12                // if a1 == zero
109        ADD     x13, x13, x11           // a1 += a_offset
110        CSEL    x13, x12, x13, EQ       //   a1 = zero, else a1 + a_offset
111        CMP     x14, x12                // if a2 == zero
112        ADD     x14, x14, x11           // a2 += a_offset
113        CSEL    x14, x12, x14, EQ       //   a2 = zero, else a2 + a_offset
114        CMP     x15, x12                // if a3 == zero
115        ADD     x15, x15, x11           // a3 += a_offset
116        CSEL    x15, x12, x15, EQ       //   a3 = zero, else a3 + a_offset
117
118        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        // x0 held mr on entry; it is reused as the byte counter k from here on
        // (mr was only needed for the C pointer clamping above).
119        SUBS    x0, x2, 32              // k = kc - 32
120        B.LO    4f
121
122        # Prologue: preload 16 bytes (4 floats) of each A row
123        # Read first block of 4 A and B.
124        LDR     q0, [x20], 16
125        LDP     q16, q17, [x5], 32
126        LDR     q1, [x13], 16
127        LDR     q2, [x14], 16
128        LDR     q3, [x15], 16
129        LDP     q18, q19, [x5], 32
130        LDP     q20, q21, [x5], 32
131        LDP     q22, q23, [x5], 32
132
133        # Are at least 32 more bytes of kc left?  Yes: run the main loop.
134        SUBS    x0, x0, 32
135        B.LO    3f
136
137        # Main loop - 8 floats of A
        // Software pipelined: while the FMLAs consume one block of 4 floats,
        // the loads for the next block are interleaved between them.  The
        // second-block A loads overwrite v4/v5 (min/max); they are reloaded
        // from [x8] in the epilogue.
1382:
139        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
140        FMLA    v24.4s, v16.4s, v0.s[0]
141        LDP     q8, q9, [x5], 32
142        FMLA    v25.4s, v17.4s, v0.s[0]
143        FMLA    v26.4s, v16.4s, v1.s[0]
144        LDP     q10, q11, [x5], 32
145        FMLA    v27.4s, v17.4s, v1.s[0]
146        FMLA    v28.4s, v16.4s, v2.s[0]
147        LDP     q12, q13, [x5], 32
148        FMLA    v29.4s, v17.4s, v2.s[0]
149        FMLA    v30.4s, v16.4s, v3.s[0]
150        LDP     q14, q15, [x5], 32
151        FMLA    v31.4s, v17.4s, v3.s[0]
152        FMLA    v24.4s, v18.4s, v0.s[1]
153        LDR     q4, [x20], 16
154        FMLA    v25.4s, v19.4s, v0.s[1]
155        FMLA    v26.4s, v18.4s, v1.s[1]
156        LDR     q5, [x13], 16
157        FMLA    v27.4s, v19.4s, v1.s[1]
158        FMLA    v28.4s, v18.4s, v2.s[1]
159        LDR     q6, [x14], 16
160        FMLA    v29.4s, v19.4s, v2.s[1]
161        FMLA    v30.4s, v18.4s, v3.s[1]
162        LDR     q7, [x15], 16
163        FMLA    v31.4s, v19.4s, v3.s[1]
164        FMLA    v24.4s, v20.4s, v0.s[2]
        // Prefetch w at 128..320 bytes beyond the current read position.
165        PRFM    PLDL1KEEP, [x5, 128]
166        FMLA    v25.4s, v21.4s, v0.s[2]
167        FMLA    v26.4s, v20.4s, v1.s[2]
168        PRFM    PLDL1KEEP, [x5, 192]
169        FMLA    v27.4s, v21.4s, v1.s[2]
170        FMLA    v28.4s, v20.4s, v2.s[2]
171        PRFM    PLDL1KEEP, [x5, 256]
172        FMLA    v29.4s, v21.4s, v2.s[2]
173        FMLA    v30.4s, v20.4s, v3.s[2]
174        PRFM    PLDL1KEEP, [x5, 320]
175        FMLA    v31.4s, v21.4s, v3.s[2]
176        FMLA    v24.4s, v22.4s, v0.s[3]
177        FMLA    v25.4s, v23.4s, v0.s[3]
178        FMLA    v26.4s, v22.4s, v1.s[3]
179        FMLA    v27.4s, v23.4s, v1.s[3]
180        FMLA    v28.4s, v22.4s, v2.s[3]
181        FMLA    v29.4s, v23.4s, v2.s[3]
182        FMLA    v30.4s, v22.4s, v3.s[3]
183        FMLA    v31.4s, v23.4s, v3.s[3]
184
185        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
186        FMLA    v24.4s, v8.4s, v4.s[0]
187        LDP     q16, q17, [x5], 32
188        FMLA    v25.4s, v9.4s, v4.s[0]
189        FMLA    v26.4s, v8.4s, v5.s[0]
190        LDP     q18, q19, [x5], 32
191        FMLA    v27.4s, v9.4s, v5.s[0]
192        FMLA    v28.4s, v8.4s, v6.s[0]
193        LDP     q20, q21, [x5], 32
194        FMLA    v29.4s, v9.4s, v6.s[0]
195        FMLA    v30.4s, v8.4s, v7.s[0]
196        LDP     q22, q23, [x5], 32
197        FMLA    v31.4s, v9.4s, v7.s[0]
198        FMLA    v24.4s, v10.4s, v4.s[1]
199        LDR     q0, [x20], 16
200        FMLA    v25.4s, v11.4s, v4.s[1]
201        FMLA    v26.4s, v10.4s, v5.s[1]
202        LDR     q1, [x13], 16
203        FMLA    v27.4s, v11.4s, v5.s[1]
204        FMLA    v28.4s, v10.4s, v6.s[1]
205        LDR     q2, [x14], 16
206        FMLA    v29.4s, v11.4s, v6.s[1]
207        FMLA    v30.4s, v10.4s, v7.s[1]
208        LDR     q3, [x15], 16
209        FMLA    v31.4s, v11.4s, v7.s[1]
210        FMLA    v24.4s, v12.4s, v4.s[2]
211        FMLA    v25.4s, v13.4s, v4.s[2]
212        FMLA    v26.4s, v12.4s, v5.s[2]
213        FMLA    v27.4s, v13.4s, v5.s[2]
214        FMLA    v28.4s, v12.4s, v6.s[2]
215        FMLA    v29.4s, v13.4s, v6.s[2]
216        FMLA    v30.4s, v12.4s, v7.s[2]
217        FMLA    v31.4s, v13.4s, v7.s[2]
218        FMLA    v24.4s, v14.4s, v4.s[3]
219        FMLA    v25.4s, v15.4s, v4.s[3]
220        FMLA    v26.4s, v14.4s, v5.s[3]
221        FMLA    v27.4s, v15.4s, v5.s[3]
222        FMLA    v28.4s, v14.4s, v6.s[3]
223        FMLA    v29.4s, v15.4s, v6.s[3]
        // k -= 32 bytes; the FMLAs below do not touch the flags, so B.HS
        // still tests this subtraction.
224        SUBS    x0, x0, 32
225        FMLA    v30.4s, v14.4s, v7.s[3]
226        FMLA    v31.4s, v15.4s, v7.s[3]
227
228        B.HS    2b
229
2303:
231        # Epilogue
        // Consumes the block already held in v0-v3 / v16-v23, loads one more
        // block of 4, and consumes that too without preloading anything else.
232        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
233        FMLA    v24.4s, v16.4s, v0.s[0]
234        LDP     q8, q9, [x5], 32
235        FMLA    v25.4s, v17.4s, v0.s[0]
236        FMLA    v26.4s, v16.4s, v1.s[0]
237        LDP     q10, q11, [x5], 32
238        FMLA    v27.4s, v17.4s, v1.s[0]
239        FMLA    v28.4s, v16.4s, v2.s[0]
240        LDP     q12, q13, [x5], 32
241        FMLA    v29.4s, v17.4s, v2.s[0]
242        FMLA    v30.4s, v16.4s, v3.s[0]
243        LDP     q14, q15, [x5], 32
244        FMLA    v31.4s, v17.4s, v3.s[0]
245        FMLA    v24.4s, v18.4s, v0.s[1]
246        LDR     q4, [x20], 16
247        FMLA    v25.4s, v19.4s, v0.s[1]
248        FMLA    v26.4s, v18.4s, v1.s[1]
249        LDR     q5, [x13], 16
250        FMLA    v27.4s, v19.4s, v1.s[1]
251        FMLA    v28.4s, v18.4s, v2.s[1]
252        LDR     q6, [x14], 16
253        FMLA    v29.4s, v19.4s, v2.s[1]
254        FMLA    v30.4s, v18.4s, v3.s[1]
255        LDR     q7, [x15], 16
256        FMLA    v31.4s, v19.4s, v3.s[1]
257        FMLA    v24.4s, v20.4s, v0.s[2]
258        FMLA    v25.4s, v21.4s, v0.s[2]
259        FMLA    v26.4s, v20.4s, v1.s[2]
260        FMLA    v27.4s, v21.4s, v1.s[2]
261        FMLA    v28.4s, v20.4s, v2.s[2]
262        FMLA    v29.4s, v21.4s, v2.s[2]
263        FMLA    v30.4s, v20.4s, v3.s[2]
264        FMLA    v31.4s, v21.4s, v3.s[2]
265        FMLA    v24.4s, v22.4s, v0.s[3]
266        FMLA    v25.4s, v23.4s, v0.s[3]
267        FMLA    v26.4s, v22.4s, v1.s[3]
268        FMLA    v27.4s, v23.4s, v1.s[3]
269        FMLA    v28.4s, v22.4s, v2.s[3]
270        FMLA    v29.4s, v23.4s, v2.s[3]
271        FMLA    v30.4s, v22.4s, v3.s[3]
272        FMLA    v31.4s, v23.4s, v3.s[3]
273
274        # Second block of 4.  FMA for second 4, noloads
275        FMLA    v24.4s, v8.4s, v4.s[0]
276        FMLA    v25.4s, v9.4s, v4.s[0]
277        FMLA    v26.4s, v8.4s, v5.s[0]
278        FMLA    v27.4s, v9.4s, v5.s[0]
279        FMLA    v28.4s, v8.4s, v6.s[0]
280        FMLA    v29.4s, v9.4s, v6.s[0]
281        FMLA    v30.4s, v8.4s, v7.s[0]
282        FMLA    v31.4s, v9.4s, v7.s[0]
283        FMLA    v24.4s, v10.4s, v4.s[1]
284        FMLA    v25.4s, v11.4s, v4.s[1]
285        FMLA    v26.4s, v10.4s, v5.s[1]
286        FMLA    v27.4s, v11.4s, v5.s[1]
287        FMLA    v28.4s, v10.4s, v6.s[1]
288        FMLA    v29.4s, v11.4s, v6.s[1]
289        FMLA    v30.4s, v10.4s, v7.s[1]
290        FMLA    v31.4s, v11.4s, v7.s[1]
291        FMLA    v24.4s, v12.4s, v4.s[2]
292        FMLA    v25.4s, v13.4s, v4.s[2]
293        FMLA    v26.4s, v12.4s, v5.s[2]
294        FMLA    v27.4s, v13.4s, v5.s[2]
295        FMLA    v28.4s, v12.4s, v6.s[2]
296        FMLA    v29.4s, v13.4s, v6.s[2]
297        FMLA    v30.4s, v12.4s, v7.s[2]
298        FMLA    v31.4s, v13.4s, v7.s[2]
299
300        FMLA    v24.4s, v14.4s, v4.s[3]
301        FMLA    v25.4s, v15.4s, v4.s[3]
302        FMLA    v26.4s, v14.4s, v5.s[3]
303        FMLA    v27.4s, v15.4s, v5.s[3]
304
305        # Load min/max values
        // The four FMLAs above were the last readers of A data in v4/v5, so
        // the registers can take min/max again; the remaining FMLAs only read
        // v6/v7.
306        LD2R    {v4.4s, v5.4s}, [x8]
307
308        FMLA    v28.4s, v14.4s, v6.s[3]
309        FMLA    v29.4s, v15.4s, v6.s[3]
310        FMLA    v30.4s, v14.4s, v7.s[3]
311        FMLA    v31.4s, v15.4s, v7.s[3]
312
3134:
314        # Remainder- 4 floats of A
        // x0 is kc minus a multiple of 32 here, so bits 0-4 of x0 equal those
        // of kc.  Bit 4 set means 16 more bytes (4 floats) per row remain.
315        TBZ     x0, 4, 5f
316
317        LDR     q0, [x20], 16
318        LDP     q16, q17, [x5], 32
319        LDR     q1, [x13], 16
320        LDR     q2, [x14], 16
321        LDR     q3, [x15], 16
322        FMLA    v24.4s, v16.4s, v0.s[0]
323        FMLA    v25.4s, v17.4s, v0.s[0]
324        LDP     q18, q19, [x5], 32
325        FMLA    v26.4s, v16.4s, v1.s[0]
326        FMLA    v27.4s, v17.4s, v1.s[0]
327        LDP     q20, q21, [x5], 32
328        FMLA    v28.4s, v16.4s, v2.s[0]
329        FMLA    v29.4s, v17.4s, v2.s[0]
330        LDP     q22, q23, [x5], 32
331        FMLA    v30.4s, v16.4s, v3.s[0]
332        FMLA    v31.4s, v17.4s, v3.s[0]
333        FMLA    v24.4s, v18.4s, v0.s[1]
334        FMLA    v25.4s, v19.4s, v0.s[1]
335        FMLA    v26.4s, v18.4s, v1.s[1]
336        FMLA    v27.4s, v19.4s, v1.s[1]
337        FMLA    v28.4s, v18.4s, v2.s[1]
338        FMLA    v29.4s, v19.4s, v2.s[1]
339        FMLA    v30.4s, v18.4s, v3.s[1]
340        FMLA    v31.4s, v19.4s, v3.s[1]
341        FMLA    v24.4s, v20.4s, v0.s[2]
342        FMLA    v25.4s, v21.4s, v0.s[2]
343        FMLA    v26.4s, v20.4s, v1.s[2]
344        FMLA    v27.4s, v21.4s, v1.s[2]
345        FMLA    v28.4s, v20.4s, v2.s[2]
346        FMLA    v29.4s, v21.4s, v2.s[2]
347        FMLA    v30.4s, v20.4s, v3.s[2]
348        FMLA    v31.4s, v21.4s, v3.s[2]
349        FMLA    v24.4s, v22.4s, v0.s[3]
350        FMLA    v25.4s, v23.4s, v0.s[3]
351        FMLA    v26.4s, v22.4s, v1.s[3]
352        FMLA    v27.4s, v23.4s, v1.s[3]
353        FMLA    v28.4s, v22.4s, v2.s[3]
354        FMLA    v29.4s, v23.4s, v2.s[3]
355        FMLA    v30.4s, v22.4s, v3.s[3]
356        FMLA    v31.4s, v23.4s, v3.s[3]
357
3585:
359        # Remainder- 2 floats of A
        // Bit 3 of x0 (same as bit 3 of kc): 8 more bytes (2 floats) per row.
360        TBZ     x0, 3, 6f
361
362        LDR     d0, [x20], 8
363        LDP     q16, q17, [x5], 32
364        LDR     d1, [x13], 8
365        LDR     d2, [x14], 8
366        LDR     d3, [x15], 8
367        FMLA    v24.4s, v16.4s, v0.s[0]
368        FMLA    v25.4s, v17.4s, v0.s[0]
369        LDP     q18, q19, [x5], 32
370        FMLA    v26.4s, v16.4s, v1.s[0]
371        FMLA    v27.4s, v17.4s, v1.s[0]
372        FMLA    v28.4s, v16.4s, v2.s[0]
373        FMLA    v29.4s, v17.4s, v2.s[0]
374        FMLA    v30.4s, v16.4s, v3.s[0]
375        FMLA    v31.4s, v17.4s, v3.s[0]
376        FMLA    v24.4s, v18.4s, v0.s[1]
377        FMLA    v25.4s, v19.4s, v0.s[1]
378        FMLA    v26.4s, v18.4s, v1.s[1]
379        FMLA    v27.4s, v19.4s, v1.s[1]
380        FMLA    v28.4s, v18.4s, v2.s[1]
381        FMLA    v29.4s, v19.4s, v2.s[1]
382        FMLA    v30.4s, v18.4s, v3.s[1]
383        FMLA    v31.4s, v19.4s, v3.s[1]
384
3856:
386        # Remainder- 1 float of A
        // Bit 2 of x0 (same as bit 2 of kc): 4 more bytes (1 float) per row.
387        TBZ     x0, 2, 7f
388
389        LDR     s0, [x20], 4
390        LDP     q16, q17, [x5], 32
391        LDR     s1, [x13], 4
392        LDR     s2, [x14], 4
393        LDR     s3, [x15], 4
394        FMLA    v24.4s, v16.4s, v0.s[0]
395        FMLA    v25.4s, v17.4s, v0.s[0]
396        FMLA    v26.4s, v16.4s, v1.s[0]
397        FMLA    v27.4s, v17.4s, v1.s[0]
398        FMLA    v28.4s, v16.4s, v2.s[0]
399        FMLA    v29.4s, v17.4s, v2.s[0]
400        FMLA    v30.4s, v16.4s, v3.s[0]
401        FMLA    v31.4s, v17.4s, v3.s[0]
402
4037:
404        # ks loop
        // 32 bytes = the 4 pointers consumed at label 1; repeat while p > 0.
405        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
406        B.HI    1b
407
408        # Clamp
        // FMAX against v4 applies the lower bound, FMIN against v5 the upper.
409        FMAX    v24.4s, v24.4s, v4.4s
410        FMAX    v25.4s, v25.4s, v4.4s
411        FMAX    v26.4s, v26.4s, v4.4s
412        FMAX    v27.4s, v27.4s, v4.4s
413        FMAX    v28.4s, v28.4s, v4.4s
414        FMAX    v29.4s, v29.4s, v4.4s
415        FMAX    v30.4s, v30.4s, v4.4s
416        FMAX    v31.4s, v31.4s, v4.4s
417        FMIN    v24.4s, v24.4s, v5.4s
418        FMIN    v25.4s, v25.4s, v5.4s
419        FMIN    v26.4s, v26.4s, v5.4s
420        FMIN    v27.4s, v27.4s, v5.4s
421        FMIN    v28.4s, v28.4s, v5.4s
422        FMIN    v29.4s, v29.4s, v5.4s
423        FMIN    v30.4s, v30.4s, v5.4s
424        FMIN    v31.4s, v31.4s, v5.4s
425
426        # Store full 4 x 8
        // nc -= 8; fewer than 8 columns left takes the partial store at 8.
427        SUBS    x1, x1, 8
428        B.LO    8f
429
        // Rows are stored from c3 down to c0, and each C pointer then
        // advances by cn_stride.
430        STP     q30, q31,  [x7]
431        ADD     x7,  x7, x10
432        STP     q28, q29, [x17]
433        ADD     x17, x17, x10
434        STP     q26, q27, [x16]
435        ADD     x16, x16, x10
436        STP     q24, q25,  [x6]
437        ADD     x6,  x6, x10
438
        // Rewind the indirection pointer so the next column tile walks the
        // same A pointers again.
439        SUB     x4, x4, x3              // a -= ks
440
441        # nc loop
        // Flags are still those of SUBS x1, x1, 8 (STP/ADD/SUB leave them
        // unchanged): loop while columns remain.
442        B.HI    0b
443
444        # Restore d8-d15 from stack
445        LDP     d14, d15, [sp, 64]
446        LDP     d12, d13, [sp, 48]
447        LDP     d10, d11, [sp, 32]
448        LDP     d8,  d9, [sp, 16]
449
450        # Restore x20 from stack
451        LDR     x20, [sp], 80
452        RET
453
454        # Store odd width
        // x1 is (remaining nc - 8) here; its low 3 bits equal those of the
        // remaining nc, so bits 2/1/0 select stores of 4/2/1 columns.  After
        // each partial store the not-yet-stored lanes are moved down to the
        // bottom of the register.
4558:
456        TBZ     x1, 2, 9f
457        STR     q30, [x7], 16
458        MOV     v30.16b, v31.16b
459        STR     q28, [x17], 16
460        MOV     v28.16b, v29.16b
461        STR     q26, [x16], 16
462        MOV     v26.16b, v27.16b
463        STR     q24, [x6], 16
464        MOV     v24.16b, v25.16b
465
4669:
467        TBZ     x1, 1, 10f
468        STR     d30, [x7], 8
469        STR     d28, [x17], 8
470        DUP     d30, v30.d[1]
471        DUP     d28, v28.d[1]
472        STR     d26, [x16], 8
473        STR     d24, [x6], 8
474        DUP     d26, v26.d[1]
475        DUP     d24, v24.d[1]
476
47710:
478        TBZ     x1, 0, 11f
479        STR     s30,  [x7]
480        STR     s28, [x17]
481        STR     s26, [x16]
482        STR     s24,  [x6]
48311:
484        # Restore d8-d15 from stack
485        LDP     d14, d15, [sp, 64]
486        LDP     d12, d13, [sp, 48]
487        LDP     d10, d11, [sp, 32]
488        LDP     d8,  d9, [sp, 16]
489
490        # Restore x20 from stack
491        LDR     x20, [sp], 80
492        RET
493
494END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75
495
// On ELF targets, emit an empty .note.GNU-stack section (no flags).  This is
// the GNU toolchain convention for declaring that the object does not need an
// executable stack.
496#ifdef __ELF__
497.section ".note.GNU-stack","",%progbits
498#endif
499