xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75(
13#     size_t mr,                         x0
14#     size_t nc,                         x1
15#     size_t kc,                         x2 / x0
16#     size_t ks,                         x3 / x9
17#     const float**restrict a,           x4
18#     const float*restrict w,            x5
19#     float*restrict c,                  x6
20#     size_t cm_stride,                  x7
21#     size_t cn_stride,                  [sp] -> x10
22#     size_t a_offset,                   [sp + 8] -> x11
23#     const float* zero,                 [sp + 16] -> x12
24#     const xnn_f32_minmax_params params [sp + 24] -> x8
25
26# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
27
28# A pointers
29# x20 a0
30# x13 a1
31# x14 a2
32# x15 a3
33
34# C pointers
35# x6  c0
36# x16 c1
37# x17 c2
38# x7  c3 / cm_stride
39
40# Vector register usage
41# A0  v0  v4
42# A1  v1  v5
43# A2  v2  v6
44# A3  v3  v7
45# B   v8  v9 v10 v11
46# B  v12 v13 v14 v15
47# B  v16 v17 v18 v19
48# B  v20 v21 v22 v23
49# C  v24 v25
50# C  v26 v27
51# C  v28 v29
52# C  v30 v31
53# Clamp v4 (min) v5 (max) - shared with A0/A1 above, reloaded from params in the epilogue
54
55BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75
        // Overview:
        //   mr, nc, kc, ks, a, w, c, cm_stride arrive in x0-x7; cn_stride, a_offset,
        //   zero and params are read from the caller's stack before sp is moved.
        //   Each pass of the nc loop (label 0) produces one 4x8 tile: v24-v31 start
        //   from 8 floats read at w, accumulate FMLA products of A lanes and B vectors
        //   over the ks loop (label 1), are clamped with v4/v5 and stored via c0-c3.
        //   x0 (mr) is only needed for the C-pointer clamp and is reused as k afterwards.
        //   x20 and d8-d15 are the callee-saved registers used; they are spilled to an
        //   80-byte frame and restored on both RET paths.
56
57        # Load cn_stride, a_offset
58        LDP     x10, x11, [sp]          // x10 = cn_stride, x11 = a_offset
59
60        # Load zero, params pointer
61        LDP     x12, x8, [sp, 16]       // x12 = zero, x8 = params
62
63        # Load min/max values
64        LD2R    {v4.4s, v5.4s}, [x8]    // broadcast the two floats at x8: v4 = first, v5 = second
65
66        # Save x20 on stack
67        STR     x20, [sp, -80]!         // sp -= 80, then x20 stored at the new [sp]
68
69        # Save d8-d15 on stack
70        STP     d8,  d9, [sp, 16]       // d8-d15 fill [sp+16, sp+80)
71        STP     d10, d11, [sp, 32]
72        STP     d12, d13, [sp, 48]
73        STP     d14, d15, [sp, 64]
74
75        # Clamp C pointers
76        CMP     x0, 2                   // if mr < 2
77        ADD     x16, x6, x7             // c1 = c0 + cm_stride
78        CSEL    x16, x6, x16, LO        //   c1 = c0
79
80        ADD     x17, x16, x7            // c2 = c1 + cm_stride
81                                        // if mr <= 2
82        CSEL    x17, x16, x17, LS       //   c2 = c1
83
84        CMP     x0, 4                   // if mr < 4
85        ADD     x7, x17, x7             // c3 = c2 + cm_stride
86        CSEL    x7, x17, x7, LO         //   c3 = c2
87
880:
89        # Load initial bias from w into accumulators
90        LDP     q24, q25, [x5], 32      // row 0 accumulators = 8 floats at w; w += 32
91        MOV     v26.16b, v24.16b        // rows 1-3 start from the same 8 values
92        MOV     v27.16b, v25.16b
93        MOV     v28.16b, v24.16b
94        MOV     v29.16b, v25.16b
95        MOV     v30.16b, v24.16b
96        MOV     v31.16b, v25.16b
97
98        MOV     x9, x3                  // p = ks
99
1001:
101        # Load next 4 A pointers
102        LDP     x20, x13, [x4], 16      // a0, a1; a += 16 bytes
103        LDP     x14, x15, [x4], 16      // a2, a3; a += 16 bytes
104
105        CMP     x20, x12                // if a0 == zero
106        ADD     x20, x20, x11           // a0 += a_offset
107        CSEL    x20, x12, x20, EQ       //   a0 = zero if it was zero, else a0 + a_offset
108        CMP     x13, x12                // if a1 == zero
109        ADD     x13, x13, x11           // a1 += a_offset
110        CSEL    x13, x12, x13, EQ       //   a1 = zero if it was zero, else a1 + a_offset
111        CMP     x14, x12                // if a2 == zero
112        ADD     x14, x14, x11           // a2 += a_offset
113        CSEL    x14, x12, x14, EQ       //   a2 = zero if it was zero, else a2 + a_offset
114        CMP     x15, x12                // if a3 == zero
115        ADD     x15, x15, x11           // a3 += a_offset
116        CSEL    x15, x12, x15, EQ       //   a3 = zero if it was zero, else a3 + a_offset
117
118        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
119        SUBS    x0, x2, 32              // k = kc - 32
120        B.LO    4f                      // kc < 32: skip the pipelined path, remainders only
121
122        # 16 prologue
123        # Read first block of 4 A and B.
124        LDR     q0, [x20], 16           // a0[k+0..k+3]
125        LDP     q16, q17, [x5], 32      // B for k+0 (8 floats)
126        LDR     q1, [x13], 16           // a1[k+0..k+3]
127        LDR     q2, [x14], 16           // a2[k+0..k+3]
128        LDR     q3, [x15], 16           // a3[k+0..k+3]
129        LDP     q18, q19, [x5], 32      // B for k+1
130        LDP     q20, q21, [x5], 32      // B for k+2
131        LDP     q22, q23, [x5], 32      // B for k+3
132
133        # Is there at least 32.  yes do main loop
134        SUBS    x0, x0, 32              // k -= 32 bytes (8 floats)
135        B.LO    3f                      // fewer than 8 further floats: go straight to the epilogue
136
137        # Main loop - 8 floats of A
1382:
139        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
140        FMLA    v24.4s, v16.4s, v0.s[0]  // k+0: row 0, cols 0-3 (v16/v17 x lane 0 of v0-v3)
141        LDP     q8, q9, [x5], 32        // B for k+4
142        FMLA    v25.4s, v17.4s, v0.s[0]  // row 0, cols 4-7
143        FMLA    v26.4s, v16.4s, v1.s[0]  // row 1
144        LDP     q10, q11, [x5], 32      // B for k+5
145        FMLA    v27.4s, v17.4s, v1.s[0]
146        FMLA    v28.4s, v16.4s, v2.s[0]  // row 2
147        LDP     q12, q13, [x5], 32      // B for k+6
148        FMLA    v29.4s, v17.4s, v2.s[0]
149        FMLA    v30.4s, v16.4s, v3.s[0]  // row 3
150        LDP     q14, q15, [x5], 32      // B for k+7
151        FMLA    v31.4s, v17.4s, v3.s[0]
152        FMLA    v24.4s, v18.4s, v0.s[1]  // k+1: v18/v19 x lane 1
153        LDR     q4, [x20], 16           // a0[k+4..k+7]; overwrites the clamp value held in v4
154        FMLA    v25.4s, v19.4s, v0.s[1]
155        FMLA    v26.4s, v18.4s, v1.s[1]
156        LDR     q5, [x13], 16           // a1[k+4..k+7]; overwrites the clamp value held in v5
157        FMLA    v27.4s, v19.4s, v1.s[1]
158        FMLA    v28.4s, v18.4s, v2.s[1]
159        LDR     q6, [x14], 16           // a2[k+4..k+7]
160        FMLA    v29.4s, v19.4s, v2.s[1]
161        FMLA    v30.4s, v18.4s, v3.s[1]
162        LDR     q7, [x15], 16           // a3[k+4..k+7]
163        FMLA    v31.4s, v19.4s, v3.s[1]
164        FMLA    v24.4s, v20.4s, v0.s[2]  // k+2: v20/v21 x lane 2
165        FMLA    v25.4s, v21.4s, v0.s[2]
166        FMLA    v26.4s, v20.4s, v1.s[2]
167        FMLA    v27.4s, v21.4s, v1.s[2]
168        FMLA    v28.4s, v20.4s, v2.s[2]
169        FMLA    v29.4s, v21.4s, v2.s[2]
170        FMLA    v30.4s, v20.4s, v3.s[2]
171        FMLA    v31.4s, v21.4s, v3.s[2]
172        FMLA    v24.4s, v22.4s, v0.s[3]  // k+3: v22/v23 x lane 3
173        FMLA    v25.4s, v23.4s, v0.s[3]
174        FMLA    v26.4s, v22.4s, v1.s[3]
175        FMLA    v27.4s, v23.4s, v1.s[3]
176        FMLA    v28.4s, v22.4s, v2.s[3]
177        FMLA    v29.4s, v23.4s, v2.s[3]
178        FMLA    v30.4s, v22.4s, v3.s[3]
179        FMLA    v31.4s, v23.4s, v3.s[3]
180
181        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
182        FMLA    v24.4s, v8.4s, v4.s[0]  // k+4: v8/v9 x lane 0 of v4-v7
183        LDP     q16, q17, [x5], 32      // B for the next first block, k+0
184        FMLA    v25.4s, v9.4s, v4.s[0]
185        FMLA    v26.4s, v8.4s, v5.s[0]
186        LDP     q18, q19, [x5], 32      // B for the next first block, k+1
187        FMLA    v27.4s, v9.4s, v5.s[0]
188        FMLA    v28.4s, v8.4s, v6.s[0]
189        LDP     q20, q21, [x5], 32      // B for the next first block, k+2
190        FMLA    v29.4s, v9.4s, v6.s[0]
191        FMLA    v30.4s, v8.4s, v7.s[0]
192        LDP     q22, q23, [x5], 32      // B for the next first block, k+3
193        FMLA    v31.4s, v9.4s, v7.s[0]
194        FMLA    v24.4s, v10.4s, v4.s[1]  // k+5: v10/v11 x lane 1
195        LDR     q0, [x20], 16           // a0 for the next first block
196        FMLA    v25.4s, v11.4s, v4.s[1]
197        FMLA    v26.4s, v10.4s, v5.s[1]
198        LDR     q1, [x13], 16           // a1 for the next first block
199        FMLA    v27.4s, v11.4s, v5.s[1]
200        FMLA    v28.4s, v10.4s, v6.s[1]
201        LDR     q2, [x14], 16           // a2 for the next first block
202        FMLA    v29.4s, v11.4s, v6.s[1]
203        FMLA    v30.4s, v10.4s, v7.s[1]
204        LDR     q3, [x15], 16           // a3 for the next first block
205        FMLA    v31.4s, v11.4s, v7.s[1]
206        FMLA    v24.4s, v12.4s, v4.s[2]  // k+6: v12/v13 x lane 2
207        FMLA    v25.4s, v13.4s, v4.s[2]
208        FMLA    v26.4s, v12.4s, v5.s[2]
209        FMLA    v27.4s, v13.4s, v5.s[2]
210        FMLA    v28.4s, v12.4s, v6.s[2]
211        FMLA    v29.4s, v13.4s, v6.s[2]
212        FMLA    v30.4s, v12.4s, v7.s[2]
213        FMLA    v31.4s, v13.4s, v7.s[2]
214        FMLA    v24.4s, v14.4s, v4.s[3]  // k+7: v14/v15 x lane 3
215        FMLA    v25.4s, v15.4s, v4.s[3]
216        FMLA    v26.4s, v14.4s, v5.s[3]
217        FMLA    v27.4s, v15.4s, v5.s[3]
218        FMLA    v28.4s, v14.4s, v6.s[3]
219        FMLA    v29.4s, v15.4s, v6.s[3]
220        SUBS    x0, x0, 32              // k -= 32 bytes; flags are consumed by B.HS below
221        FMLA    v30.4s, v14.4s, v7.s[3]
222        FMLA    v31.4s, v15.4s, v7.s[3]
223
224        B.HS    2b                      // repeat while k did not drop below zero
225
2263:
227        # Epilogue
228        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
229        FMLA    v24.4s, v16.4s, v0.s[0]  // consumes the A/B registers preloaded by the prologue or main loop
230        LDP     q8, q9, [x5], 32        // B for k+4
231        FMLA    v25.4s, v17.4s, v0.s[0]
232        FMLA    v26.4s, v16.4s, v1.s[0]
233        LDP     q10, q11, [x5], 32      // B for k+5
234        FMLA    v27.4s, v17.4s, v1.s[0]
235        FMLA    v28.4s, v16.4s, v2.s[0]
236        LDP     q12, q13, [x5], 32      // B for k+6
237        FMLA    v29.4s, v17.4s, v2.s[0]
238        FMLA    v30.4s, v16.4s, v3.s[0]
239        LDP     q14, q15, [x5], 32      // B for k+7
240        FMLA    v31.4s, v17.4s, v3.s[0]
241        FMLA    v24.4s, v18.4s, v0.s[1]
242        LDR     q4, [x20], 16           // a0[k+4..k+7]
243        FMLA    v25.4s, v19.4s, v0.s[1]
244        FMLA    v26.4s, v18.4s, v1.s[1]
245        LDR     q5, [x13], 16           // a1[k+4..k+7]
246        FMLA    v27.4s, v19.4s, v1.s[1]
247        FMLA    v28.4s, v18.4s, v2.s[1]
248        LDR     q6, [x14], 16           // a2[k+4..k+7]
249        FMLA    v29.4s, v19.4s, v2.s[1]
250        FMLA    v30.4s, v18.4s, v3.s[1]
251        LDR     q7, [x15], 16           // a3[k+4..k+7]
252        FMLA    v31.4s, v19.4s, v3.s[1]
253        FMLA    v24.4s, v20.4s, v0.s[2]
254        FMLA    v25.4s, v21.4s, v0.s[2]
255        FMLA    v26.4s, v20.4s, v1.s[2]
256        FMLA    v27.4s, v21.4s, v1.s[2]
257        FMLA    v28.4s, v20.4s, v2.s[2]
258        FMLA    v29.4s, v21.4s, v2.s[2]
259        FMLA    v30.4s, v20.4s, v3.s[2]
260        FMLA    v31.4s, v21.4s, v3.s[2]
261        FMLA    v24.4s, v22.4s, v0.s[3]
262        FMLA    v25.4s, v23.4s, v0.s[3]
263        FMLA    v26.4s, v22.4s, v1.s[3]
264        FMLA    v27.4s, v23.4s, v1.s[3]
265        FMLA    v28.4s, v22.4s, v2.s[3]
266        FMLA    v29.4s, v23.4s, v2.s[3]
267        FMLA    v30.4s, v22.4s, v3.s[3]
268        FMLA    v31.4s, v23.4s, v3.s[3]
269
270        # Second block of 4.  FMA for second 4, noloads
271        FMLA    v24.4s, v8.4s, v4.s[0]
272        FMLA    v25.4s, v9.4s, v4.s[0]
273        FMLA    v26.4s, v8.4s, v5.s[0]
274        FMLA    v27.4s, v9.4s, v5.s[0]
275        FMLA    v28.4s, v8.4s, v6.s[0]
276        FMLA    v29.4s, v9.4s, v6.s[0]
277        FMLA    v30.4s, v8.4s, v7.s[0]
278        FMLA    v31.4s, v9.4s, v7.s[0]
279        FMLA    v24.4s, v10.4s, v4.s[1]
280        FMLA    v25.4s, v11.4s, v4.s[1]
281        FMLA    v26.4s, v10.4s, v5.s[1]
282        FMLA    v27.4s, v11.4s, v5.s[1]
283        FMLA    v28.4s, v10.4s, v6.s[1]
284        FMLA    v29.4s, v11.4s, v6.s[1]
285        FMLA    v30.4s, v10.4s, v7.s[1]
286        FMLA    v31.4s, v11.4s, v7.s[1]
287        FMLA    v24.4s, v12.4s, v4.s[2]
288        FMLA    v25.4s, v13.4s, v4.s[2]
289        FMLA    v26.4s, v12.4s, v5.s[2]
290        FMLA    v27.4s, v13.4s, v5.s[2]
291        FMLA    v28.4s, v12.4s, v6.s[2]
292        FMLA    v29.4s, v13.4s, v6.s[2]
293        FMLA    v30.4s, v12.4s, v7.s[2]
294        FMLA    v31.4s, v13.4s, v7.s[2]
295
296        FMLA    v24.4s, v14.4s, v4.s[3]
297        FMLA    v25.4s, v15.4s, v4.s[3]
298        FMLA    v26.4s, v14.4s, v5.s[3]
299        FMLA    v27.4s, v15.4s, v5.s[3]  // last read of v4/v5 as A data
300
301        # Load min/max values
302        LD2R    {v4.4s, v5.4s}, [x8]    // v4/v5 held A data above; restore the clamp values from params
303
304        FMLA    v28.4s, v14.4s, v6.s[3]  // remaining FMAs read only v6/v7
305        FMLA    v29.4s, v15.4s, v6.s[3]
306        FMLA    v30.4s, v14.4s, v7.s[3]
307        FMLA    v31.4s, v15.4s, v7.s[3]
308
3094:
310        # Remainder- 4 floats of A
        // Only multiples of 32 were subtracted from kc to form x0, so bits 4:2 of x0
        // still equal those of kc; the three TBZ tests below read the leftover byte count.
311        TBZ     x0, 4, 5f               // bit 4 clear: no 16-byte (4-float) remainder
312
313        LDR     q0, [x20], 16           // a0, 4 floats
314        LDP     q16, q17, [x5], 32      // B for k+0
315        LDR     q1, [x13], 16           // a1, 4 floats
316        LDR     q2, [x14], 16           // a2, 4 floats
317        LDR     q3, [x15], 16           // a3, 4 floats
318        FMLA    v24.4s, v16.4s, v0.s[0]
319        FMLA    v25.4s, v17.4s, v0.s[0]
320        LDP     q18, q19, [x5], 32      // B for k+1
321        FMLA    v26.4s, v16.4s, v1.s[0]
322        FMLA    v27.4s, v17.4s, v1.s[0]
323        LDP     q20, q21, [x5], 32      // B for k+2
324        FMLA    v28.4s, v16.4s, v2.s[0]
325        FMLA    v29.4s, v17.4s, v2.s[0]
326        LDP     q22, q23, [x5], 32      // B for k+3
327        FMLA    v30.4s, v16.4s, v3.s[0]
328        FMLA    v31.4s, v17.4s, v3.s[0]
329        FMLA    v24.4s, v18.4s, v0.s[1]
330        FMLA    v25.4s, v19.4s, v0.s[1]
331        FMLA    v26.4s, v18.4s, v1.s[1]
332        FMLA    v27.4s, v19.4s, v1.s[1]
333        FMLA    v28.4s, v18.4s, v2.s[1]
334        FMLA    v29.4s, v19.4s, v2.s[1]
335        FMLA    v30.4s, v18.4s, v3.s[1]
336        FMLA    v31.4s, v19.4s, v3.s[1]
337        FMLA    v24.4s, v20.4s, v0.s[2]
338        FMLA    v25.4s, v21.4s, v0.s[2]
339        FMLA    v26.4s, v20.4s, v1.s[2]
340        FMLA    v27.4s, v21.4s, v1.s[2]
341        FMLA    v28.4s, v20.4s, v2.s[2]
342        FMLA    v29.4s, v21.4s, v2.s[2]
343        FMLA    v30.4s, v20.4s, v3.s[2]
344        FMLA    v31.4s, v21.4s, v3.s[2]
345        FMLA    v24.4s, v22.4s, v0.s[3]
346        FMLA    v25.4s, v23.4s, v0.s[3]
347        FMLA    v26.4s, v22.4s, v1.s[3]
348        FMLA    v27.4s, v23.4s, v1.s[3]
349        FMLA    v28.4s, v22.4s, v2.s[3]
350        FMLA    v29.4s, v23.4s, v2.s[3]
351        FMLA    v30.4s, v22.4s, v3.s[3]
352        FMLA    v31.4s, v23.4s, v3.s[3]
353
3545:
355        # Remainder- 2 floats of A
356        TBZ     x0, 3, 6f               // bit 3 clear: no 8-byte (2-float) remainder
357
358        LDR     d0, [x20], 8            // a0, 2 floats
359        LDP     q16, q17, [x5], 32      // B for k+0
360        LDR     d1, [x13], 8            // a1, 2 floats
361        LDR     d2, [x14], 8            // a2, 2 floats
362        LDR     d3, [x15], 8            // a3, 2 floats
363        FMLA    v24.4s, v16.4s, v0.s[0]
364        FMLA    v25.4s, v17.4s, v0.s[0]
365        LDP     q18, q19, [x5], 32      // B for k+1
366        FMLA    v26.4s, v16.4s, v1.s[0]
367        FMLA    v27.4s, v17.4s, v1.s[0]
368        FMLA    v28.4s, v16.4s, v2.s[0]
369        FMLA    v29.4s, v17.4s, v2.s[0]
370        FMLA    v30.4s, v16.4s, v3.s[0]
371        FMLA    v31.4s, v17.4s, v3.s[0]
372        FMLA    v24.4s, v18.4s, v0.s[1]
373        FMLA    v25.4s, v19.4s, v0.s[1]
374        FMLA    v26.4s, v18.4s, v1.s[1]
375        FMLA    v27.4s, v19.4s, v1.s[1]
376        FMLA    v28.4s, v18.4s, v2.s[1]
377        FMLA    v29.4s, v19.4s, v2.s[1]
378        FMLA    v30.4s, v18.4s, v3.s[1]
379        FMLA    v31.4s, v19.4s, v3.s[1]
380
3816:
382        # Remainder- 1 float of A
383        TBZ     x0, 2, 7f               // bit 2 clear: no 4-byte (1-float) remainder
384
385        LDR     s0, [x20], 4            // a0, 1 float
386        LDP     q16, q17, [x5], 32      // B for k+0
387        LDR     s1, [x13], 4            // a1, 1 float
388        LDR     s2, [x14], 4            // a2, 1 float
389        LDR     s3, [x15], 4            // a3, 1 float
390        FMLA    v24.4s, v16.4s, v0.s[0]
391        FMLA    v25.4s, v17.4s, v0.s[0]
392        FMLA    v26.4s, v16.4s, v1.s[0]
393        FMLA    v27.4s, v17.4s, v1.s[0]
394        FMLA    v28.4s, v16.4s, v2.s[0]
395        FMLA    v29.4s, v17.4s, v2.s[0]
396        FMLA    v30.4s, v16.4s, v3.s[0]
397        FMLA    v31.4s, v17.4s, v3.s[0]
398
3997:
400        # ks loop
401        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
402        B.HI    1b                      // loop while p is still above zero
403
404        # Clamp
405        FMAX    v24.4s, v24.4s, v4.4s   // lower bound: acc = max(acc, v4)
406        FMAX    v25.4s, v25.4s, v4.4s
407        FMAX    v26.4s, v26.4s, v4.4s
408        FMAX    v27.4s, v27.4s, v4.4s
409        FMAX    v28.4s, v28.4s, v4.4s
410        FMAX    v29.4s, v29.4s, v4.4s
411        FMAX    v30.4s, v30.4s, v4.4s
412        FMAX    v31.4s, v31.4s, v4.4s
413        FMIN    v24.4s, v24.4s, v5.4s   // upper bound: acc = min(acc, v5)
414        FMIN    v25.4s, v25.4s, v5.4s
415        FMIN    v26.4s, v26.4s, v5.4s
416        FMIN    v27.4s, v27.4s, v5.4s
417        FMIN    v28.4s, v28.4s, v5.4s
418        FMIN    v29.4s, v29.4s, v5.4s
419        FMIN    v30.4s, v30.4s, v5.4s
420        FMIN    v31.4s, v31.4s, v5.4s
421
422        # Store full 4 x 8
423        SUBS    x1, x1, 8               // nc -= 8
424        B.LO    8f                      // fewer than 8 columns were left: partial store
425
426        STP     q30, q31,  [x7]         // row 3, 8 columns
427        ADD     x7,  x7, x10            // c3 += cn_stride
428        STP     q28, q29, [x17]         // row 2, 8 columns
429        ADD     x17, x17, x10           // c2 += cn_stride
430        STP     q26, q27, [x16]         // row 1, 8 columns
431        ADD     x16, x16, x10           // c1 += cn_stride
432        STP     q24, q25,  [x6]         // row 0, 8 columns
433        ADD     x6,  x6, x10            // c0 += cn_stride
434
435        SUB     x4, x4, x3              // a -= ks
436
437        # nc loop
438        B.HI    0b                      // flags still from SUBS x1 above (STP/ADD/SUB do not set them)
439
440        # Restore d8-d15 from stack
441        LDP     d14, d15, [sp, 64]
442        LDP     d12, d13, [sp, 48]
443        LDP     d10, d11, [sp, 32]
444        LDP     d8,  d9, [sp, 16]
445
446        # Restore x20 from stack
447        LDR     x20, [sp], 80           // reload x20, then sp += 80
448        RET
449
450        # Store odd width
4518:
        // x1 = nc - 8 here; subtracting 8 leaves bits 2:0 unchanged, so they still
        // give the number of columns left (4, 2 and 1 tested in turn).
452        TBZ     x1, 2, 9f               // skip unless 4 columns are to be stored
453        STR     q30, [x7], 16           // row 3, 4 columns; c3 += 16
454        MOV     v30.16b, v31.16b        // upper 4 columns become the next ones to store
455        STR     q28, [x17], 16          // row 2
456        MOV     v28.16b, v29.16b
457        STR     q26, [x16], 16          // row 1
458        MOV     v26.16b, v27.16b
459        STR     q24, [x6], 16           // row 0
460        MOV     v24.16b, v25.16b
461
4629:
463        TBZ     x1, 1, 10f              // skip unless 2 columns are to be stored
464        STR     d30, [x7], 8            // row 3, 2 columns; c3 += 8
465        STR     d28, [x17], 8           // row 2
466        DUP     d30, v30.d[1]           // move the upper 64 bits down for the final column
467        DUP     d28, v28.d[1]
468        STR     d26, [x16], 8           // row 1
469        STR     d24, [x6], 8            // row 0
470        DUP     d26, v26.d[1]
471        DUP     d24, v24.d[1]
472
47310:
474        TBZ     x1, 0, 11f              // skip unless 1 column is to be stored
475        STR     s30,  [x7]              // row 3, last column
476        STR     s28, [x17]              // row 2
477        STR     s26, [x16]              // row 1
478        STR     s24,  [x6]              // row 0
47911:
480        # Restore d8-d15 from stack
481        LDP     d14, d15, [sp, 64]
482        LDP     d12, d13, [sp, 48]
483        LDP     d10, d11, [sp, 32]
484        LDP     d8,  d9, [sp, 16]
485
486        # Restore x20 from stack
487        LDR     x20, [sp], 80           // reload x20, then sp += 80
488        RET
489
490END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75
491
492#ifdef __ELF__
// ELF only: emit an empty .note.GNU-stack section, the conventional marker that
// this object does not need an executable stack.
493.section ".note.GNU-stack","",%progbits
494#endif
495