// xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0  v0  v4
# A1  v1  v5
# A2  v2  v6
# A3  v3  v7
# B   v8  v9 v10 v11
# B  v12 v13 v14 v15
# B  v16 v17 v18 v19
# B  v20 v21 v22 v23
# C  v24 v25
# C  v26 v27
# C  v28 v29
# C  v30 v31
# Clamp v4 v5

// Overview
// --------
// Computes a tile of up to 4 rows x 8 columns of single-precision
//   C = clamp(bias + A * B, min, max)
// per pass of the outer loop (label 0), walking across nc in steps of 8
// columns.
//
// * mr selects how many of the 4 rows are real.  Pointers of rows beyond mr
//   are aliased to the previous row, so all 4 rows are always computed and
//   stored, but no extra memory is touched.
// * kc is a byte count (32 bytes == 8 floats of A per row).  x2 keeps the
//   original kc so the A pointers can be rewound; x0 is reused as the running
//   k counter once the row pointers have been derived from mr.
//   NOTE(review): only bits 4, 3 and 2 of the counter are tested for the
//   remainder, so kc is presumably a non-zero multiple of 4 bytes - confirm
//   against the caller.
// * w is consumed strictly sequentially: 8 bias floats, then 8 B floats for
//   every k element, then the same again for the next 8-column tile.
// * params is read as two consecutive floats: the first is broadcast to v4
//   and used as the lower bound (FMAX), the second is broadcast to v5 and used
//   as the upper bound (FMIN).
// * v8-v15 hold B values in the unrolled loop, so d8-d15 are spilled to a
//   64-byte stack frame and restored on both exit paths.
// * v4/v5 are also used as A registers inside the unrolled loop, which is why
//   the clamp values are reloaded at the end of the epilogue.

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, params pointer
        // Read the stack arguments before sp is moved by the d8-d15 spill.
        LDP     x14, x8, [sp]

        # Load min/max values
        LD2R    {v4.4s, v5.4s}, [x8]    // v4 = splat(params[0]), v5 = splat(params[1])

        # Save d8-d15 on stack
        STP     d8,  d9, [sp, -64]!
        STP     d10, d11, [sp, 16]
        STP     d12, d13, [sp, 32]
        STP     d14, d15, [sp, 48]

        # Clamp A and C pointers
        // ADD does not write the flags, so each CMP result stays live across
        // the pointer arithmetic until the CSELs that consume it.
        CMP     x0, 2                   // if mr < 2
        ADD     x11, x3, x4             // a1 = a0 + a_stride
        ADD     x9, x6, x7              // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO        //   a1 = a0
        CSEL    x9, x6, x9, LO          //   c1 = c0

        ADD     x12, x11, x4            // a2 = a1 + a_stride
        ADD     x10, x9, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x12, x11, x12, LS       //   a2 = a1
        CSEL    x10, x9, x10, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x12, x4             // a3 = a2 + a_stride
        ADD     x7, x10, x7             // c3 = c2 + cm_stride
        CSEL    x4, x12, x4, LO         //   a3 = a2
        CSEL    x7, x10, x7, LO         //   c3 = c2

        // Outer loop: one iteration per tile of 8 output columns.
0:
        # Load initial bias from w into accumulators
        LDP     q24, q25, [x5], 32
        MOV     v26.16b, v24.16b        // every row starts from the same 8 bias values
        MOV     v27.16b, v25.16b
        MOV     v28.16b, v24.16b
        MOV     v29.16b, v25.16b
        MOV     v30.16b, v24.16b
        MOV     v31.16b, v25.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    3f                      // fewer than 8 floats: remainder code only

        # 16 prologue
        # Read first block of 4 A and B.
        LDR     q0,  [x3], 16
        LDP     q16, q17, [x5], 32
        LDR     q1, [x11], 16
        LDR     q2, [x12], 16
        LDR     q3,  [x4], 16
        LDP     q18, q19, [x5], 32
        LDP     q20, q21, [x5], 32
        LDP     q22, q23, [x5], 32

        # Is there at least 32.  yes do main loop
        SUBS    x0, x0, 32
        B.LO    2f                      // no second group of 8 floats: go straight to the epilogue

        # Main loop - 8 floats of A (32 bytes)
        // Software pipelined: each half consumes the registers loaded by the
        // other half while loading the registers the other half will consume.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v24.4s, v16.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        LDR     q4, [x3], 16            // overwrites the clamp minimum in v4
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        LDR     q5, [x11], 16           // overwrites the clamp maximum in v5
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        LDR     q6, [x12], 16
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        LDR     q7, [x4], 16
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA    v24.4s, v8.4s, v4.s[0]
        LDP     q16, q17, [x5], 32
        FMLA    v25.4s, v9.4s, v4.s[0]
        FMLA    v26.4s, v8.4s, v5.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v27.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v31.4s, v9.4s, v7.s[0]
        FMLA    v24.4s, v10.4s, v4.s[1]
        LDR     q0, [x3], 16
        FMLA    v25.4s, v11.4s, v4.s[1]
        FMLA    v26.4s, v10.4s, v5.s[1]
        LDR     q1, [x11], 16
        FMLA    v27.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        LDR     q2, [x12], 16
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        LDR     q3, [x4], 16
        FMLA    v31.4s, v11.4s, v7.s[1]
        FMLA    v24.4s, v12.4s, v4.s[2]
        FMLA    v25.4s, v13.4s, v4.s[2]
        FMLA    v26.4s, v12.4s, v5.s[2]
        FMLA    v27.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]
        FMLA    v24.4s, v14.4s, v4.s[3]
        FMLA    v25.4s, v15.4s, v4.s[3]
        FMLA    v26.4s, v14.4s, v5.s[3]
        FMLA    v27.4s, v15.4s, v5.s[3]
        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        SUBS    x0, x0, 32              // k -= 32; the FMLAs below leave the flags intact
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]
        B.HS    1b                      // loop while another 8 floats remain

2:
        # Epilogue
        // Drains the pipeline: same as one main-loop iteration, but the second
        // half issues no loads for a following iteration.
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v24.4s, v16.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        LDR     q4, [x3], 16
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        LDR     q5, [x11], 16
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        LDR     q6, [x12], 16
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        LDR     q7, [x4], 16
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, noloads
        FMLA    v24.4s, v8.4s, v4.s[0]
        FMLA    v25.4s, v9.4s, v4.s[0]
        FMLA    v26.4s, v8.4s, v5.s[0]
        FMLA    v27.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        FMLA    v31.4s, v9.4s, v7.s[0]

        FMLA    v24.4s, v10.4s, v4.s[1]
        FMLA    v25.4s, v11.4s, v4.s[1]
        FMLA    v26.4s, v10.4s, v5.s[1]
        FMLA    v27.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        FMLA    v31.4s, v11.4s, v7.s[1]

        FMLA    v24.4s, v12.4s, v4.s[2]
        FMLA    v25.4s, v13.4s, v4.s[2]
        FMLA    v26.4s, v12.4s, v5.s[2]
        FMLA    v27.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]

        FMLA    v24.4s, v14.4s, v4.s[3]
        FMLA    v25.4s, v15.4s, v4.s[3]
        FMLA    v26.4s, v14.4s, v5.s[3]
        FMLA    v27.4s, v15.4s, v5.s[3]

        # Load min/max values
        // The FMLAs above were the last reads of v4/v5 as A data; the
        // remaining ones only use v6/v7, so the clamp values can be restored.
        LD2R    {v4.4s, v5.4s}, [x8]

        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]

        // Remainder handling.  x0 equals kc minus a multiple of 32 here, so
        // its bits 4, 3 and 2 still match those of kc and select 4, 2 and 1
        // leftover floats respectively.
3:
        # Remainder- 4 floats of A (16 bytes)
        TBZ     x0, 4, 4f

        LDR     q0,  [x3], 16
        LDP     q16, q17, [x5], 32
        LDR     q1, [x11], 16
        LDR     q2, [x12], 16
        LDR     q3,  [x4], 16
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ     x0, 3, 5f

        LDR     d0,  [x3], 8
        LDP     q16, q17, [x5], 32
        LDR     d1, [x11], 8
        LDR     d2, [x12], 8
        LDR     d3,  [x4], 8
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        FMLA    v31.4s, v19.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ     x0, 2, 6f

        LDR     s0,  [x3], 4
        LDP     q16, q17, [x5], 32
        LDR     s1, [x11], 4
        LDR     s2, [x12], 4
        LDR     s3,  [x4], 4
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]

6:
        # Clamp
        FMAX    v24.4s, v24.4s, v4.4s   // apply lower bound (v4)
        SUBS    x1, x1, 8               // nc -= 8; flags consumed by B.LO / B.HI below
        FMAX    v25.4s, v25.4s, v4.4s
        FMAX    v26.4s, v26.4s, v4.4s
        FMAX    v27.4s, v27.4s, v4.4s
        FMAX    v28.4s, v28.4s, v4.4s
        FMAX    v29.4s, v29.4s, v4.4s
        FMAX    v30.4s, v30.4s, v4.4s
        FMAX    v31.4s, v31.4s, v4.4s
        FMIN    v24.4s, v24.4s, v5.4s   // apply upper bound (v5)
        FMIN    v25.4s, v25.4s, v5.4s
        FMIN    v26.4s, v26.4s, v5.4s
        FMIN    v27.4s, v27.4s, v5.4s
        FMIN    v28.4s, v28.4s, v5.4s
        FMIN    v29.4s, v29.4s, v5.4s
        FMIN    v30.4s, v30.4s, v5.4s
        FMIN    v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        B.LO    7f                      // fewer than 8 columns were left: partial store

        // Store the tile, step every C pointer by cn_stride and rewind every
        // A pointer by kc so the next column tile re-reads the same A rows.
        STP     q24, q25,  [x6]
        SUB     x3,  x3, x2             // a0 -= kc
        ADD     x6,  x6, x14
        STP     q26, q27,  [x9]
        SUB     x11, x11, x2            // a1 -= kc
        ADD     x9,  x9, x14
        STP     q28, q29, [x10]
        SUB     x12, x12, x2            // a2 -= kc
        ADD     x10, x10, x14
        STP     q30, q31,  [x7]
        SUB     x4,  x4, x2             // a3 -= kc
        ADD     x7,  x7, x14

        // Flags are still those of SUBS x1 (STP/SUB/ADD do not set them).
        B.HI    0b                      // columns remain: next tile

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 64
        RET

        # Store odd width
        // x1 is now (columns left - 8); subtracting 8 leaves bits 2..0
        // unchanged, so they still encode the 1-7 columns to store as 4+2+1.
7:
        TBZ     x1, 2, 8f
        STR     q24, [x6], 16           // store 4 columns, then move the upper 4 down
        MOV     v24.16b, v25.16b
        STR     q26, [x9], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x10], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b

8:
        TBZ     x1, 1, 9f
        STR     d24, [x6], 8            // store 2 columns, then move the next 2 down
        STR     d26, [x9], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x10], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s24,  [x6]              // store the final single column
        STR     s26,  [x9]
        STR     s28, [x10]
        STR     s30,  [x7]
10:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
