xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-ld64.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-igemm/6x8-aarch64-neonfma-ld64.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64(
13#     size_t mr,                         x0
14#     size_t nc,                         x1
15#     size_t kc,                         x2 / x0
16#     size_t ks,                         x3 / x9
17#     const float**restrict a,           x4
18#     const void*restrict w,             x5
19#     uint8_t*restrict c,                x6
20#     size_t cm_stride,                  x7
21#     size_t cn_stride,                  [sp] -> (x0)
22#     size_t a_offset,                   [sp + 8] -> x11
23#     const float* zero,                 [sp + 16] -> x12
24#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
25
26# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
27
28# A pointers
29# x14 a0
30# x15 a1
31# x20 a2
32# x21 a3
33# x22 a4
34# x23 a5
35
36# C pointers
37#  x6 c0
38# x16 c1
39# x17 c2
40# x10 c3
41# x13 c4
42#  x7 c5
43
44# Vector register usage
45# A0   v0
46# A1   v1
47# A2   v2
48# A3   v3
49# A4   v4
50# A5   v5
51# B   v16 v17 v18 v19
52# C   v20 v21
53# C   v22 v23
54# C   v24 v25
55# C   v26 v27
56# C   v28 v29
57# C   v30 v31
58# Clamp v6 v7
59# unused A   v8 v9 v10 v11
60# unused B   v12 v13 v14 v15
61
62BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64
        // Indirect GEMM micro-kernel: accumulates a tile of up to 6 rows x 8
        // columns of f32 values per nc iteration, clamps the tile against the
        // two values loaded from params (v6 / v7), and stores it through the
        // six C row pointers.
        //
        // Register arguments on entry (per the signature comment above):
        //   x0 = mr, x1 = nc, x2 = kc, x3 = ks, x4 = a, x5 = w, x6 = c,
        //   x7 = cm_stride
        // Stack arguments on entry:
        //   [sp] = cn_stride, [sp + 8] = a_offset, [sp + 16] = zero,
        //   [sp + 24] = params
        // kc is consumed in bytes (8 per main-loop step, 4 in the remainder)
        // and ks in bytes of pointer storage (48 per group of 6 A pointers).
63
64        # Load zero, params pointer
65        LDP     x12, x8, [sp, 16]       // x12 = zero, x8 = params (sp not yet adjusted)
66
67        # Clamp C pointers
        // Rows at index >= mr reuse the pointer of the previous row, so the
        // kernel can always compute and store 6 rows without writing past
        // row mr - 1.
68        CMP     x0, 2                   // if mr < 2
69        ADD     x16, x6, x7             // c1 = c0 + cm_stride
70        CSEL    x16, x6, x16, LO        //   c1 = c0
71
72        # Load min/max values
73        LD2R    {v6.4s, v7.4s}, [x8]    // v6 = params[0], v7 = params[1], each replicated to 4 lanes
74
75        ADD     x17, x16, x7            // c2 = c1 + cm_stride
76                                        // if mr <= 2 (flags still from CMP x0, 2)
77        CSEL    x17, x16, x17, LS       //   c2 = c1
78
79        # Save x20,x21,x22,x23 on stack
        // Pre-decrements sp by 32, so every stack argument is now 32 bytes
        // further from sp than it was on entry.
80        STP     x20, x21, [sp, -32]!
81
82        CMP     x0, 4                   // if mr < 4
83        ADD     x10, x17, x7            // c3 = c2 + cm_stride
84        CSEL    x10, x17, x10, LO       //   c3 = c2
85
86        STP     x22, x23, [sp, 16]
87
88        ADD     x13, x10, x7            // c4 = c3 + cm_stride
89                                        // if mr <= 4 (flags still from CMP x0, 4)
90        CSEL    x13, x10, x13, LS       //   c4 = c3
91
92        # Load a_offset
93        LDR     x11, [sp, 40]           // entry [sp + 8], plus the 32-byte save area
94
95        CMP     x0, 6                   // if mr < 6
96        ADD     x7, x13, x7             // c5 = c4 + cm_stride
97        CSEL    x7, x13, x7, LO         //   c5 = c4
        // From here on x0 (mr) and x7 (cm_stride) are no longer needed in
        // their original roles: x7 holds c5 and x0 is reused as the k counter
        // and later as cn_stride.
98
990:
        // nc loop: one iteration per block of up to 8 output columns.
100        # Load initial bias from w into accumulators
101        LDP     q20, q21, [x5], 32      // 8 floats from w; w += 32
        // Rows 1-5 start from the same 8 values as row 0.
102        MOV     v22.16b, v20.16b
103        MOV     v23.16b, v21.16b
104        MOV     v24.16b, v20.16b
105        MOV     v25.16b, v21.16b
106        MOV     v26.16b, v20.16b
107        MOV     v27.16b, v21.16b
108        MOV     v28.16b, v20.16b
109        MOV     v29.16b, v21.16b
110        MOV     v30.16b, v20.16b
111        MOV     v31.16b, v21.16b
112
113        MOV     x9, x3                  // p = ks
114
1151:
        // ks loop: one iteration per group of 6 A row pointers.
116        # Load next 6 A pointers
117        LDP     x14, x15, [x4], 16
118        LDP     x20, x21, [x4], 16
119        LDP     x22, x23, [x4], 16
120
        // A pointer equal to `zero` is used as-is; any other pointer gets
        // a_offset added.
121        CMP     x14, x12                // if a0 == zero
122        ADD     x14, x14, x11           // a0 += a_offset
123        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 = a0 + a_offset
124        CMP     x15, x12                // if a1 == zero
125        ADD     x15, x15, x11           // a1 += a_offset
126        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 = a1 + a_offset
127        CMP     x20, x12                // if a2 == zero
128        ADD     x20, x20, x11           // a2 += a_offset
129        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 = a2 + a_offset
130        CMP     x21, x12                // if a3 == zero
131        ADD     x21, x21, x11           // a3 += a_offset
132        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 = a3 + a_offset
133        CMP     x22, x12                // if a4 == zero
134        ADD     x22, x22, x11           // a4 += a_offset
135        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 = a4 + a_offset
136        CMP     x23, x12                // if a5 == zero
137        ADD     x23, x23, x11           // a5 += a_offset
138        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 = a5 + a_offset
139
140        # Is there at least 2 floats (8 bytes) for main loop?
141        SUBS    x0, x2, 8               // k = kc - 8
142        B.LO    4f                      // kc < 8: skip the main loop, run only the 1-float remainder
143
144        # Main loop - 2 floats of A (8 bytes)
145        # 24 FMA + 6 LD64 A + 2 LDP B
        // Each iteration reads 8 bytes from every A row and 64 bytes from w.
        // v16/v17 are multiplied by lane 0 of each A register, v18/v19 by
        // lane 1.
1462:
147        LDR     d0, [x14], 8
148        LDP     q16,  q17, [x5], 32
149        LDR     d1, [x15], 8
150        LDR     d2, [x20], 8
151        LDR     d3, [x21], 8
152        LDR     d4, [x22], 8
153        LDR     d5, [x23], 8
154        FMLA    v20.4s, v16.4s,  v0.s[0]
155        FMLA    v22.4s, v16.4s,  v1.s[0]
156        FMLA    v24.4s, v16.4s,  v2.s[0]
157        FMLA    v26.4s, v16.4s,  v3.s[0]
158        LDP     q18,  q19, [x5], 32
159        FMLA    v28.4s, v16.4s,  v4.s[0]
160        FMLA    v30.4s, v16.4s,  v5.s[0]
161        FMLA    v21.4s, v17.4s,  v0.s[0]
162        FMLA    v23.4s, v17.4s,  v1.s[0]
163        FMLA    v25.4s, v17.4s,  v2.s[0]
164        FMLA    v27.4s, v17.4s,  v3.s[0]
165        FMLA    v29.4s, v17.4s,  v4.s[0]
166        FMLA    v31.4s, v17.4s,  v5.s[0]
167
168        FMLA    v20.4s, v18.4s,  v0.s[1]
169        FMLA    v22.4s, v18.4s,  v1.s[1]
170        FMLA    v24.4s, v18.4s,  v2.s[1]
171        FMLA    v26.4s, v18.4s,  v3.s[1]
172        FMLA    v28.4s, v18.4s,  v4.s[1]
173        FMLA    v30.4s, v18.4s,  v5.s[1]
174        FMLA    v21.4s, v19.4s,  v0.s[1]
175        FMLA    v23.4s, v19.4s,  v1.s[1]
176        FMLA    v25.4s, v19.4s,  v2.s[1]
177        FMLA    v27.4s, v19.4s,  v3.s[1]
178        SUBS    x0, x0, 8               // k -= 8; flags consumed by B.HS below
179        FMLA    v29.4s, v19.4s,  v4.s[1]
180        FMLA    v31.4s, v19.4s,  v5.s[1]
181        B.HS    2b                      // loop while the subtraction did not borrow
182
183        # Is there a remainder?- 1 float of A (4 bytes)
        // Only multiples of 8 were subtracted from kc, so bit 2 of x0 still
        // equals bit 2 of kc: set means 4 bytes of each A row are left.
184        TBNZ    x0, 2, 4f
185
1863:
187        # ks loop
188        SUBS    x9, x9, 48              // p -= MR * sizeof(void*)
189        B.HI    1b                      // more A pointer groups remain
190
191        # Clamp
        // FMAX against v6 applies the lower bound, FMIN against v7 the upper.
192        FMAX    v20.4s, v20.4s, v6.4s
193        # Load cn_stride
194        LDR     x0, [sp, 32]            // entry [sp], plus the 32-byte save area
195        FMAX    v21.4s, v21.4s, v6.4s
196        FMAX    v22.4s, v22.4s, v6.4s
197        FMAX    v23.4s, v23.4s, v6.4s
198        FMAX    v24.4s, v24.4s, v6.4s
199        FMAX    v25.4s, v25.4s, v6.4s
200        FMAX    v26.4s, v26.4s, v6.4s
201        FMAX    v27.4s, v27.4s, v6.4s
202        FMAX    v28.4s, v28.4s, v6.4s
203        FMAX    v29.4s, v29.4s, v6.4s
204        FMAX    v30.4s, v30.4s, v6.4s
205        FMAX    v31.4s, v31.4s, v6.4s
        // nc -= 8. These flags are tested twice: by B.LO 5f (partial tile)
        // and by B.HI 0b (more columns). No instruction in between writes
        // the condition flags.
206        SUBS    x1, x1, 8
207        FMIN    v20.4s, v20.4s, v7.4s
208        FMIN    v21.4s, v21.4s, v7.4s
209        FMIN    v22.4s, v22.4s, v7.4s
210        FMIN    v23.4s, v23.4s, v7.4s
211        FMIN    v24.4s, v24.4s, v7.4s
212        FMIN    v25.4s, v25.4s, v7.4s
213        FMIN    v26.4s, v26.4s, v7.4s
214        FMIN    v27.4s, v27.4s, v7.4s
215        FMIN    v28.4s, v28.4s, v7.4s
216        FMIN    v29.4s, v29.4s, v7.4s
217        FMIN    v30.4s, v30.4s, v7.4s
218        FMIN    v31.4s, v31.4s, v7.4s
219
220        # Store full 6 x 8
221        B.LO    5f                      // fewer than 8 columns left: partial store
222
        // Rows are written from c5 down to c0. Where clamped row pointers
        // alias each other (mr < 6), the lowest-numbered row is stored last
        // and is the one that remains in memory.
223        STP     q30, q31,  [x7]
224        ADD     x7, x7, x0              // c5 += cn_stride
225        STP     q28, q29, [x13]
226        ADD     x13, x13, x0            // c4 += cn_stride
227        STP     q26, q27, [x10]
228        ADD     x10, x10, x0            // c3 += cn_stride
229        STP     q24, q25, [x17]
230        ADD     x17, x17, x0            // c2 += cn_stride
231        STP     q22, q23, [x16]
232        ADD     x16, x16, x0            // c1 += cn_stride
233        STP     q20, q21,  [x6]
234        ADD     x6,  x6, x0             // c0 += cn_stride
235
        // Rewind the A pointer array so the next column block walks the same
        // ks bytes of pointers again.
236        SUB     x4, x4, x3              // a -= ks
237
238        # nc loop
239        B.HI    0b                      // flags from SUBS x1, x1, 8: columns remain
240
241        # Restore x20,x21,x22,x23 from stack
242        LDP     x22, x23, [sp, 16]
243        LDP     x20, x21, [sp], 32      // post-increment undoes the 32-byte save area
244        RET
245
246        # Remainder- 1 float of A (4 bytes)
        // Reads 4 bytes from every A row and 32 bytes from w, then rejoins
        // the ks loop at label 3.
2474:
248        LDR     s0, [x14], 4
249        LDP     q16,  q17, [x5], 32
250        LDR     s1, [x15], 4
251        LDR     s2, [x20], 4
252        LDR     s3, [x21], 4
253        LDR     s4, [x22], 4
254        LDR     s5, [x23], 4
255        FMLA    v20.4s, v16.4s,  v0.s[0]
256        FMLA    v22.4s, v16.4s,  v1.s[0]
257        FMLA    v24.4s, v16.4s,  v2.s[0]
258        FMLA    v26.4s, v16.4s,  v3.s[0]
259        FMLA    v28.4s, v16.4s,  v4.s[0]
260        FMLA    v30.4s, v16.4s,  v5.s[0]
261        FMLA    v21.4s, v17.4s,  v0.s[0]
262        FMLA    v23.4s, v17.4s,  v1.s[0]
263        FMLA    v25.4s, v17.4s,  v2.s[0]
264        FMLA    v27.4s, v17.4s,  v3.s[0]
265        FMLA    v29.4s, v17.4s,  v4.s[0]
266        FMLA    v31.4s, v17.4s,  v5.s[0]
267        B       3b
268
269        # Store odd width
        // x1 holds nc - 8 here; subtracting 8 leaves the low three bits
        // unchanged, so bits 2, 1 and 0 of x1 select the 4-, 2- and 1-column
        // stores for the remaining columns. After each partial store the
        // not-yet-written lanes are moved down to the bottom of the register.
2705:
271        TBZ     x1, 2, 6f               // no group of 4 columns
272        STR     q30,  [x7], 16
273        MOV     v30.16b, v31.16b
274        STR     q28, [x13], 16
275        MOV     v28.16b, v29.16b
276        STR     q26, [x10], 16
277        MOV     v26.16b, v27.16b
278        STR     q24, [x17], 16
279        MOV     v24.16b, v25.16b
280        STR     q22, [x16], 16
281        MOV     v22.16b, v23.16b
282        STR     q20,  [x6], 16
283        MOV     v20.16b, v21.16b
2846:
285        TBZ     x1, 1, 7f               // no group of 2 columns
286        STR     d30,  [x7], 8
287        STR     d28, [x13], 8
288        DUP     d30, v30.d[1]           // move the upper 64 bits down
289        DUP     d28, v28.d[1]
290        STR     d26, [x10], 8
291        STR     d24, [x17], 8
292        DUP     d26, v26.d[1]
293        DUP     d24, v24.d[1]
294        STR     d22, [x16], 8
295        STR     d20,  [x6], 8
296        DUP     d22, v22.d[1]
297        DUP     d20, v20.d[1]
298
2997:
300        TBZ     x1, 0, 8f               // no single trailing column
301        STR     s30,  [x7]
302        STR     s28, [x13]
303        STR     s26, [x10]
304        STR     s24, [x17]
305        STR     s22, [x16]
306        STR     s20,  [x6]
3078:
308        # Restore x20,x21,x22,x23 from stack
309        LDP     x22, x23, [sp, 16]
310        LDP     x20, x21, [sp], 32      // post-increment undoes the 32-byte save area
311        RET
312
313END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64
314
315#ifdef __ELF__
// ELF targets only: emit an empty .note.GNU-stack section. By GNU toolchain
// convention this marks the object as not requiring an executable stack.
316.section ".note.GNU-stack","",%progbits
317#endif
318