xref: /aosp_15_r20/external/XNNPACK/src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-ld128.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/f32-igemm/6x8-aarch64-neonfma-ld128.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128(
13#     size_t mr,                         x0
14#     size_t nc,                         x1
15#     size_t kc,                         x2 / x0
16#     size_t ks,                         x3 / x9
17#     const float**restrict a,           x4
18#     const void*restrict w,             x5
19#     uint8_t*restrict c,                x6
20#     size_t cm_stride,                  x7
21#     size_t cn_stride,                  [sp] -> (x0)
22#     size_t a_offset,                   [sp + 8] -> x11
23#     const float* zero,                 [sp + 16] -> x12
24#     const xnn_f32_minmax_params params [sp + 24] -> x8
25
26# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
27
28# A pointers
29# x14 a0
30# x15 a1
31# x20 a2
32# x21 a3
33# x22 a4
34# x23 a5
35
36# C pointers
37#  x6 c0
38# x16 c1
39# x17 c2
40# x10 c3
41# x13 c4
42#  x7 c5
43
44# Vector register usage
45# A0   v0
46# A1   v1
47# A2   v2
48# A3   v3
49# A4   v4
50# A5   v5
51# B   v16 v17 v18 v19
52# C   v20 v21
53# C   v22 v23
54# C   v24 v25
55# C   v26 v27
56# C   v28 v29
57# C   v30 v31
58# Clamp v6 v7
59# unused A   v8 v9 v10 v11
60# unused B   v12 v13 v14 v15
61
62BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128
        # Overview: for each block of 8 output columns (label 0) the 12
        # accumulators v20-v31 start from the 8 floats read at w.  Each group of
        # 6 A pointers read through x4 (label 1) then feeds FMLA updates against
        # B streamed from w.  The result is clamped to [v6, v7] and stored
        # through c0-c5.
        # Stack: 32 bytes are pushed to save x20-x23, so cn_stride and a_offset
        # are read at [sp, 32] and [sp, 40] rather than at their entry offsets.
        # NOTE(review): the remainder path at label 4 only tests bits 3 and 2 of
        # k, so kc is presumably a nonzero multiple of 4 bytes - confirm against
        # the caller contract.
63
64        # Load zero, params pointer
65        LDP     x12, x8, [sp, 16]       // entry offsets: sp has not been moved yet
66
67        # Clamp C pointers
68        CMP     x0, 2                   // if mr < 2
69        ADD     x16, x6, x7             // c1 = c0 + cm_stride
70        CSEL    x16, x6, x16, LO        //   c1 = c0
71
72        # Load min/max values: v6 = min, v7 = max, each replicated to all 4 lanes
73        LD2R    {v6.4s, v7.4s}, [x8]
74
75        ADD     x17, x16, x7            // c2 = c1 + cm_stride
76                                        // if mr <= 2 (flags still from CMP x0, 2)
77        CSEL    x17, x16, x17, LS       //   c2 = c1
78
79        # Save x20,x21,x22,x23 on stack
80        STP     x20, x21, [sp, -32]!    // sp -= 32; stack arguments are now 32 bytes further away
81
82        CMP     x0, 4                   // if mr < 4
83        ADD     x10, x17, x7            // c3 = c2 + cm_stride
84        CSEL    x10, x17, x10, LO       //   c3 = c2
85
86        STP     x22, x23, [sp, 16]
87
88        ADD     x13, x10, x7            // c4 = c3 + cm_stride
89                                        // if mr <= 4 (flags still from CMP x0, 4)
90        CSEL    x13, x10, x13, LS       //   c4 = c3
91
92        # Load a_offset
93        LDR     x11, [sp, 40]           // entry [sp + 8] plus the 32 bytes pushed above
94
95        CMP     x0, 6                   // if mr < 6
96        ADD     x7, x13, x7             // c5 = c4 + cm_stride
97        CSEL    x7, x13, x7, LO         //   c5 = c4
98
990:
        # nc loop: one pass per block of 8 output columns
100        # Load initial bias from w into accumulators
101        LDP     q20, q21, [x5], 32      // row 0 = 8 bias floats; w += 32
102        MOV     v22.16b, v20.16b        // rows 1-5 start from the same bias
103        MOV     v23.16b, v21.16b
104        MOV     v24.16b, v20.16b
105        MOV     v25.16b, v21.16b
106        MOV     v26.16b, v20.16b
107        MOV     v27.16b, v21.16b
108        MOV     v28.16b, v20.16b
109        MOV     v29.16b, v21.16b
110        MOV     v30.16b, v20.16b
111        MOV     v31.16b, v21.16b
112
113        MOV     x9, x3                  // p = ks
114
1151:
        # ks loop: one pass per group of 6 A pointers
116        # Load next 6 A pointers
117        LDP     x14, x15, [x4], 16
118        LDP     x20, x21, [x4], 16
119        LDP     x22, x23, [x4], 16
120
        # A pointer equal to zero is used as is; any other pointer gets a_offset added
121        CMP     x14, x12                // if a0 == zero
122        ADD     x14, x14, x11           // a0 += a_offset
123        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
124        CMP     x15, x12                // if a1 == zero
125        ADD     x15, x15, x11           // a1 += a_offset
126        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
127        CMP     x20, x12                // if a2 == zero
128        ADD     x20, x20, x11           // a2 += a_offset
129        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
130        CMP     x21, x12                // if a3 == zero
131        ADD     x21, x21, x11           // a3 += a_offset
132        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
133        CMP     x22, x12                // if a4 == zero
134        ADD     x22, x22, x11           // a4 += a_offset
135        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
136        CMP     x23, x12                // if a5 == zero
137        ADD     x23, x23, x11           // a5 += a_offset
138        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset
139
140        # Are there at least 4 floats (16 bytes)?
141        SUBS    x0, x2, 16              // k = kc - 16 (x0 is reused as the k counter)
142        B.LO    4f                      // kc < 16: remainder handling only
143
144        # Main loop - 4 floats of A (16 bytes)
145        # 48 FMA + 6 ld128 A + 4 LDP B
1462:
147        LDP     q16,  q17, [x5], 32     // B for lane 0 of A (8 floats)
148        LDR     q0, [x14], 16           // 4 floats from each of the 6 A rows
149        LDR     q1, [x15], 16
150        LDR     q2, [x20], 16
151        LDR     q3, [x21], 16
152        LDR     q4, [x22], 16
153        LDR     q5, [x23], 16
154        FMLA    v20.4s, v16.4s,  v0.s[0]
155        FMLA    v22.4s, v16.4s,  v1.s[0]
156        FMLA    v24.4s, v16.4s,  v2.s[0]
157        FMLA    v26.4s, v16.4s,  v3.s[0]
158        LDP     q18,  q19, [x5], 32     // B for lane 1 of A
159        FMLA    v28.4s, v16.4s,  v4.s[0]
160        FMLA    v30.4s, v16.4s,  v5.s[0]
161        FMLA    v21.4s, v17.4s,  v0.s[0]
162        FMLA    v23.4s, v17.4s,  v1.s[0]
163        FMLA    v25.4s, v17.4s,  v2.s[0]
164        FMLA    v27.4s, v17.4s,  v3.s[0]
165        FMLA    v29.4s, v17.4s,  v4.s[0]
166        FMLA    v31.4s, v17.4s,  v5.s[0]
167
168        FMLA    v20.4s, v18.4s,  v0.s[1]
169        LDP     q16,  q17, [x5], 32     // B for lane 2 of A (v16/v17 reloaded)
170        FMLA    v22.4s, v18.4s,  v1.s[1]
171        FMLA    v24.4s, v18.4s,  v2.s[1]
172        FMLA    v26.4s, v18.4s,  v3.s[1]
173        FMLA    v28.4s, v18.4s,  v4.s[1]
174        FMLA    v30.4s, v18.4s,  v5.s[1]
175        FMLA    v21.4s, v19.4s,  v0.s[1]
176        FMLA    v23.4s, v19.4s,  v1.s[1]
177        FMLA    v25.4s, v19.4s,  v2.s[1]
178        FMLA    v27.4s, v19.4s,  v3.s[1]
179        FMLA    v29.4s, v19.4s,  v4.s[1]
180        FMLA    v31.4s, v19.4s,  v5.s[1]
181
182        FMLA    v20.4s, v16.4s,  v0.s[2]
183        LDP     q18,  q19, [x5], 32     // B for lane 3 of A (v18/v19 reloaded)
184        FMLA    v22.4s, v16.4s,  v1.s[2]
185        FMLA    v24.4s, v16.4s,  v2.s[2]
186        FMLA    v26.4s, v16.4s,  v3.s[2]
187        FMLA    v28.4s, v16.4s,  v4.s[2]
188        FMLA    v30.4s, v16.4s,  v5.s[2]
189        FMLA    v21.4s, v17.4s,  v0.s[2]
190        FMLA    v23.4s, v17.4s,  v1.s[2]
191        FMLA    v25.4s, v17.4s,  v2.s[2]
192        FMLA    v27.4s, v17.4s,  v3.s[2]
193        FMLA    v29.4s, v17.4s,  v4.s[2]
194        FMLA    v31.4s, v17.4s,  v5.s[2]
195
196        FMLA    v20.4s, v18.4s,  v0.s[3]
197        FMLA    v22.4s, v18.4s,  v1.s[3]
198        FMLA    v24.4s, v18.4s,  v2.s[3]
199        FMLA    v26.4s, v18.4s,  v3.s[3]
200        FMLA    v28.4s, v18.4s,  v4.s[3]
201        FMLA    v30.4s, v18.4s,  v5.s[3]
202        FMLA    v21.4s, v19.4s,  v0.s[3]
203        FMLA    v23.4s, v19.4s,  v1.s[3]
204        FMLA    v25.4s, v19.4s,  v2.s[3]
205        FMLA    v27.4s, v19.4s,  v3.s[3]
206        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below
207        FMLA    v29.4s, v19.4s,  v4.s[3]
208        FMLA    v31.4s, v19.4s,  v5.s[3]
209        B.HS    2b                      // at least 16 more bytes of A: loop again
210
211        # Is there a remainder?- 2 floats of A (8 bytes) or less
212        TST     x0, 15                  // low 4 bits of k equal kc % 16
213        B.NE    4f
214
2153:
216        # ks loop
217        SUBS    x9, x9, 48              // p -= MR * sizeof(void*) (6 pointers * 8 bytes)
218        B.HI    1b
219
220        # Clamp
221        FMAX    v20.4s, v20.4s, v6.4s   // lower bound: max(acc, v6)
222        # Load cn_stride
223        LDR     x0, [sp, 32]            // entry [sp] plus the 32 bytes pushed in the prologue
224        FMAX    v21.4s, v21.4s, v6.4s
225        FMAX    v22.4s, v22.4s, v6.4s
226        FMAX    v23.4s, v23.4s, v6.4s
227        FMAX    v24.4s, v24.4s, v6.4s
228        FMAX    v25.4s, v25.4s, v6.4s
229        FMAX    v26.4s, v26.4s, v6.4s
230        FMAX    v27.4s, v27.4s, v6.4s
231        FMAX    v28.4s, v28.4s, v6.4s
232        FMAX    v29.4s, v29.4s, v6.4s
233        FMAX    v30.4s, v30.4s, v6.4s
234        FMAX    v31.4s, v31.4s, v6.4s
235        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO below and B.HI after the stores
236        FMIN    v20.4s, v20.4s, v7.4s   // upper bound: min(acc, v7)
237        FMIN    v21.4s, v21.4s, v7.4s
238        FMIN    v22.4s, v22.4s, v7.4s
239        FMIN    v23.4s, v23.4s, v7.4s
240        FMIN    v24.4s, v24.4s, v7.4s
241        FMIN    v25.4s, v25.4s, v7.4s
242        FMIN    v26.4s, v26.4s, v7.4s
243        FMIN    v27.4s, v27.4s, v7.4s
244        FMIN    v28.4s, v28.4s, v7.4s
245        FMIN    v29.4s, v29.4s, v7.4s
246        FMIN    v30.4s, v30.4s, v7.4s
247        FMIN    v31.4s, v31.4s, v7.4s
248
249        # Store full 6 x 8
250        B.LO    6f                      // fewer than 8 columns were left: partial store
251
252        STP     q30, q31,  [x7]         // row 5 first, row 0 last
253        ADD     x7, x7, x0              // c5 += cn_stride
254        STP     q28, q29, [x13]
255        ADD     x13, x13, x0            // c4 += cn_stride
256        STP     q26, q27, [x10]
257        ADD     x10, x10, x0            // c3 += cn_stride
258        STP     q24, q25, [x17]
259        ADD     x17, x17, x0            // c2 += cn_stride
260        STP     q22, q23, [x16]
261        ADD     x16, x16, x0            // c1 += cn_stride
262        STP     q20, q21,  [x6]
263        ADD     x6,  x6, x0             // c0 += cn_stride
264
265        SUB     x4, x4, x3              // a -= ks
266
267        # nc loop
268        B.HI    0b                      // flags from SUBS x1: taken while nc was above 8
269
270        # Restore x20,x21,x22,x23 from stack
271        LDP     x22, x23, [sp, 16]
272        LDP     x20, x21, [sp], 32      // sp += 32, back to its entry value
273        RET
274
2754:
276        # Is there a remainder?- 2 floats of A (8 bytes)
277        TBZ     x0, 3, 5f               // bit 3 of k clear: skip to the 1 float step
278
279        # Remainder- 2 floats of A (8 bytes)
280        LDR     d0, [x14], 8
281        LDP     q16,  q17, [x5], 32     // B for lane 0 of A
282        LDR     d1, [x15], 8
283        LDR     d2, [x20], 8
284        LDR     d3, [x21], 8
285        LDR     d4, [x22], 8
286        LDR     d5, [x23], 8
287        FMLA    v20.4s, v16.4s,  v0.s[0]
288        FMLA    v22.4s, v16.4s,  v1.s[0]
289        FMLA    v24.4s, v16.4s,  v2.s[0]
290        FMLA    v26.4s, v16.4s,  v3.s[0]
291        LDP     q18,  q19, [x5], 32     // B for lane 1 of A
292        FMLA    v28.4s, v16.4s,  v4.s[0]
293        FMLA    v30.4s, v16.4s,  v5.s[0]
294        FMLA    v21.4s, v17.4s,  v0.s[0]
295        FMLA    v23.4s, v17.4s,  v1.s[0]
296        FMLA    v25.4s, v17.4s,  v2.s[0]
297        FMLA    v27.4s, v17.4s,  v3.s[0]
298        FMLA    v29.4s, v17.4s,  v4.s[0]
299        FMLA    v31.4s, v17.4s,  v5.s[0]
300
301        FMLA    v20.4s, v18.4s,  v0.s[1]
302        FMLA    v22.4s, v18.4s,  v1.s[1]
303        FMLA    v24.4s, v18.4s,  v2.s[1]
304        FMLA    v26.4s, v18.4s,  v3.s[1]
305        FMLA    v28.4s, v18.4s,  v4.s[1]
306        FMLA    v30.4s, v18.4s,  v5.s[1]
307        FMLA    v21.4s, v19.4s,  v0.s[1]
308        FMLA    v23.4s, v19.4s,  v1.s[1]
309        FMLA    v25.4s, v19.4s,  v2.s[1]
310        FMLA    v27.4s, v19.4s,  v3.s[1]
311        FMLA    v29.4s, v19.4s,  v4.s[1]
312        FMLA    v31.4s, v19.4s,  v5.s[1]
313
314        # Is there a remainder?- 1 float of A (4 bytes)
315        TBZ     x0, 2, 3b               // bit 2 of k clear: nothing left, back to the ks loop
316
317        # Remainder- 1 float of A (4 bytes)
3185:
319        LDR     s0, [x14], 4
320        LDP     q16,  q17, [x5], 32     // B for the single remaining float of A
321        LDR     s1, [x15], 4
322        LDR     s2, [x20], 4
323        LDR     s3, [x21], 4
324        LDR     s4, [x22], 4
325        LDR     s5, [x23], 4
326        FMLA    v20.4s, v16.4s,  v0.s[0]
327        FMLA    v22.4s, v16.4s,  v1.s[0]
328        FMLA    v24.4s, v16.4s,  v2.s[0]
329        FMLA    v26.4s, v16.4s,  v3.s[0]
330        FMLA    v28.4s, v16.4s,  v4.s[0]
331        FMLA    v30.4s, v16.4s,  v5.s[0]
332        FMLA    v21.4s, v17.4s,  v0.s[0]
333        FMLA    v23.4s, v17.4s,  v1.s[0]
334        FMLA    v25.4s, v17.4s,  v2.s[0]
335        FMLA    v27.4s, v17.4s,  v3.s[0]
336        FMLA    v29.4s, v17.4s,  v4.s[0]
337        FMLA    v31.4s, v17.4s,  v5.s[0]
338        B       3b
339
340        # Store odd width
3416:
        # x1 holds nc - 8 here, so its low 3 bits equal those of the remaining nc
342        TBZ     x1, 2, 7f               // bit 2 clear: no 4 column store
343        STR     q30,  [x7], 16
344        MOV     v30.16b, v31.16b        // upper 4 columns become the next to store
345        STR     q28, [x13], 16
346        MOV     v28.16b, v29.16b
347        STR     q26, [x10], 16
348        MOV     v26.16b, v27.16b
349        STR     q24, [x17], 16
350        MOV     v24.16b, v25.16b
351        STR     q22, [x16], 16
352        MOV     v22.16b, v23.16b
353        STR     q20,  [x6], 16
354        MOV     v20.16b, v21.16b
3557:
356        TBZ     x1, 1, 8f               // bit 1 clear: no 2 column store
357        STR     d30,  [x7], 8
358        STR     d28, [x13], 8
359        DUP     d30, v30.d[1]           // upper 2 floats move to the low half
360        DUP     d28, v28.d[1]
361        STR     d26, [x10], 8
362        STR     d24, [x17], 8
363        DUP     d26, v26.d[1]
364        DUP     d24, v24.d[1]
365        STR     d22, [x16], 8
366        STR     d20,  [x6], 8
367        DUP     d22, v22.d[1]
368        DUP     d20, v20.d[1]
369
3708:
371        TBZ     x1, 0, 9f               // bit 0 clear: no single column store
372        STR     s30,  [x7]
373        STR     s28, [x13]
374        STR     s26, [x10]
375        STR     s24, [x17]
376        STR     s22, [x16]
377        STR     s20,  [x6]
3789:
379        # Restore x20,x21,x22,x23 from stack
380        LDP     x22, x23, [sp, 16]
381        LDP     x20, x21, [sp], 32      // sp += 32, back to its entry value
382        RET
383
384END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128
385
386#ifdef __ELF__
387.section ".note.GNU-stack","",%progbits
388#endif
389