xref: /aosp_15_r20/external/XNNPACK/src/f16-igemm/6x16-minmax-aarch64-neonfp16arith-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55(
9#     size_t mr,                         x0
10#     size_t nc,                         x1
11#     size_t kc,                         x2 / x0
12#     size_t ks,                         x3 / x9
13#     const void**restrict a,            x4
14#     const void*restrict w,             x5
15#     uint8_t*restrict c,                x6
16#     size_t cm_stride,                  x7
17#     size_t cn_stride,                  [sp] -> x8
18#     size_t a_offset,                   [sp + 8] -> x11
19#     const void* zero,                  [sp + 16] -> x12
20#     const xnn_f16_minmax_params params [sp + 24] -> (x8)
21
22# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
23
24# Register usage
25# A0 x14 v0
26# A1 x15 v1
27# A2 x20 v2
28# A3 x21 v3
29# A4 x22 v4
30# A5 x23 v5
31
32# B   x5 v16 v17 v18 v19
33
34# C0  x6  v20 v21
35# C1 x16  v22 v23
36# C2 x17  v24 v25
37# C3 x10  v26 v27
38# C4 x13  v28 v29
39# C5  x7  v30 v31
40
41# Clamp v6, (v4), (v5)
42# unused     v7
43# unused A   v8 v9 v10 v11
44# unused B   v12 v13 v14 v15
45
46BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55
        # Half-precision indirect GEMM microkernel with min/max clamping.
        # Each pass of the nc loop produces a tile of up to 6 rows x 16 columns:
        #   - all six accumulator rows start as the 16 bias values read from w,
        #   - each pass of the ks loop reads 6 A pointers through a and
        #     accumulates A * B over kc bytes with FMLA,
        #   - the result is clamped to the two halffloat bounds read from params
        #     and stored through the six C row pointers.
        # x0 arrives holding mr and is reused as the K counter once the C row
        # pointers have been set up.
        # NOTE(review): the FMLA/load interleaving is presumably scheduled for
        # Cortex-A55 (per the kernel name) - keep the instruction order when editing.
47
48        # Load zero (x12) and params pointer (x8) from the stack arguments
49        LDP     x12, x8, [sp, 16]
50
51        # Clamp C pointers: a row at index >= mr reuses the pointer of the row before it
52        CMP     x0, 2                   // if mr < 2
53        ADD     x16, x6, x7             // c1 = c0 + cm_stride
54        CSEL    x16, x6, x16, LO        //   c1 = c0
55        ADD     x17, x16, x7            // c2 = c1 + cm_stride
56                                        // if mr <= 2
57        CSEL    x17, x16, x17, LS       //   c2 = c1
58
59        # Load params: 32 bits, i.e. the two halffloats v6.h[0] and v6.h[1]
60        LDR     s6, [x8]
61
62        CMP     x0, 4                   // if mr < 4
63        ADD     x10, x17, x7            // c3 = c2 + cm_stride
64        CSEL    x10, x17, x10, LO       //   c3 = c2
65        ADD     x13, x10, x7            // c4 = c3 + cm_stride
66                                        // if mr <= 4
67        CSEL    x13, x10, x13, LS       //   c4 = c3
68        CMP     x0, 6                   // if mr < 6
69        ADD     x7, x13, x7             // c5 = c4 + cm_stride
70        CSEL    x7, x13, x7, LO         //   c5 = c4
71
72        LDP     x8, x11, [sp]           // load cn_stride, a_offset (x8 no longer holds params)
73
74        # Save x20-x23 on stack (callee-saved; used below as the A2-A5 pointers).
75        STP     x20, x21, [sp, -32]!
76        STP     x22, x23, [sp, 16]
77
        # nc loop: one pass per block of up to 16 output columns
780:
79        # Load initial bias from w into accumulators (rows 1-5 start as copies of row 0)
80        LDP     q20, q21, [x5], 32
81        MOV     x9, x3                  // p = ks
82        MOV     v22.16b, v20.16b
83        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
84        MOV     v23.16b, v21.16b
85        PRFM    PLDL1KEEP, [x5, 64]
86        MOV     v24.16b, v20.16b
87        PRFM    PLDL1KEEP, [x5, 128]
88        MOV     v25.16b, v21.16b
89        PRFM    PLDL1KEEP, [x5, 192]
90        MOV     v26.16b, v20.16b
91        PRFM    PLDL1KEEP, [x5, 256]
92        MOV     v27.16b, v21.16b
93        PRFM    PLDL1KEEP, [x5, 320]
94        MOV     v28.16b, v20.16b
95        MOV     v29.16b, v21.16b
96        MOV     v30.16b, v20.16b
97        MOV     v31.16b, v21.16b
98
        # ks loop: one pass per group of 6 A pointers
991:
100        # Load next 6 A pointers from a (x4 advances by 48 bytes in total)
101        LDP     x14, x15, [x4], 16
102        LDP     x20, x21, [x4], 16
103        LDP     x22, x23, [x4], 16
104
        # An A pointer equal to zero is used as is; any other gets a_offset added.
105        CMP     x14, x12                // if a0 == zero
106        ADD     x14, x14, x11           // a0 += a_offset
107        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 += a_offset
108        CMP     x15, x12                // if a1 == zero
109        ADD     x15, x15, x11           // a1 += a_offset
110        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 += a_offset
111        CMP     x20, x12                // if a2 == zero
112        ADD     x20, x20, x11           // a2 += a_offset
113        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 += a_offset
114        CMP     x21, x12                // if a3 == zero
115        ADD     x21, x21, x11           // a3 += a_offset
116        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 += a_offset
117        CMP     x22, x12                // if a4 == zero
118        ADD     x22, x22, x11           // a4 += a_offset
119        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 += a_offset
120        CMP     x23, x12                // if a5 == zero
121        ADD     x23, x23, x11           // a5 += a_offset
122        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 += a_offset
123
124        # Are there at least 2 halffloats (4 bytes)?
125        SUBS    x0, x2, 4               // k = kc - 4 (overwrites mr, which is no longer needed)
126        B.LO    5f                      // kc < 4 bytes: go straight to the remainder
127
128        # Prologue - load 4 A (A0-A3, 2 halffloats each) and 2 B (16 halffloats)
129        LDR     s0, [x14], 4              // A0
130        LDR     q16, [x5], 16             // B
131        LDR     q17, [x5], 16             // B
132        LDR     s1, [x15], 4              // A1
133        LDR     s2, [x20], 4              // A2
134        LDR     s3, [x21], 4              // A3
135
136        # Are there at least 2 more halffloats for the main loop?
137        SUBS    x0, x0, 4
138        B.LO    3f
139
140       .p2align 3
141        # Main loop - 2 halffloats of A (4 bytes)
142        # 24 FMA + 6 ld32 A + 4 LDR B
        # v16/v17 hold B for the first K step and v18/v19 for the second. B for
        # the following step is fetched as 64-bit halves (LDR d, then LD1 into
        # lane d[1]) in between the FMLAs. A4/A5 are loaded at the top of the
        # body, A0-A3 for the next pass at the bottom.
1432:
144        FMLA    v20.8h, v16.8h,  v0.h[0]
145        LDR     s4, [x22], 4              // A4
146        FMLA    v21.8h, v17.8h,  v0.h[0]
147        LDR     s5, [x23], 4              // A5
148        FMLA    v22.8h, v16.8h,  v1.h[0]
149        LDR     d18, [x5], 8              // B0
150        FMLA    v23.8h, v17.8h,  v1.h[0]
151        LD1     {v18.d}[1], [x5], 8       // B1
152        FMLA    v24.8h, v16.8h,  v2.h[0]
153        LDR     d19, [x5], 8              // B2
154        FMLA    v25.8h, v17.8h,  v2.h[0]
155        LD1     {v19.d}[1], [x5], 8       // B3
156        FMLA    v26.8h, v16.8h,  v3.h[0]
157        FMLA    v27.8h, v17.8h,  v3.h[0]
158        FMLA    v28.8h, v16.8h,  v4.h[0]
159        FMLA    v29.8h, v17.8h,  v4.h[0]
160        FMLA    v30.8h, v16.8h,  v5.h[0]
161        FMLA    v31.8h, v17.8h,  v5.h[0]
162        SUBS    x0, x0, 4                 // k -= 4; flags are consumed by B.HS at the loop end
163
164        FMLA    v20.8h, v18.8h,  v0.h[1]
165        LDR     d16, [x5], 8              // B0
166        FMLA    v21.8h, v19.8h,  v0.h[1]
167        LD1     {v16.d}[1], [x5], 8       // B1
168        FMLA    v22.8h, v18.8h,  v1.h[1]
169        LDR     d17, [x5], 8              // B2
170        FMLA    v23.8h, v19.8h,  v1.h[1]
171        LD1     {v17.d}[1], [x5], 8       // B3
172        FMLA    v24.8h, v18.8h,  v2.h[1]
173        FMLA    v25.8h, v19.8h,  v2.h[1]
174        FMLA    v26.8h, v18.8h,  v3.h[1]
175        FMLA    v27.8h, v19.8h,  v3.h[1]
176        LDR     s0,  [x14], 4             // A0
177        FMLA    v28.8h, v18.8h,  v4.h[1]
178        LDR     s1,  [x15], 4             // A1
179        FMLA    v29.8h, v19.8h,  v4.h[1]
180        LDR     s2, [x20], 4              // A2
181        FMLA    v30.8h, v18.8h,  v5.h[1]
182        LDR     s3, [x21], 4              // A3
183        FMLA    v31.8h, v19.8h,  v5.h[1]
184        B.HS    2b                        // repeat while the SUBS above did not borrow
185
186        # Epilogue - same as main loop but no loads for next loop
        # (A4/A5 and the second B pair v18/v19 are still loaded here.)
1873:
188        FMLA    v20.8h, v16.8h,  v0.h[0]
189        LDR     s4, [x22], 4              // A4
190        FMLA    v21.8h, v17.8h,  v0.h[0]
191        LDR     s5, [x23], 4              // A5
192        FMLA    v22.8h, v16.8h,  v1.h[0]
193        LDR     d18, [x5], 8              // B0
194        FMLA    v23.8h, v17.8h,  v1.h[0]
195        LD1     {v18.d}[1], [x5], 8       // B1
196        FMLA    v24.8h, v16.8h,  v2.h[0]
197        LDR     d19, [x5], 8              // B2
198        FMLA    v25.8h, v17.8h,  v2.h[0]
199        LD1     {v19.d}[1], [x5], 8       // B3
200        FMLA    v26.8h, v16.8h,  v3.h[0]
201        FMLA    v27.8h, v17.8h,  v3.h[0]
202        FMLA    v28.8h, v16.8h,  v4.h[0]
203        FMLA    v29.8h, v17.8h,  v4.h[0]
204        FMLA    v30.8h, v16.8h,  v5.h[0]
205        FMLA    v31.8h, v17.8h,  v5.h[0]
206
207        FMLA    v20.8h, v18.8h,  v0.h[1]
208        FMLA    v21.8h, v19.8h,  v0.h[1]
209        FMLA    v22.8h, v18.8h,  v1.h[1]
210        FMLA    v23.8h, v19.8h,  v1.h[1]
211        FMLA    v24.8h, v18.8h,  v2.h[1]
212        FMLA    v25.8h, v19.8h,  v2.h[1]
213        FMLA    v26.8h, v18.8h,  v3.h[1]
214        FMLA    v27.8h, v19.8h,  v3.h[1]
215        FMLA    v28.8h, v18.8h,  v4.h[1]
216        FMLA    v29.8h, v19.8h,  v4.h[1]
217        FMLA    v30.8h, v18.8h,  v5.h[1]
218        FMLA    v31.8h, v19.8h,  v5.h[1]
219
220        # Is there a remainder? - 1 halffloat of A (2 bytes)
        # Only multiples of 4 were subtracted from kc, so bit 1 of k is bit 1 of kc.
221        TBNZ    x0, 1, 5f
222
2234:
224        # ks loop
225        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
226        B.HI    1b
227
228        # Clamp: v4 = broadcast of v6.h[0] (lower bound, applied with FMAX),
        #        v5 = broadcast of v6.h[1] (upper bound, applied with FMIN).
        # v4/v5 are free for this because their A4/A5 values are fully consumed.
229        DUP     v4.8h, v6.h[0]
230        DUP     v5.8h, v6.h[1]
231        FMAX    v20.8h, v20.8h, v4.8h
232        FMAX    v21.8h, v21.8h, v4.8h
233        FMAX    v22.8h, v22.8h, v4.8h
234        FMAX    v23.8h, v23.8h, v4.8h
235        FMAX    v24.8h, v24.8h, v4.8h
236        FMAX    v25.8h, v25.8h, v4.8h
237        FMAX    v26.8h, v26.8h, v4.8h
238        FMAX    v27.8h, v27.8h, v4.8h
239        FMAX    v28.8h, v28.8h, v4.8h
240        FMAX    v29.8h, v29.8h, v4.8h
241        FMAX    v30.8h, v30.8h, v4.8h
242        FMAX    v31.8h, v31.8h, v4.8h
243        SUBS    x1, x1, 16              // nc -= 16; flags are used by B.LO and B.HI below
244        FMIN    v20.8h, v20.8h, v5.8h
245        FMIN    v21.8h, v21.8h, v5.8h
246        FMIN    v22.8h, v22.8h, v5.8h
247        FMIN    v23.8h, v23.8h, v5.8h
248        FMIN    v24.8h, v24.8h, v5.8h
249        FMIN    v25.8h, v25.8h, v5.8h
250        FMIN    v26.8h, v26.8h, v5.8h
251        FMIN    v27.8h, v27.8h, v5.8h
252        FMIN    v28.8h, v28.8h, v5.8h
253        FMIN    v29.8h, v29.8h, v5.8h
254        FMIN    v30.8h, v30.8h, v5.8h
255        FMIN    v31.8h, v31.8h, v5.8h
256
257        # Store full 6 x 16
258        B.LO    6f                      // fewer than 16 columns were left: partial store
259
        # Each C row pointer is post-incremented by cn_stride (x8).
260        ST1     {v30.16b, v31.16b},  [x7], x8
261        ST1     {v28.16b, v29.16b}, [x13], x8
262        ST1     {v26.16b, v27.16b}, [x10], x8
263        ST1     {v24.16b, v25.16b}, [x17], x8
264        ST1     {v22.16b, v23.16b}, [x16], x8
265        ST1     {v20.16b, v21.16b},  [x6], x8
266
267        SUB     x4, x4, x3              // a -= ks
268
269        # nc loop
270        B.HI    0b                      // columns remain (flags still from SUBS x1 above)
271
272        # Restore x20-x23 from stack
273        LDP     x22, x23, [sp, 16]
274        LDP     x20, x21, [sp], 32
275        RET
276
2775:
278        # Remainder- 1 halffloat of A (2 bytes)
        # Reached either directly when kc < 4 bytes or after the epilogue; loads
        # one halffloat per A row plus 16 halffloats of B, then rejoins the ks loop.
279        LDR     h0, [x14], 2              // A0
280        LDR     q16, [x5], 16             // B
281        LDR     q17, [x5], 16             // B
282        FMLA    v20.8h, v16.8h,  v0.h[0]
283        LDR     h1, [x15], 2              // A1
284        FMLA    v21.8h, v17.8h,  v0.h[0]
285        LDR     h2, [x20], 2              // A2
286        FMLA    v22.8h, v16.8h,  v1.h[0]
287        LDR     h3, [x21], 2              // A3
288        FMLA    v23.8h, v17.8h,  v1.h[0]
289        LDR     h4,  [x22], 2             // A4
290        FMLA    v24.8h, v16.8h,  v2.h[0]
291        LDR     h5,  [x23], 2             // A5
292        FMLA    v25.8h, v17.8h,  v2.h[0]
293        FMLA    v26.8h, v16.8h,  v3.h[0]
294        FMLA    v27.8h, v17.8h,  v3.h[0]
295        FMLA    v28.8h, v16.8h,  v4.h[0]
296        FMLA    v29.8h, v17.8h,  v4.h[0]
297        FMLA    v30.8h, v16.8h,  v5.h[0]
298        FMLA    v31.8h, v17.8h,  v5.h[0]
299        B       4b
300
301        # Store odd width
        # x1 holds nc - 16 here, so its low 4 bits equal the number of columns
        # left (fewer than 16). Bits 3..0 select stores of 8, 4, 2 and 1
        # halffloats per row; after each store the unwritten elements are moved
        # down to the bottom of the row register.
3026:
303        TBZ     x1, 3, 7f                 // 8 columns?
304        STR     q30,  [x7], 16
305        MOV     v30.16b, v31.16b          // second vector of the row becomes the pending data
306        STR     q28, [x13], 16
307        MOV     v28.16b, v29.16b
308        STR     q26, [x10], 16
309        MOV     v26.16b, v27.16b
310        STR     q24, [x17], 16
311        MOV     v24.16b, v25.16b
312        STR     q22, [x16], 16
313        MOV     v22.16b, v23.16b
314        STR     q20,  [x6], 16
315        MOV     v20.16b, v21.16b
3167:
317        TBZ     x1, 2, 8f                 // 4 columns?
318        STR     d30,  [x7], 8
319        STR     d28, [x13], 8
320        DUP     d30, v30.d[1]             // upper 64 bits move down to the low half
321        DUP     d28, v28.d[1]
322        STR     d26, [x10], 8
323        STR     d24, [x17], 8
324        DUP     d26, v26.d[1]
325        DUP     d24, v24.d[1]
326        STR     d22, [x16], 8
327        STR     d20,  [x6], 8
328        DUP     d22, v22.d[1]
329        DUP     d20, v20.d[1]
330
3318:
332        TBZ     x1, 1, 9f                 // 2 columns?
333        STR     s30,  [x7], 4
334        STR     s28, [x13], 4
335        DUP     s30, v30.s[1]             // next 32 bits move down to the low word
336        DUP     s28, v28.s[1]
337        STR     s26, [x10], 4
338        STR     s24, [x17], 4
339        DUP     s26, v26.s[1]
340        DUP     s24, v24.s[1]
341        STR     s22, [x16], 4
342        STR     s20,  [x6], 4
343        DUP     s22, v22.s[1]
344        DUP     s20, v20.s[1]
345
3469:
347        TBZ     x1, 0, 10f                // 1 column?
348        STR     h30,  [x7]
349        STR     h28, [x13]
350        STR     h26, [x10]
351        STR     h24, [x17]
352        STR     h22, [x16]
353        STR     h20,  [x6]
35410:
355        # Restore x20-x23 from stack
356        LDP     x22, x23, [sp, 16]
357        LDP     x20, x21, [sp], 32
358        RET
359
360END_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55
361
362#ifdef __ELF__
// Empty .note.GNU-stack section: tells GNU-style linkers that this object does
// not need an executable stack.
363.section ".note.GNU-stack","",%progbits
364#endif
365