xref: /aosp_15_r20/external/XNNPACK/src/f16-igemm/6x16-minmax-aarch64-neonfp16arith-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
9#     size_t mr,                         x0
10#     size_t nc,                         x1
11#     size_t kc,                         x2 / x0
12#     size_t ks,                         x3 / x9
13#     const void**restrict a,            x4
14#     const void*restrict w,             x5
15#     uint8_t*restrict c,                x6
16#     size_t cm_stride,                  x7
17#     size_t cn_stride,                  [sp] -> x8
18#     size_t a_offset,                   [sp + 8] -> x11
19#     const void* zero,                  [sp + 16] -> x12
20#     const xnn_f16_minmax_params params [sp + 24] -> (x8)
21
22# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
23
24# Register usage
25# A0 x14 v0
26# A1 x15 v1
27# A2 x20 v2
28# A3 x21 v3
29# A4 x22 v4
30# A5 x23 v5
31
32# B   x5 v16 v17 v18 v19
33
34# C0  x6  v20 v21
35# C1 x16  v22 v23
36# C2 x17  v24 v25
37# C3 x10  v26 v27
38# C4 x13  v28 v29
39# C5  x7  v30 v31
40
41# Clamp v6, (v4), (v5)
42# unused     v7
43# unused A   v8 v9 v10 v11
44# unused B   v12 v13 v14 v15
45
# Computes one tile of up to 6 rows of half-precision outputs, 16 columns at
# a time. Accumulators v20-v31 start from the bias read from w, receive FMLA
# updates from six A rows reached through the pointer array a, are clamped
# with the two halfwords read from params, and are stored through six C row
# pointers.
#
# Local labels:
#   label 0     - start of a 16-column tile (bias load, p = ks)
#   label 1     - load and adjust the next 6 A pointers
#   label 2     - main loop, 4 halffloats of every A row per iteration
#   label 3     - epilogue, final group of 4 halffloats
#   label 4     - ks loop test, then clamp and store
#   label 5, 6  - kc remainder of 2 halffloats, then of 1 halffloat
#   label 7-11  - partial-width store for the last 8/4/2/1 columns
46BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75
47
48        # Load zero (x12) and params pointer (x8) from the stack arguments
49        LDP     x12, x8, [sp, 16]
50
51        # Clamp C pointers: a row at or beyond mr reuses the pointer of the row before it
52        CMP     x0, 2                   // if mr < 2
53        ADD     x16, x6, x7             // c1 = c0 + cm_stride
54        CSEL    x16, x6, x16, LO        //   c1 = c0
55        ADD     x17, x16, x7            // c2 = c1 + cm_stride
56                                        // if mr <= 2
57        CSEL    x17, x16, x17, LS       //   c2 = c1
58
59        # Load params: 32 bits into v6, i.e. halfwords h[0] and h[1] used by the clamp below
60        LDR     s6, [x8]
61
62        CMP     x0, 4                   // if mr < 4
63        ADD     x10, x17, x7            // c3 = c2 + cm_stride
64        CSEL    x10, x17, x10, LO       //   c3 = c2
65        ADD     x13, x10, x7            // c4 = c3 + cm_stride
66                                        // if mr <= 4
67        CSEL    x13, x10, x13, LS       //   c4 = c3
68        CMP     x0, 6                   // if mr < 6
69        ADD     x7, x13, x7             // c5 = c4 + cm_stride
70        CSEL    x7, x13, x7, LO         //   c5 = c4
71
72        LDP     x8, x11, [sp]           // load cn_stride, a_offset (x8 is reused; params are already in v6)
73
74        # Save x20-x23 on stack (callee-saved; used below as the A2-A5 pointers).
        # All stack arguments were read above, before sp is moved.
75        STP     x20, x21, [sp, -32]!
76        STP     x22, x23, [sp, 16]
77
780:
79        # Load initial bias from w into accumulators (rows 1-5 copy the row 0 bias)
80        LDP     q20, q21, [x5], 32
81        MOV     x9, x3                  // p = ks
82        MOV     v22.16b, v20.16b
83        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
84        MOV     v23.16b, v21.16b
85        PRFM    PLDL1KEEP, [x5, 64]
86        MOV     v24.16b, v20.16b
87        PRFM    PLDL1KEEP, [x5, 128]
88        MOV     v25.16b, v21.16b
89        PRFM    PLDL1KEEP, [x5, 192]
90        MOV     v26.16b, v20.16b
91        PRFM    PLDL1KEEP, [x5, 256]
92        MOV     v27.16b, v21.16b
93        PRFM    PLDL1KEEP, [x5, 320]
94        MOV     v28.16b, v20.16b
95        MOV     v29.16b, v21.16b
96        MOV     v30.16b, v20.16b
97        MOV     v31.16b, v21.16b
98
991:
100        # Load next 6 A pointers from the pointer array a (x4)
101        LDP     x14, x15, [x4], 16
102        LDP     x20, x21, [x4], 16
103        LDP     x22, x23, [x4], 16
104
        # A pointer equal to zero (x12) is used as is; any other pointer gets a_offset added
105        CMP     x14, x12                // if a0 == zero
106        ADD     x14, x14, x11           // a0 += a_offset
107        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
108        CMP     x15, x12                // if a1 == zero
109        ADD     x15, x15, x11           // a1 += a_offset
110        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
111        CMP     x20, x12                // if a2 == zero
112        ADD     x20, x20, x11           // a2 += a_offset
113        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
114        CMP     x21, x12                // if a3 == zero
115        ADD     x21, x21, x11           // a3 += a_offset
116        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
117        CMP     x22, x12                // if a4 == zero
118        ADD     x22, x22, x11           // a4 += a_offset
119        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
120        CMP     x23, x12                // if a5 == zero
121        ADD     x23, x23, x11           // a5 += a_offset
122        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset
123
124        # Are there at least 4 halffloats (8 bytes)?
125        SUBS    x0, x2, 8               // k = kc - 8
126        B.LO    5f                      // kc < 8: only a remainder to process
127
128        # Prologue - load 4 A and 2 B
        # (A4 and A5 are loaded at the top of the main loop / epilogue body)
129
130        LDR     d0, [x14], 8              // A0
131        LDR     q16, [x5], 16             // B0
132        LDR     q17, [x5], 16             // B1
133        LDR     d1, [x15], 8              // A1
134        LDR     d2, [x20], 8              // A2
135        LDR     d3, [x21], 8              // A3
136
137        # Are there at least 4 more halffloats for the main loop?
138        SUBS    x0, x0, 8
139        B.LO    3f                      // no: the loaded group is the last full one, run the epilogue
140
141       .p2align 3
142        # Main loop - 4 halffloats of A (8 bytes)
143        # 48 FMA + 6 LDR d (64-bit) A + 8 LDR B
        # B registers alternate: v16/v17 feed lanes h[0] and h[2], v18/v19 feed lanes h[1] and h[3]
1442:
145        FMLA    v20.8h, v16.8h,  v0.h[0]
146        FMLA    v21.8h, v17.8h,  v0.h[0]
147        LDR     d4, [x22], 8              // A4
148        FMLA    v22.8h, v16.8h,  v1.h[0]
149        FMLA    v23.8h, v17.8h,  v1.h[0]
150        LDR     d5, [x23], 8              // A5
151        FMLA    v24.8h, v16.8h,  v2.h[0]
152        FMLA    v25.8h, v17.8h,  v2.h[0]
153        LDR     q18, [x5], 16             // B2
154        FMLA    v26.8h, v16.8h,  v3.h[0]
155        FMLA    v27.8h, v17.8h,  v3.h[0]
156        LDR     q19, [x5], 16             // B3
157        FMLA    v28.8h, v16.8h,  v4.h[0]
158        FMLA    v29.8h, v17.8h,  v4.h[0]
159        FMLA    v30.8h, v16.8h,  v5.h[0]
160        FMLA    v31.8h, v17.8h,  v5.h[0]
161        SUBS    x0, x0, 8                 // k -= 8; flags are consumed by B.HS at the loop end
162
163        FMLA    v20.8h, v18.8h,  v0.h[1]
164        FMLA    v21.8h, v19.8h,  v0.h[1]
165        LDR     q16, [x5], 16             // B4
166        FMLA    v22.8h, v18.8h,  v1.h[1]
167        FMLA    v23.8h, v19.8h,  v1.h[1]
168        LDR     q17, [x5], 16             // B5
169        FMLA    v24.8h, v18.8h,  v2.h[1]
170        FMLA    v25.8h, v19.8h,  v2.h[1]
171        FMLA    v26.8h, v18.8h,  v3.h[1]
172        FMLA    v27.8h, v19.8h,  v3.h[1]
173        FMLA    v28.8h, v18.8h,  v4.h[1]
174        FMLA    v29.8h, v19.8h,  v4.h[1]
175        FMLA    v30.8h, v18.8h,  v5.h[1]
176        FMLA    v31.8h, v19.8h,  v5.h[1]
177
178        FMLA    v20.8h, v16.8h,  v0.h[2]
179        FMLA    v21.8h, v17.8h,  v0.h[2]
180        LDR     q18, [x5], 16             // B6
181        FMLA    v22.8h, v16.8h,  v1.h[2]
182        FMLA    v23.8h, v17.8h,  v1.h[2]
183        LDR     q19, [x5], 16             // B7
184        FMLA    v24.8h, v16.8h,  v2.h[2]
185        FMLA    v25.8h, v17.8h,  v2.h[2]
186        FMLA    v26.8h, v16.8h,  v3.h[2]
187        FMLA    v27.8h, v17.8h,  v3.h[2]
188        FMLA    v28.8h, v16.8h,  v4.h[2]
189        FMLA    v29.8h, v17.8h,  v4.h[2]
190        FMLA    v30.8h, v16.8h,  v5.h[2]
191        FMLA    v31.8h, v17.8h,  v5.h[2]
192
        # Last lane group: B0/B1 and A0-A3 for the next iteration are reloaded here,
        # each A register only after its final h[3] use
193        LDR     q16, [x5], 16             // B0
194        FMLA    v20.8h, v18.8h,  v0.h[3]
195        FMLA    v21.8h, v19.8h,  v0.h[3]
196        LDR     q17, [x5], 16             // B1
197        FMLA    v22.8h, v18.8h,  v1.h[3]
198        FMLA    v23.8h, v19.8h,  v1.h[3]
199        LDR     d0, [x14], 8              // A0
200        FMLA    v24.8h, v18.8h,  v2.h[3]
201        FMLA    v25.8h, v19.8h,  v2.h[3]
202        LDR     d1, [x15], 8              // A1
203        FMLA    v26.8h, v18.8h,  v3.h[3]
204        FMLA    v27.8h, v19.8h,  v3.h[3]
205        LDR     d2, [x20], 8              // A2
206        FMLA    v28.8h, v18.8h,  v4.h[3]
207        FMLA    v29.8h, v19.8h,  v4.h[3]
208        LDR     d3, [x21], 8              // A3
209        FMLA    v30.8h, v18.8h,  v5.h[3]
210        FMLA    v31.8h, v19.8h,  v5.h[3]
211        B.HS    2b                        // loop while the SUBS above did not borrow
212
213        # Epilogue - same as main loop but no loads for next loop
2143:
215        FMLA    v20.8h, v16.8h,  v0.h[0]
216        FMLA    v21.8h, v17.8h,  v0.h[0]
217        LDR     d4, [x22], 8              // A4
218        FMLA    v22.8h, v16.8h,  v1.h[0]
219        FMLA    v23.8h, v17.8h,  v1.h[0]
220        LDR     d5, [x23], 8              // A5
221        FMLA    v24.8h, v16.8h,  v2.h[0]
222        FMLA    v25.8h, v17.8h,  v2.h[0]
223        LDR     q18, [x5], 16             // B2
224        FMLA    v26.8h, v16.8h,  v3.h[0]
225        FMLA    v27.8h, v17.8h,  v3.h[0]
226        LDR     q19, [x5], 16             // B3
227        FMLA    v28.8h, v16.8h,  v4.h[0]
228        FMLA    v29.8h, v17.8h,  v4.h[0]
229        FMLA    v30.8h, v16.8h,  v5.h[0]
230        FMLA    v31.8h, v17.8h,  v5.h[0]
231        ADDS    x0, x0, 8                 // k += 8 -> leftover bytes; Z flag is consumed by B.NE below
232
233        FMLA    v20.8h, v18.8h,  v0.h[1]
234        FMLA    v21.8h, v19.8h,  v0.h[1]
235        LDR     q16, [x5], 16             // B4
236        FMLA    v22.8h, v18.8h,  v1.h[1]
237        FMLA    v23.8h, v19.8h,  v1.h[1]
238        LDR     q17, [x5], 16             // B5
239        FMLA    v24.8h, v18.8h,  v2.h[1]
240        FMLA    v25.8h, v19.8h,  v2.h[1]
241        FMLA    v26.8h, v18.8h,  v3.h[1]
242        FMLA    v27.8h, v19.8h,  v3.h[1]
243        FMLA    v28.8h, v18.8h,  v4.h[1]
244        FMLA    v29.8h, v19.8h,  v4.h[1]
245        FMLA    v30.8h, v18.8h,  v5.h[1]
246        FMLA    v31.8h, v19.8h,  v5.h[1]
247
248        FMLA    v20.8h, v16.8h,  v0.h[2]
249        FMLA    v21.8h, v17.8h,  v0.h[2]
250        LDR     q18, [x5], 16             // B6
251        FMLA    v22.8h, v16.8h,  v1.h[2]
252        FMLA    v23.8h, v17.8h,  v1.h[2]
253        LDR     q19, [x5], 16             // B7
254        FMLA    v24.8h, v16.8h,  v2.h[2]
255        FMLA    v25.8h, v17.8h,  v2.h[2]
256        FMLA    v26.8h, v16.8h,  v3.h[2]
257        FMLA    v27.8h, v17.8h,  v3.h[2]
258        FMLA    v28.8h, v16.8h,  v4.h[2]
259        FMLA    v29.8h, v17.8h,  v4.h[2]
260        FMLA    v30.8h, v16.8h,  v5.h[2]
261        FMLA    v31.8h, v17.8h,  v5.h[2]
262
263        FMLA    v20.8h, v18.8h,  v0.h[3]
264        FMLA    v21.8h, v19.8h,  v0.h[3]
265        FMLA    v22.8h, v18.8h,  v1.h[3]
266        FMLA    v23.8h, v19.8h,  v1.h[3]
267        FMLA    v24.8h, v18.8h,  v2.h[3]
268        FMLA    v25.8h, v19.8h,  v2.h[3]
269        FMLA    v26.8h, v18.8h,  v3.h[3]
270        FMLA    v27.8h, v19.8h,  v3.h[3]
271        FMLA    v28.8h, v18.8h,  v4.h[3]
272        FMLA    v29.8h, v19.8h,  v4.h[3]
273        FMLA    v30.8h, v18.8h,  v5.h[3]
274        FMLA    v31.8h, v19.8h,  v5.h[3]
275
276        # Is there a remainder? - 1-3 halffloats of A (2-6 bytes)
        # Uses the flags of the ADDS above; no instruction in between writes flags
277        B.NE    5f
278
2794:
280        # ks loop
281        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
282        B.HI    1b
283
284        # Clamp: v4 = params h[0] broadcast (lower bound, applied with FMAX),
        #        v5 = params h[1] broadcast (upper bound, applied with FMIN)
285        DUP     v4.8h, v6.h[0]
286        DUP     v5.8h, v6.h[1]
287        FMAX    v20.8h, v20.8h, v4.8h
288        FMAX    v21.8h, v21.8h, v4.8h
289        FMAX    v22.8h, v22.8h, v4.8h
290        FMAX    v23.8h, v23.8h, v4.8h
291        FMAX    v24.8h, v24.8h, v4.8h
292        FMAX    v25.8h, v25.8h, v4.8h
293        FMAX    v26.8h, v26.8h, v4.8h
294        FMAX    v27.8h, v27.8h, v4.8h
295        FMAX    v28.8h, v28.8h, v4.8h
296        FMAX    v29.8h, v29.8h, v4.8h
297        FMAX    v30.8h, v30.8h, v4.8h
298        FMAX    v31.8h, v31.8h, v4.8h
299        SUBS    x1, x1, 16              // nc -= 16; flags are consumed by B.LO 7f and B.HI 0b below
300        FMIN    v20.8h, v20.8h, v5.8h
301        FMIN    v21.8h, v21.8h, v5.8h
302        FMIN    v22.8h, v22.8h, v5.8h
303        FMIN    v23.8h, v23.8h, v5.8h
304        FMIN    v24.8h, v24.8h, v5.8h
305        FMIN    v25.8h, v25.8h, v5.8h
306        FMIN    v26.8h, v26.8h, v5.8h
307        FMIN    v27.8h, v27.8h, v5.8h
308        FMIN    v28.8h, v28.8h, v5.8h
309        FMIN    v29.8h, v29.8h, v5.8h
310        FMIN    v30.8h, v30.8h, v5.8h
311        FMIN    v31.8h, v31.8h, v5.8h
312
313        # Store full 6 x 16 (fewer than 16 columns left: take the partial store at 7)
314        B.LO    7f
315
        # Rows are stored from row 5 down to row 0; each C pointer advances by cn_stride
316        ST1     {v30.16b, v31.16b},  [x7], x8
317        ST1     {v28.16b, v29.16b}, [x13], x8
318        ST1     {v26.16b, v27.16b}, [x10], x8
319        ST1     {v24.16b, v25.16b}, [x17], x8
320        ST1     {v22.16b, v23.16b}, [x16], x8
321        ST1     {v20.16b, v21.16b},  [x6], x8
322
323        SUB     x4, x4, x3              // a -= ks
324
325        # nc loop
326        B.HI    0b
327
328        # Restore x20-x23 from stack
329        LDP     x22, x23, [sp, 16]
330        LDP     x20, x21, [sp], 32
331        RET
332
333        # Remainder - 1-3 halffloats of A (2-6 bytes)
        # Only bits 2 and 1 of x0 are tested. On both entry paths they match the
        # leftover byte count: x0 = kc - 8 from the early branch, 0-7 after the epilogue.
3345:
335        TBZ     x0, 2, 6f               // bit 2 clear: no 2-halffloat (4-byte) step
336        LDR     s0, [x14], 4
337        LDR     q16, [x5], 16
338        LDR     q17, [x5], 16
339        LDR     s1, [x15], 4
340        LDR     s2, [x20], 4
341        LDR     s3, [x21], 4
342        LDR     s4, [x22], 4
343        LDR     s5, [x23], 4
344        LDR     q18, [x5], 16
345        LDR     q19, [x5], 16
346        FMLA    v20.8h, v16.8h,  v0.h[0]
347        FMLA    v22.8h, v16.8h,  v1.h[0]
348        FMLA    v24.8h, v16.8h,  v2.h[0]
349        FMLA    v26.8h, v16.8h,  v3.h[0]
350        FMLA    v28.8h, v16.8h,  v4.h[0]
351        FMLA    v30.8h, v16.8h,  v5.h[0]
352        FMLA    v21.8h, v17.8h,  v0.h[0]
353        FMLA    v23.8h, v17.8h,  v1.h[0]
354        FMLA    v25.8h, v17.8h,  v2.h[0]
355        FMLA    v27.8h, v17.8h,  v3.h[0]
356        FMLA    v29.8h, v17.8h,  v4.h[0]
357        FMLA    v31.8h, v17.8h,  v5.h[0]
358
359        FMLA    v20.8h, v18.8h,  v0.h[1]
360        FMLA    v22.8h, v18.8h,  v1.h[1]
361        FMLA    v24.8h, v18.8h,  v2.h[1]
362        FMLA    v26.8h, v18.8h,  v3.h[1]
363        FMLA    v28.8h, v18.8h,  v4.h[1]
364        FMLA    v30.8h, v18.8h,  v5.h[1]
365        FMLA    v21.8h, v19.8h,  v0.h[1]
366        FMLA    v23.8h, v19.8h,  v1.h[1]
367        FMLA    v25.8h, v19.8h,  v2.h[1]
368        FMLA    v27.8h, v19.8h,  v3.h[1]
369        FMLA    v29.8h, v19.8h,  v4.h[1]
370        FMLA    v31.8h, v19.8h,  v5.h[1]
371        TBZ     x0, 1, 4b               // bit 1 clear: no single halffloat left, back to the ks loop
372
        # Remainder - 1 halffloat of A (2 bytes)
3736:
374        LDR     h0, [x14], 2
375        LDR     q16, [x5], 16
376        LDR     q17, [x5], 16
377        LDR     h1, [x15], 2
378        LDR     h2, [x20], 2
379        LDR     h3, [x21], 2
380        LDR     h4, [x22], 2
381        LDR     h5, [x23], 2
382        FMLA    v20.8h, v16.8h,  v0.h[0]
383        FMLA    v22.8h, v16.8h,  v1.h[0]
384        FMLA    v24.8h, v16.8h,  v2.h[0]
385        FMLA    v26.8h, v16.8h,  v3.h[0]
386        FMLA    v28.8h, v16.8h,  v4.h[0]
387        FMLA    v30.8h, v16.8h,  v5.h[0]
388        FMLA    v21.8h, v17.8h,  v0.h[0]
389        FMLA    v23.8h, v17.8h,  v1.h[0]
390        FMLA    v25.8h, v17.8h,  v2.h[0]
391        FMLA    v27.8h, v17.8h,  v3.h[0]
392        FMLA    v29.8h, v17.8h,  v4.h[0]
393        FMLA    v31.8h, v17.8h,  v5.h[0]
394        B       4b
395
396        # Store odd width: bits 3..0 of x1 select stores of 8, 4, 2 and 1 columns.
        # After each partial store the not yet stored columns are moved down to lane 0.
3977:
398        TBZ     x1, 3, 8f
399        STR     q30,  [x7], 16
400        MOV     v30.16b, v31.16b        // upper 8 columns become the next candidates
401        STR     q28, [x13], 16
402        MOV     v28.16b, v29.16b
403        STR     q26, [x10], 16
404        MOV     v26.16b, v27.16b
405        STR     q24, [x17], 16
406        MOV     v24.16b, v25.16b
407        STR     q22, [x16], 16
408        MOV     v22.16b, v23.16b
409        STR     q20,  [x6], 16
410        MOV     v20.16b, v21.16b
4118:
412        TBZ     x1, 2, 9f
413        STR     d30,  [x7], 8
414        STR     d28, [x13], 8
415        DUP     d30, v30.d[1]           // move the high 64 bits down
416        DUP     d28, v28.d[1]
417        STR     d26, [x10], 8
418        STR     d24, [x17], 8
419        DUP     d26, v26.d[1]
420        DUP     d24, v24.d[1]
421        STR     d22, [x16], 8
422        STR     d20,  [x6], 8
423        DUP     d22, v22.d[1]
424        DUP     d20, v20.d[1]
425
4269:
427        TBZ     x1, 1, 10f
428        STR     s30,  [x7], 4
429        STR     s28, [x13], 4
430        DUP     s30, v30.s[1]           // move the next 32 bits down
431        DUP     s28, v28.s[1]
432        STR     s26, [x10], 4
433        STR     s24, [x17], 4
434        DUP     s26, v26.s[1]
435        DUP     s24, v24.s[1]
436        STR     s22, [x16], 4
437        STR     s20,  [x6], 4
438        DUP     s22, v22.s[1]
439        DUP     s20, v20.s[1]
440
44110:
442        TBZ     x1, 0, 11f
443        STR     h30,  [x7]
444        STR     h28, [x13]
445        STR     h26, [x10]
446        STR     h24, [x17]
447        STR     h22, [x16]
448        STR     h20,  [x6]
44911:
450        # Restore x20-x23 from stack
451        LDP     x22, x23, [sp, 16]
452        LDP     x20, x21, [sp], 32
453        RET
454
455END_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75
456
457#ifdef __ELF__
458.section ".note.GNU-stack","",%progbits
459#endif
460