xref: /aosp_15_r20/external/XNNPACK/src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a75.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
9#     size_t mr,                x0
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0
12#     const void*restrict a,    x3
13#     size_t a_stride,          x4
14#     const void*restrict w,    x5
15#     void*restrict c,          x6
16#     size_t cm_stride,         x7
17#     size_t cn_stride,         [sp] -> x8
18
19$if INC:
20  #     const void*restrict acc,  [sp + 8] -> x15
21  #     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
22$else:
23  #     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3 v0
29# A1  x9 v1
30# A2 x10 v2
31# A3 x11 v3
32# A4 x12 v4
33# A5  x4 v5
34
35# B   x5 v16 v17 v18 v19
36
37# C0  x6  v20 v21
38# C1 x16  v22 v23
39# C2 x17  v24 v25
40# C3 x14  v26 v27
41# C4 x13  v28 v29
42# C5  x7  v30 v31
43
44# Clamp v6 (params: min in h[0], max in h[1]), broadcast into reused (v4), (v5)
45# unused     v7
46# unused A   v8 v9 v10 v11
47# unused B   v12 v13 v14 v15
48
49BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75
50
51        $if INC:
52          # Load acc (x15) and params pointer (x8) from the stack args at [sp + 8] and [sp + 16]
53          LDP     x15, x8, [sp, 8]
54        $else:
55          # Load params pointer (x8) from the stack arg at [sp + 8]
56          LDR     x8, [sp, 8]
57
58        # Clamp A and C pointers: a row whose index is >= mr reuses the previous row's pointers
59        CMP     x0, 2                   // if mr < 2
60        ADD     x9, x3, x4              // a1 = a0 + a_stride
61        ADD     x16, x6, x7             // c1 = c0 + cm_stride
62        CSEL    x9, x3, x9, LO          //   a1 = a0
63        CSEL    x16, x6, x16, LO        //   c1 = c0
64
65        # Load params: 4 bytes into s6, i.e. v6.h[0] and v6.h[1] used by the clamp at label 3
66        LDR     s6, [x8]
67
68        ADD     x10, x9, x4             // a2 = a1 + a_stride
69        ADD     x17, x16, x7            // c2 = c1 + cm_stride
70                                        // if mr <= 2 (flags still from CMP x0, 2)
71        CSEL    x10, x9, x10, LS        //   a2 = a1
72        CSEL    x17, x16, x17, LS       //   c2 = c1
73
74        CMP     x0, 4                   // if mr < 4
75        ADD     x11, x10, x4            // a3 = a2 + a_stride
76        ADD     x14, x17, x7            // c3 = c2 + cm_stride
77        CSEL    x11, x10, x11, LO       //   a3 = a2
78        CSEL    x14, x17, x14, LO       //   c3 = c2
79
80        ADD     x12, x11, x4            // a4 = a3 + a_stride
81        ADD     x13, x14, x7            // c4 = c3 + cm_stride
82                                        // if mr <= 4 (flags still from CMP x0, 4)
83        CSEL    x12, x11, x12, LS       //   a4 = a3
84        CSEL    x13, x14, x13, LS       //   c4 = c3
85
86        CMP     x0, 6                   // if mr < 6
87        ADD     x4, x12, x4             // a5 = a4 + a_stride (x4 stops being a_stride here)
88        ADD     x7, x13, x7             // c5 = c4 + cm_stride (x7 stops being cm_stride here)
89        CSEL    x4, x12, x4, LO         //   a5 = a4
90        CSEL    x7, x13, x7, LO         //   c5 = c4
91
92        LDR     x8, [sp]                // load cn_stride (x8 is free: params were already read into s6)
93
940:
95        $if INC:
96          # Load initial accumulators from acc: 32 bytes per row for each of the 6 rows
97          LDP     q20, q21, [x15], 32
98          LDP     q22, q23, [x15], 32
99          LDP     q24, q25, [x15], 32
100          LDP     q26, q27, [x15], 32
101          LDP     q28, q29, [x15], 32
102          LDP     q30, q31, [x15], 32
103        $else:
104          # Load initial bias (32 bytes) from w into row 0, then copy it to the other 5 rows
105          LDP     q20, q21, [x5], 32
106          MOV     v22.16b, v20.16b
107          MOV     v23.16b, v21.16b
108          MOV     v24.16b, v20.16b
109          MOV     v25.16b, v21.16b
110          MOV     v26.16b, v20.16b
111          MOV     v27.16b, v21.16b
112          MOV     v28.16b, v20.16b
113          MOV     v29.16b, v21.16b
114          MOV     v30.16b, v20.16b
115          MOV     v31.16b, v21.16b
116
117        # Are there at least 4 halffloats (8 bytes)?
118        SUBS    x0, x2, 8               // k = kc - 8
119        B.LO    4f                      // kc < 8 bytes: go straight to the remainder code
120
121        # Prologue - load A0-A3 (8 bytes each) and B0-B1 (16 bytes each); A4/A5 are loaded inside the blocks below
122
123        LDR     d0,  [x3], 8              // A0
124        LDR     q16, [x5], 16             // B0
125        LDR     q17, [x5], 16             // B1
126        LDR     d1,  [x9], 8              // A1
127        LDR     d2, [x10], 8              // A2
128        LDR     d3, [x11], 8              // A3
129
130        # Are there at least 4 more halffloats for the main loop?
131        SUBS    x0, x0, 8               // k -= 8
132        B.LO    2f                      // kc < 16 bytes: skip the main loop, run only the epilogue
133
134       .p2align 3
135        # Main loop - 4 halffloats of A (8 bytes) per row per iteration
136        # 48 FMA + 6 LDR d (8-byte) A + 8 LDR q (16-byte) B
1371:
138        FMLA    v20.8h, v16.8h,  v0.h[0]
139        FMLA    v21.8h, v17.8h,  v0.h[0]
140        LDR     d4, [x12], 8              // A4
141        FMLA    v22.8h, v16.8h,  v1.h[0]
142        FMLA    v23.8h, v17.8h,  v1.h[0]
143        LDR     d5,  [x4], 8              // A5
144        FMLA    v24.8h, v16.8h,  v2.h[0]
145        FMLA    v25.8h, v17.8h,  v2.h[0]
146        LDR     q18, [x5], 16             // B2
147        FMLA    v26.8h, v16.8h,  v3.h[0]
148        FMLA    v27.8h, v17.8h,  v3.h[0]
149        LDR     q19, [x5], 16             // B3
150        FMLA    v28.8h, v16.8h,  v4.h[0]
151        FMLA    v29.8h, v17.8h,  v4.h[0]
152        FMLA    v30.8h, v16.8h,  v5.h[0]
153        FMLA    v31.8h, v17.8h,  v5.h[0]
154        SUBS    x0, x0, 8               // k -= 8; these flags are consumed by B.HS at the end of the loop
155
156        FMLA    v20.8h, v18.8h,  v0.h[1]
157        FMLA    v21.8h, v19.8h,  v0.h[1]
158        LDR     q16, [x5], 16             // B4
159        FMLA    v22.8h, v18.8h,  v1.h[1]
160        FMLA    v23.8h, v19.8h,  v1.h[1]
161        LDR     q17, [x5], 16             // B5
162        FMLA    v24.8h, v18.8h,  v2.h[1]
163        FMLA    v25.8h, v19.8h,  v2.h[1]
164        FMLA    v26.8h, v18.8h,  v3.h[1]
165        FMLA    v27.8h, v19.8h,  v3.h[1]
166        FMLA    v28.8h, v18.8h,  v4.h[1]
167        FMLA    v29.8h, v19.8h,  v4.h[1]
168        FMLA    v30.8h, v18.8h,  v5.h[1]
169        FMLA    v31.8h, v19.8h,  v5.h[1]
170
171        FMLA    v20.8h, v16.8h,  v0.h[2]
172        FMLA    v21.8h, v17.8h,  v0.h[2]
173        LDR     q18, [x5], 16             // B6
174        FMLA    v22.8h, v16.8h,  v1.h[2]
175        FMLA    v23.8h, v17.8h,  v1.h[2]
176        LDR     q19, [x5], 16             // B7
177        FMLA    v24.8h, v16.8h,  v2.h[2]
178        FMLA    v25.8h, v17.8h,  v2.h[2]
179        FMLA    v26.8h, v16.8h,  v3.h[2]
180        FMLA    v27.8h, v17.8h,  v3.h[2]
181        FMLA    v28.8h, v16.8h,  v4.h[2]
182        FMLA    v29.8h, v17.8h,  v4.h[2]
183        FMLA    v30.8h, v16.8h,  v5.h[2]
184        FMLA    v31.8h, v17.8h,  v5.h[2]
185
186        LDR     q16, [x5], 16             // B0 for the next block
187        FMLA    v20.8h, v18.8h,  v0.h[3]
188        FMLA    v21.8h, v19.8h,  v0.h[3]
189        LDR     q17, [x5], 16             // B1 for the next block
190        FMLA    v22.8h, v18.8h,  v1.h[3]
191        FMLA    v23.8h, v19.8h,  v1.h[3]
192        LDR     d0,  [x3], 8              // A0 for the next block (after the last use of v0)
193        FMLA    v24.8h, v18.8h,  v2.h[3]
194        FMLA    v25.8h, v19.8h,  v2.h[3]
195        LDR     d1,  [x9], 8              // A1 for the next block (after the last use of v1)
196        FMLA    v26.8h, v18.8h,  v3.h[3]
197        FMLA    v27.8h, v19.8h,  v3.h[3]
198        LDR     d2, [x10], 8              // A2 for the next block (after the last use of v2)
199        FMLA    v28.8h, v18.8h,  v4.h[3]
200        FMLA    v29.8h, v19.8h,  v4.h[3]
201        LDR     d3, [x11], 8              // A3 for the next block (after the last use of v3)
202        FMLA    v30.8h, v18.8h,  v5.h[3]
203        FMLA    v31.8h, v19.8h,  v5.h[3]
204        B.HS    1b                      // loop while the SUBS above did not borrow
205
206        # Epilogue - same FMA sequence as the main loop, but without the A0-A3 / B0-B1 loads for a next block
2072:
208        FMLA    v20.8h, v16.8h,  v0.h[0]
209        FMLA    v21.8h, v17.8h,  v0.h[0]
210        LDR     d4, [x12], 8              // A4
211        FMLA    v22.8h, v16.8h,  v1.h[0]
212        FMLA    v23.8h, v17.8h,  v1.h[0]
213        LDR     d5,  [x4], 8              // A5
214        FMLA    v24.8h, v16.8h,  v2.h[0]
215        FMLA    v25.8h, v17.8h,  v2.h[0]
216        LDR     q18, [x5], 16             // B2
217        FMLA    v26.8h, v16.8h,  v3.h[0]
218        FMLA    v27.8h, v17.8h,  v3.h[0]
219        LDR     q19, [x5], 16             // B3
220        FMLA    v28.8h, v16.8h,  v4.h[0]
221        FMLA    v29.8h, v17.8h,  v4.h[0]
222        FMLA    v30.8h, v16.8h,  v5.h[0]
223        FMLA    v31.8h, v17.8h,  v5.h[0]
224        ADDS    x0, x0, 8               // k += 8: bytes of A left after this block; Z flag is consumed by B.NE below
225
226        FMLA    v20.8h, v18.8h,  v0.h[1]
227        FMLA    v21.8h, v19.8h,  v0.h[1]
228        LDR     q16, [x5], 16             // B4
229        FMLA    v22.8h, v18.8h,  v1.h[1]
230        FMLA    v23.8h, v19.8h,  v1.h[1]
231        LDR     q17, [x5], 16             // B5
232        FMLA    v24.8h, v18.8h,  v2.h[1]
233        FMLA    v25.8h, v19.8h,  v2.h[1]
234        FMLA    v26.8h, v18.8h,  v3.h[1]
235        FMLA    v27.8h, v19.8h,  v3.h[1]
236        FMLA    v28.8h, v18.8h,  v4.h[1]
237        FMLA    v29.8h, v19.8h,  v4.h[1]
238        FMLA    v30.8h, v18.8h,  v5.h[1]
239        FMLA    v31.8h, v19.8h,  v5.h[1]
240
241        FMLA    v20.8h, v16.8h,  v0.h[2]
242        FMLA    v21.8h, v17.8h,  v0.h[2]
243        LDR     q18, [x5], 16             // B6
244        FMLA    v22.8h, v16.8h,  v1.h[2]
245        FMLA    v23.8h, v17.8h,  v1.h[2]
246        LDR     q19, [x5], 16             // B7
247        FMLA    v24.8h, v16.8h,  v2.h[2]
248        FMLA    v25.8h, v17.8h,  v2.h[2]
249        FMLA    v26.8h, v16.8h,  v3.h[2]
250        FMLA    v27.8h, v17.8h,  v3.h[2]
251        FMLA    v28.8h, v16.8h,  v4.h[2]
252        FMLA    v29.8h, v17.8h,  v4.h[2]
253        FMLA    v30.8h, v16.8h,  v5.h[2]
254        FMLA    v31.8h, v17.8h,  v5.h[2]
255
256        FMLA    v20.8h, v18.8h,  v0.h[3]
257        FMLA    v21.8h, v19.8h,  v0.h[3]
258        FMLA    v22.8h, v18.8h,  v1.h[3]
259        FMLA    v23.8h, v19.8h,  v1.h[3]
260        FMLA    v24.8h, v18.8h,  v2.h[3]
261        FMLA    v25.8h, v19.8h,  v2.h[3]
262        FMLA    v26.8h, v18.8h,  v3.h[3]
263        FMLA    v27.8h, v19.8h,  v3.h[3]
264        FMLA    v28.8h, v18.8h,  v4.h[3]
265        FMLA    v29.8h, v19.8h,  v4.h[3]
266        FMLA    v30.8h, v18.8h,  v5.h[3]
267        FMLA    v31.8h, v19.8h,  v5.h[3]
268
269        # Is there a remainder? - 1-3 halffloats of A (2-6 bytes)
270        B.NE    4f                      // x0 != 0 after the ADDS above
271
2723:
273        # Clamp: broadcast v6.h[0] and v6.h[1], then FMAX against the first (min) and FMIN against the second (max)
274        DUP     v4.8h, v6.h[0]
275        DUP     v5.8h, v6.h[1]
276        FMAX    v20.8h, v20.8h, v4.8h
277        FMAX    v21.8h, v21.8h, v4.8h
278        FMAX    v22.8h, v22.8h, v4.8h
279        FMAX    v23.8h, v23.8h, v4.8h
280        FMAX    v24.8h, v24.8h, v4.8h
281        FMAX    v25.8h, v25.8h, v4.8h
282        FMAX    v26.8h, v26.8h, v4.8h
283        FMAX    v27.8h, v27.8h, v4.8h
284        FMAX    v28.8h, v28.8h, v4.8h
285        FMAX    v29.8h, v29.8h, v4.8h
286        FMAX    v30.8h, v30.8h, v4.8h
287        FMAX    v31.8h, v31.8h, v4.8h
288        SUBS    x1, x1, 16              // nc -= 16; these flags drive both B.LO 6f and B.HI 0b below
289        FMIN    v20.8h, v20.8h, v5.8h
290        FMIN    v21.8h, v21.8h, v5.8h
291        FMIN    v22.8h, v22.8h, v5.8h
292        FMIN    v23.8h, v23.8h, v5.8h
293        FMIN    v24.8h, v24.8h, v5.8h
294        FMIN    v25.8h, v25.8h, v5.8h
295        FMIN    v26.8h, v26.8h, v5.8h
296        FMIN    v27.8h, v27.8h, v5.8h
297        FMIN    v28.8h, v28.8h, v5.8h
298        FMIN    v29.8h, v29.8h, v5.8h
299        FMIN    v30.8h, v30.8h, v5.8h
300        FMIN    v31.8h, v31.8h, v5.8h
301
302        # Store full 6 x 16, unless fewer than 16 columns remain
303        B.LO    6f                      // nc was < 16: take the odd-width store path
304
305        $if INC:
306          ST1     {v30.16b, v31.16b},  [x7], x8
307          SUB     x3,  x3, x2             // a0 -= kc
308          ST1     {v28.16b, v29.16b}, [x13], x8
309          SUB     x9,  x9, x2             // a1 -= kc
310          ST1     {v26.16b, v27.16b}, [x14], x8
311          SUB     x10, x10, x2            // a2 -= kc
312          ST1     {v24.16b, v25.16b}, [x17], x8
313          SUB     x11, x11, x2            // a3 -= kc
314          ST1     {v22.16b, v23.16b}, [x16], x8
315          SUB     x12, x12, x2            // a4 -= kc
316          ST1     {v20.16b, v21.16b},  [x6], x8
317          SUB     x4,  x4, x2             // a5 -= kc
318        $else:
319          ST1     {v20.16b, v21.16b},  [x6], x8
320          SUB     x3,  x3, x2             // a0 -= kc
321          ST1     {v22.16b, v23.16b}, [x16], x8
322          SUB     x9,  x9, x2             // a1 -= kc
323          ST1     {v24.16b, v25.16b}, [x17], x8
324          SUB     x10, x10, x2            // a2 -= kc
325          ST1     {v26.16b, v27.16b}, [x14], x8
326          SUB     x11, x11, x2            // a3 -= kc
327          ST1     {v28.16b, v29.16b}, [x13], x8
328          SUB     x12, x12, x2            // a4 -= kc
329          ST1     {v30.16b, v31.16b},  [x7], x8
330          SUB     x4,  x4, x2             // a5 -= kc
331
332        B.HI    0b                      // nc still > 0 (flags from SUBS x1): next 16 columns
333        RET
334
335        # Remainder - 1-3 halffloats of A (2-6 bytes), selected by bits 2 and 1 of x0
3364:
337        TBZ     x0, 2, 5f               // bit 2 clear: no 2-halffloat (4-byte) step
338        LDR     s0,  [x3], 4
339        LDR     q16, [x5], 16
340        LDR     q17, [x5], 16
341        LDR     s1,  [x9], 4
342        LDR     s2, [x10], 4
343        LDR     s3, [x11], 4
344        LDR     s4, [x12], 4
345        LDR     s5,  [x4], 4
346        LDR     q18, [x5], 16
347        LDR     q19, [x5], 16
348        FMLA    v20.8h, v16.8h,  v0.h[0]
349        FMLA    v22.8h, v16.8h,  v1.h[0]
350        FMLA    v24.8h, v16.8h,  v2.h[0]
351        FMLA    v26.8h, v16.8h,  v3.h[0]
352        FMLA    v28.8h, v16.8h,  v4.h[0]
353        FMLA    v30.8h, v16.8h,  v5.h[0]
354        FMLA    v21.8h, v17.8h,  v0.h[0]
355        FMLA    v23.8h, v17.8h,  v1.h[0]
356        FMLA    v25.8h, v17.8h,  v2.h[0]
357        FMLA    v27.8h, v17.8h,  v3.h[0]
358        FMLA    v29.8h, v17.8h,  v4.h[0]
359        FMLA    v31.8h, v17.8h,  v5.h[0]
360
361        FMLA    v20.8h, v18.8h,  v0.h[1]
362        FMLA    v22.8h, v18.8h,  v1.h[1]
363        FMLA    v24.8h, v18.8h,  v2.h[1]
364        FMLA    v26.8h, v18.8h,  v3.h[1]
365        FMLA    v28.8h, v18.8h,  v4.h[1]
366        FMLA    v30.8h, v18.8h,  v5.h[1]
367        FMLA    v21.8h, v19.8h,  v0.h[1]
368        FMLA    v23.8h, v19.8h,  v1.h[1]
369        FMLA    v25.8h, v19.8h,  v2.h[1]
370        FMLA    v27.8h, v19.8h,  v3.h[1]
371        FMLA    v29.8h, v19.8h,  v4.h[1]
372        FMLA    v31.8h, v19.8h,  v5.h[1]
373        TBZ     x0, 1, 3b               // bit 1 clear: no single halffloat left, go clamp
374
3755:
376        LDR     h0,  [x3], 2
377        LDR     q16, [x5], 16
378        LDR     q17, [x5], 16
379        LDR     h1,  [x9], 2
380        LDR     h2, [x10], 2
381        LDR     h3, [x11], 2
382        LDR     h4, [x12], 2
383        LDR     h5,  [x4], 2
384        FMLA    v20.8h, v16.8h,  v0.h[0]
385        FMLA    v22.8h, v16.8h,  v1.h[0]
386        FMLA    v24.8h, v16.8h,  v2.h[0]
387        FMLA    v26.8h, v16.8h,  v3.h[0]
388        FMLA    v28.8h, v16.8h,  v4.h[0]
389        FMLA    v30.8h, v16.8h,  v5.h[0]
390        FMLA    v21.8h, v17.8h,  v0.h[0]
391        FMLA    v23.8h, v17.8h,  v1.h[0]
392        FMLA    v25.8h, v17.8h,  v2.h[0]
393        FMLA    v27.8h, v17.8h,  v3.h[0]
394        FMLA    v29.8h, v17.8h,  v4.h[0]
395        FMLA    v31.8h, v17.8h,  v5.h[0]
396        B       3b                      // back to clamp and store
397
398        # Store odd width: 8, 4, 2 and 1 columns per row, selected by bits 3..0 of x1
3996:
400        TBZ     x1, 3, 7f               // bit 3 clear: no 8-column store
401        $if INC:
402          STR     q30,  [x7], 16
403          MOV     v30.16b, v31.16b        // move the upper 8 columns down for the narrower stores below
404          STR     q28, [x13], 16
405          MOV     v28.16b, v29.16b
406          STR     q26, [x14], 16
407          MOV     v26.16b, v27.16b
408          STR     q24, [x17], 16
409          MOV     v24.16b, v25.16b
410          STR     q22, [x16], 16
411          MOV     v22.16b, v23.16b
412          STR     q20,  [x6], 16
413          MOV     v20.16b, v21.16b
414        $else:
415          STR     q20,  [x6], 16
416          MOV     v20.16b, v21.16b        // move the upper 8 columns down for the narrower stores below
417          STR     q22, [x16], 16
418          MOV     v22.16b, v23.16b
419          STR     q24, [x17], 16
420          MOV     v24.16b, v25.16b
421          STR     q26, [x14], 16
422          MOV     v26.16b, v27.16b
423          STR     q28, [x13], 16
424          MOV     v28.16b, v29.16b
425          STR     q30,  [x7], 16
426          MOV     v30.16b, v31.16b
427
4287:
429        TBZ     x1, 2, 8f               // bit 2 clear: no 4-column store
430        $if INC:
431          STR     d30,  [x7], 8
432          STR     d28, [x13], 8
433          DUP     d30, v30.d[1]           // move the next 4 columns into the low half
434          DUP     d28, v28.d[1]
435          STR     d26, [x14], 8
436          STR     d24, [x17], 8
437          DUP     d26, v26.d[1]
438          DUP     d24, v24.d[1]
439          STR     d22, [x16], 8
440          STR     d20,  [x6], 8
441          DUP     d22, v22.d[1]
442          DUP     d20, v20.d[1]
443        $else:
444          STR     d20,  [x6], 8
445          STR     d22, [x16], 8
446          DUP     d20, v20.d[1]           // move the next 4 columns into the low half
447          DUP     d22, v22.d[1]
448          STR     d24, [x17], 8
449          STR     d26, [x14], 8
450          DUP     d24, v24.d[1]
451          DUP     d26, v26.d[1]
452          STR     d28, [x13], 8
453          STR     d30,  [x7], 8
454          DUP     d28, v28.d[1]
455          DUP     d30, v30.d[1]
456
4578:
458        TBZ     x1, 1, 9f               // bit 1 clear: no 2-column store
459        $if INC:
460          STR     s30,  [x7], 4
461          STR     s28, [x13], 4
462          DUP     s30, v30.s[1]           // move the next 2 columns into the low word
463          DUP     s28, v28.s[1]
464          STR     s26, [x14], 4
465          STR     s24, [x17], 4
466          DUP     s26, v26.s[1]
467          DUP     s24, v24.s[1]
468          STR     s22, [x16], 4
469          STR     s20,  [x6], 4
470          DUP     s22, v22.s[1]
471          DUP     s20, v20.s[1]
472        $else:
473          STR     s20,  [x6], 4
474          STR     s22, [x16], 4
475          DUP     s20, v20.s[1]           // move the next 2 columns into the low word
476          DUP     s22, v22.s[1]
477          STR     s24, [x17], 4
478          STR     s26, [x14], 4
479          DUP     s24, v24.s[1]
480          DUP     s26, v26.s[1]
481          STR     s28, [x13], 4
482          STR     s30,  [x7], 4
483          DUP     s28, v28.s[1]
484          DUP     s30, v30.s[1]
485
4869:
487        TBZ     x1, 0, 10f              // bit 0 clear: no final single-column store
488        $if INC:
489          STR     h30,  [x7]
490          STR     h28, [x13]
491          STR     h26, [x14]
492          STR     h24, [x17]
493          STR     h22, [x16]
494          STR     h20,  [x6]
495        $else:
496          STR     h20,  [x6]
497          STR     h22, [x16]
498          STR     h24, [x17]
499          STR     h26, [x14]
500          STR     h28, [x13]
501          STR     h30,  [x7]
50210:
503        RET
504
505END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75
506
507#ifdef __ELF__
508.section ".note.GNU-stack","",%progbits
509#endif
510