// xref: /aosp_15_r20/external/XNNPACK/src/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-cortex-a75.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x8

#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Units, as used by the code below:
#   kc is a byte count for one row of A: it is compared against 8 bytes per
#   4 halffloats and subtracted from the A pointers to rewind them.
#   nc is a count of halffloat output columns: 16 are retired per full store.
#   a_stride, cm_stride and cn_stride are added directly to pointers.
#   x0 holds mr only while the row pointers are set up; afterwards it is the
#   remaining-k byte counter.
#   x8 holds the params pointer only until the clamp pair is loaded into v6;
#   afterwards it holds cn_stride.
#   NOTE(review): the code always processes at least one halffloat of k, so it
#   presumably expects kc to be a nonzero multiple of 2 bytes - confirm with
#   the caller.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1  x9 v1
# A2 x10 v2
# A3 x11 v3
# A4 x12 v4
# A5  x4 v5

# B   x5 v16 v17 v18 v19

# C0  x6  v20 v21
# C1 x16  v22 v23
# C2 x17  v24 v25
# C3 x14  v26 v27
# C4 x13  v28 v29
# C5  x7  v30 v31

# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

# Control flow (numeric local labels)
#   label 0:    per 16-column tile - load bias from w, then run the k loop
#   label 1:    main loop - 4 halffloats of k per row per iteration, with the
#               loads for the next iteration issued at the end of the body
#   label 2:    epilogue - final 4-halffloat step, no loads for a next step
#   label 3:    clamp, then full-width store (and loop to 0) or branch to 6
#   label 4:    k remainder of 2 halffloats
#   label 5:    k remainder of 1 halffloat
#   label 6-9:  partial-width store of 8, 4, 2, 1 columns
#   label 10:   return after a partial-width store

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

        # Load params pointer
        LDR     x8, [sp, 8]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row's A and C pointers, so
        # they recompute and re-store that row instead of touching new memory.
        # Each CMP feeds two CSEL pairs (LO, then LS); the ADD/CSEL/LDR in
        # between do not modify the flags.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load params
        # One 32-bit load fetches two halffloats: v6.h[0] is applied with FMAX
        # (lower bound) and v6.h[1] with FMIN (upper bound) at label 3.
        LDR     s6, [x8]

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        LDR     x8, [sp]                // load cn_stride (x8 no longer holds params)

0:
        # Load initial bias from w into accumulators
        # The same 16 bias values (v20, v21) seed all six rows.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        # Is there at least 4 halffloats (8 bytes)?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    4f

        # Prologue - load 4 A and 2 B
        # A4 and A5 are loaded inside the main loop / epilogue body instead.

        LDR     d0,  [x3], 8              // A0
        LDR     q16, [x5], 16             // B0
        LDR     q17, [x5], 16             // B1
        LDR     d1,  [x9], 8              // A1
        LDR     d2, [x10], 8              // A2
        LDR     d3, [x11], 8              // A3

        # Are there at least 4 more halffloats for the main loop?
        SUBS    x0, x0, 8
        B.LO    2f

        .p2align 3
        # Main loop - 4 halffloats of A (8 bytes)
        # 48 FMA + 6 ld64 A + 8 LDR B
        # The loop branch at the bottom consumes the flags of the SUBS below;
        # only FMLA and LDR, which leave the flags alone, sit in between.
1:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     d4, [x12], 8              // A4
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        LDR     d5,  [x4], 8              // A5
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        LDR     q18, [x5], 16             // B2
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        LDR     q19, [x5], 16             // B3
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        SUBS    x0, x0, 8

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        LDR     q16, [x5], 16             // B4
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        LDR     q17, [x5], 16             // B5
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]

        FMLA    v20.8h, v16.8h,  v0.h[2]
        FMLA    v21.8h, v17.8h,  v0.h[2]
        LDR     q18, [x5], 16             // B6
        FMLA    v22.8h, v16.8h,  v1.h[2]
        FMLA    v23.8h, v17.8h,  v1.h[2]
        LDR     q19, [x5], 16             // B7
        FMLA    v24.8h, v16.8h,  v2.h[2]
        FMLA    v25.8h, v17.8h,  v2.h[2]
        FMLA    v26.8h, v16.8h,  v3.h[2]
        FMLA    v27.8h, v17.8h,  v3.h[2]
        FMLA    v28.8h, v16.8h,  v4.h[2]
        FMLA    v29.8h, v17.8h,  v4.h[2]
        FMLA    v30.8h, v16.8h,  v5.h[2]
        FMLA    v31.8h, v17.8h,  v5.h[2]

        # Last k step of this iteration, interleaved with the B0/B1 and A0-A3
        # loads for the next one. Each A register is reloaded only after its
        # final h[3] use above it.
        LDR     q16, [x5], 16             // B0
        FMLA    v20.8h, v18.8h,  v0.h[3]
        FMLA    v21.8h, v19.8h,  v0.h[3]
        LDR     q17, [x5], 16             // B1
        FMLA    v22.8h, v18.8h,  v1.h[3]
        FMLA    v23.8h, v19.8h,  v1.h[3]
        LDR     d0,  [x3], 8              // A0
        FMLA    v24.8h, v18.8h,  v2.h[3]
        FMLA    v25.8h, v19.8h,  v2.h[3]
        LDR     d1,  [x9], 8              // A1
        FMLA    v26.8h, v18.8h,  v3.h[3]
        FMLA    v27.8h, v19.8h,  v3.h[3]
        LDR     d2, [x10], 8              // A2
        FMLA    v28.8h, v18.8h,  v4.h[3]
        FMLA    v29.8h, v19.8h,  v4.h[3]
        LDR     d3, [x11], 8              // A3
        FMLA    v30.8h, v18.8h,  v5.h[3]
        FMLA    v31.8h, v19.8h,  v5.h[3]
        B.HS    1b

        # Epilogue - same as main loop but no loads for next loop
        # The ADDS undoes the final over-subtraction, leaving x0 = leftover
        # bytes of k (0, 2, 4 or 6); its flags drive the B.NE at the end.
2:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     d4, [x12], 8              // A4
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        LDR     d5,  [x4], 8              // A5
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        LDR     q18, [x5], 16             // B2
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        LDR     q19, [x5], 16             // B3
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        ADDS    x0, x0, 8

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        LDR     q16, [x5], 16             // B4
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        LDR     q17, [x5], 16             // B5
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]

        FMLA    v20.8h, v16.8h,  v0.h[2]
        FMLA    v21.8h, v17.8h,  v0.h[2]
        LDR     q18, [x5], 16             // B6
        FMLA    v22.8h, v16.8h,  v1.h[2]
        FMLA    v23.8h, v17.8h,  v1.h[2]
        LDR     q19, [x5], 16             // B7
        FMLA    v24.8h, v16.8h,  v2.h[2]
        FMLA    v25.8h, v17.8h,  v2.h[2]
        FMLA    v26.8h, v16.8h,  v3.h[2]
        FMLA    v27.8h, v17.8h,  v3.h[2]
        FMLA    v28.8h, v16.8h,  v4.h[2]
        FMLA    v29.8h, v17.8h,  v4.h[2]
        FMLA    v30.8h, v16.8h,  v5.h[2]
        FMLA    v31.8h, v17.8h,  v5.h[2]

        FMLA    v20.8h, v18.8h,  v0.h[3]
        FMLA    v21.8h, v19.8h,  v0.h[3]
        FMLA    v22.8h, v18.8h,  v1.h[3]
        FMLA    v23.8h, v19.8h,  v1.h[3]
        FMLA    v24.8h, v18.8h,  v2.h[3]
        FMLA    v25.8h, v19.8h,  v2.h[3]
        FMLA    v26.8h, v18.8h,  v3.h[3]
        FMLA    v27.8h, v19.8h,  v3.h[3]
        FMLA    v28.8h, v18.8h,  v4.h[3]
        FMLA    v29.8h, v19.8h,  v4.h[3]
        FMLA    v30.8h, v18.8h,  v5.h[3]
        FMLA    v31.8h, v19.8h,  v5.h[3]

        # Is there a remainder?- 1-3 halffloats of A (2-6 bytes)
        B.NE    4f

3:
        # Clamp
        # v4 and v5 are free here: every path reloads A4/A5 before using them.
        DUP     v4.8h, v6.h[0]          // lower bound, applied with FMAX
        DUP     v5.8h, v6.h[1]          // upper bound, applied with FMIN
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        SUBS    x1, x1, 16              // nc -= 16; flags drive B.LO and B.HI below
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        # Taken when fewer than 16 columns were left: go store a partial tile.
        B.LO    6f

        # Each C pointer advances by cn_stride; each A pointer is rewound by
        # kc so the next tile re-reads the same rows. ST1 and SUB leave the
        # flags of the SUBS above intact for the B.HI.
        ST1     {v20.16b, v21.16b},  [x6], x8
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x8
        SUB     x9,  x9, x2             // a1 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x8
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x8
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x8
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v30.16b, v31.16b},  [x7], x8
        SUB     x4,  x4, x2             // a5 -= kc

        B.HI    0b                      // more columns remain (nc was > 16)
        RET

        # Remainder- 1-3 halffloats of A (2-6 bytes)
        # Only bits 2 and 1 of x0 are tested, so it does not matter whether x0
        # is the exact leftover (arriving from the epilogue) or kc - 8
        # (arriving from label 0 when kc < 8): subtracting 8 leaves the low
        # three bits unchanged.
4:
        TBZ     x0, 2, 5f               // skip unless 2 halffloats (4 bytes) remain
        LDR     s0,  [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     s1,  [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5,  [x4], 4
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]
        TBZ     x0, 1, 3b               // done unless 1 more halffloat (2 bytes) remains

        # Final single halffloat of k
5:
        LDR     h0,  [x3], 2
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     h1,  [x9], 2
        LDR     h2, [x10], 2
        LDR     h3, [x11], 2
        LDR     h4, [x12], 2
        LDR     h5,  [x4], 2
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        B       3b

        # Store odd width
        # x1 is (remaining nc) - 16 here; its low four bits equal those of the
        # remaining nc, so each bit selects one power-of-two chunk of columns.
        # After each chunk the not-yet-stored lanes are shifted down to lane 0.
        # This path returns without rewinding A, i.e. it handles the last tile.
6:
        TBZ     x1, 3, 7f               // 8 columns (16 bytes)?
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b

7:
        TBZ     x1, 2, 8f               // 4 columns (8 bytes)?
        STR     d20,  [x6], 8
        STR     d22, [x16], 8
        DUP     d20, v20.d[1]
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        STR     d26, [x14], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x13], 8
        STR     d30,  [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

8:
        TBZ     x1, 1, 9f               // 2 columns (4 bytes)?
        STR     s20,  [x6], 4
        STR     s22, [x16], 4
        DUP     s20, v20.s[1]
        DUP     s22, v22.s[1]
        STR     s24, [x17], 4
        STR     s26, [x14], 4
        DUP     s24, v24.s[1]
        DUP     s26, v26.s[1]
        STR     s28, [x13], 4
        STR     s30,  [x7], 4
        DUP     s28, v28.s[1]
        DUP     s30, v30.s[1]

9:
        TBZ     x1, 0, 10f              // 1 column (2 bytes)?
        STR     h20,  [x6]
        STR     h22, [x16]
        STR     h24, [x17]
        STR     h26, [x14]
        STR     h28, [x13]
        STR     h30,  [x7]
10:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75
429
430#ifdef __ELF__
431.section ".note.GNU-stack","",%progbits
432#endif
433