// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            r2 -> r5
//     const uint8_t*restrict a,             r3
//     size_t a_stride,          sp + 96  -> (r11)
//     const void*restrict w,    sp + 100 -> r9
//     uint8_t*restrict c,       sp + 104 -> r6
//     size_t cm_stride,         sp + 108 -> (r7)
//     size_t cn_stride,         sp + 112 -> r11
//     const union xnn_f32_minmax_params params)  sp + 116 -> (r5)

// d8-d15, r4-r11, r14(lr) need to be preserved if used. r13(sp), r15(pc) are reserved.

// Register usage

// A0   r3  s0-s1  d0
// A1  r12  s2-s3  d1
// A2  r10  s4-s5  d2
// A3   r0  s6-s7  d3

// B    r9  s12, s13, s14, s15 d6-d7
// B        s10, s11, s12, s13 d5-d6

// C0   r6 s16-s17  d8  s18-s19  d9
// C1   r4 s20-s21 d10  s22-s23 d11
// C2   r8 s24-s25 d12  s26-s27 d13
// C3   r7 s28-s29 d14  s30-s31 d15

// Clamp (r5) s8, s9 d4
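//   s8 holds the lower (min) bound and s9 the upper (max) bound; both are
//   loaded together from params as d4 and applied in the clamp block below.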

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64
        .arm
#ifndef __APPLE__
        .arch   armv6
        .fpu    vfp
#endif
        # Push 96 bytes
        PUSH    {r4, r5, r6, r7, r8, r9, r10, r11}  // 32
        VPUSH   {d8-d15}                            // +64 = 96
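        # After the 96-byte push, the stack-passed arguments start at sp + 96,
        # which is where the loads below find a_stride, w, c, the strides and params.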

        LDR     r11, [sp, 96]           // Load a_stride
        LDRD    r6, r7, [sp, 104]       // Load c and cm_stride
        LDR     r5,  [sp, 116]          // Load params

        # Clamp A and C pointers
        CMP     r0, 2                   // if mr >= 2
        ADD     r12, r3, r11            //   a1 = a0 + a_stride
        ADD     r4, r6, r7              //   c1 = c0 + cm_stride
        MOVLO   r12, r3                 // a1
        MOVLO   r4, r6                  // c1

        LDR     r9, [sp, 100]           // Load w

                                        // if mr > 2
        ADD     r10, r12, r11           //   a2 = a1 + a_stride
        ADD     r8, r4, r7              //   c2 = c1 + cm_stride
        MOVLS   r10, r12                // a2
        MOVLS   r8, r4                  // c2

        VLDR    d4, [r5]                // Load min/max values

        CMP     r0, 4                   // if mr >= 4
        ADD     r0, r10, r11            //   a3 = a2 + a_stride
        ADD     r7, r8, r7              //   c3 = c2 + cm_stride
        LDR     r11, [sp, 112]          // Load cn_stride
        MOVLO   r0, r10                 // a3
        MOVLO   r7, r8                  // c3


0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d8-d9}            // Bias
        SUBS    r5, r2, 8               // k = kc - 8 bytes (2 floats of K)
        VMOV.F64 d10, d8
        VMOV.F64 d12, d8
        VMOV.F64 d14, d8
        VMOV.F64 d11, d9
        VMOV.F64 d13, d9
        VMOV.F64 d15, d9
        BLO     3f                      // less than 2 channels?

        # Main loop - 2 floats of A (8 bytes)
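        # Per iteration, B row 0 is loaded into d6-d7 (s12-s15) and B row 1 into
        # d5-d6 (s10-s13); the shifted destination lets the second load be issued
        # mid-block without clobbering s14/s15, which still feed the remaining
        # row-0 multiplies.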
1:
        VLDM    r3!, {d0}               // A0
        VLDM    r9!, {d6-d7}            // B0
        VLDM    r12!, {d1}              // A1
        VLDM    r10!, {d2}              // A2
        VLDM    r0!, {d3}               // A3

        VMLA.F32 s16, s12, s0
        VMLA.F32 s17, s13, s0
        VMLA.F32 s20, s12, s2
        VMLA.F32 s21, s13, s2
        VMLA.F32 s24, s12, s4
        VMLA.F32 s25, s13, s4
        VMLA.F32 s28, s12, s6
        VMLA.F32 s29, s13, s6

        VMLA.F32 s18, s14, s0
        VMLA.F32 s19, s15, s0
        VMLA.F32 s22, s14, s2
        VMLA.F32 s23, s15, s2
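        # B row 1 goes to d5-d6, so d7 (s14/s15) still holds B row 0 columns 2-3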
        VLDM    r9!, {d5-d6}            // B1
        VMLA.F32 s26, s14, s4
        VMLA.F32 s27, s15, s4
        VMLA.F32 s30, s14, s6
        VMLA.F32 s31, s15, s6

        VMLA.F32 s16, s10, s1
        VMLA.F32 s17, s11, s1
        VMLA.F32 s20, s10, s3
        VMLA.F32 s21, s11, s3
        VMLA.F32 s24, s10, s5
        VMLA.F32 s25, s11, s5
        VMLA.F32 s28, s10, s7
        VMLA.F32 s29, s11, s7

        SUBS    r5, r5, 8               // k -= 2 floats (8 bytes)

        VMLA.F32 s18, s12, s1
        VMLA.F32 s19, s13, s1
        VMLA.F32 s22, s12, s3
        VMLA.F32 s23, s13, s3
        VMLA.F32 s26, s12, s5
        VMLA.F32 s27, s13, s5
        VMLA.F32 s30, s12, s7
        VMLA.F32 s31, s13, s7

        BHS     1b

        # Is there a remainder? - 1 float of A (4 bytes)
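        # (kc is a byte count; bit 2 is unchanged by the repeated subtractions
        #  of 8 above, so it is set iff a single float of K is left)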
        TST     r5, 4
        BNE     3f

2:
        # Clamp
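        # Scalar VFP clamp: for each accumulator, acc = max(acc, min) via
        # VCMPE s8, acc / VMOVPL, then acc = min(acc, max) via VCMPE s9, acc /
        # VMOVMI.  The compare for the next accumulator is interleaved ahead of
        # the conditional move for the previous one, keeping the
        # VCMPE/VMRS/VMOVcc chains pipelined.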
        VCMPE.F32 s8, s16
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s17
        VMOVPL.F32 s16, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s18
        VMOVPL.F32 s17, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s19
        VMOVPL.F32 s18, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s20
        VMOVPL.F32 s19, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s21
        VMOVPL.F32 s20, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s22
        VMOVPL.F32 s21, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s23
        VMOVPL.F32 s22, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s24
        VMOVPL.F32 s23, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s25
        VMOVPL.F32 s24, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s26
        VMOVPL.F32 s25, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s27
        VMOVPL.F32 s26, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s28
        VMOVPL.F32 s27, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s29
        VMOVPL.F32 s28, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s30
        VMOVPL.F32 s29, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s8, s31
        VMOVPL.F32 s30, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s16
        VMOVPL.F32 s31, s8
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s17
        VMOVMI.F32 s16, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s18
        VMOVMI.F32 s17, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s19
        VMOVMI.F32 s18, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s20
        VMOVMI.F32 s19, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s21
        VMOVMI.F32 s20, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s22
        VMOVMI.F32 s21, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s23
        VMOVMI.F32 s22, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s24
        VMOVMI.F32 s23, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s25
        VMOVMI.F32 s24, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s26
        VMOVMI.F32 s25, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s27
        VMOVMI.F32 s26, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s28
        VMOVMI.F32 s27, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s29
        VMOVMI.F32 s28, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s30
        VMOVMI.F32 s29, s9
        VMRS    APSR_nzcv, FPSCR
        VCMPE.F32 s9, s31
        VMOVMI.F32 s30, s9
        VMRS    APSR_nzcv, FPSCR
        VMOVMI.F32 s31, s9

        SUBS    r1, r1, 4
        BLO     4f

        # Store full 4 x 4
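        # Each C pointer advances by cn_stride for the next 4-column tile; the
        # A pointers are rewound by kc so the same rows of A are reused, and
        # BHI loops back while more columns (nc) remain.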
        VSTM    r6, {d8-d9}
        SUB     r0, r0, r2
        ADD     r6, r11
        VSTM    r4, {d10-d11}
        SUB     r10, r10, r2
        ADD     r4, r11
        VSTM    r8, {d12-d13}
        SUB     r12, r12, r2
        ADD     r8, r11
        VSTM    r7, {d14-d15}
        SUB     r3, r3, r2
        ADD     r7, r11
        BHI     0b

        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

3:
        # Remainder - 1 float of A (4 bytes)
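        # One more K element: each row of A contributes a single float against
        # the last 4-wide row of B.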
        VLDM    r3!, {s0}               // A0
        VLDM    r9!, {d6-d7}            // B
        VLDM    r12!, {s1}              // A1
        VLDM    r10!, {s2}              // A2
        VLDM    r0!, {s3}               // A3

        VMLA.F32 s16, s12, s0
        VMLA.F32 s17, s13, s0
        VMLA.F32 s18, s14, s0
        VMLA.F32 s19, s15, s0

        VMLA.F32 s20, s12, s1
        VMLA.F32 s21, s13, s1
        VMLA.F32 s22, s14, s1
        VMLA.F32 s23, s15, s1

        VMLA.F32 s24, s12, s2
        VMLA.F32 s25, s13, s2
        VMLA.F32 s26, s14, s2
        VMLA.F32 s27, s15, s2

        VMLA.F32 s28, s12, s3
        VMLA.F32 s29, s13, s3
        VMLA.F32 s30, s14, s3
        VMLA.F32 s31, s15, s3

        B       2b

        # Store odd width
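        # nc & 2: store two columns and shift column 2 down into lane 0;
        # nc & 1: store the remaining single column.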
4:
        TST     r1, 2
        BEQ     5f
        VSTM    r6!, {d8}
        VMOV.F32 s16, s18
        VSTM    r4!, {d10}
        VMOV.F32 s20, s22
        VSTM    r8!, {d12}
        VMOV.F32 s24, s26
        VSTM    r7!, {d14}
        VMOV.F32 s28, s30

5:
        TST     r1, 1
        BEQ     6f
        VSTR    s16, [r6]
        VSTR    s20, [r4]
        VSTR    s24, [r8]
        VSTR    s28, [r7]

6:
        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif