// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            r2 -> r5
//     const uint8_t*restrict a,             r3
//     size_t a_stride,          sp + 96  -> (r11)
//     const void*restrict w,    sp + 100 -> r9
//     uint8_t*restrict c,       sp + 104 -> r6
//     size_t cm_stride,         sp + 108 -> (r7)
//     size_t cn_stride,         sp + 112 -> r11
//     const union xnn_f32_default_params params)  sp + 116 -> (r11)

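// For orientation only (a comment, not assembled): a rough C-style sketch of
// the computation below, assuming the packed layout of w implied by the loads:
// per group of 4 output columns, 4 bias floats followed by 4 weights per k step.
//
//   do {                                    // one group of up to 4 columns of C
//     for m, n in 0..3: acc[m][n] = w[n];   // 4 bias floats
//     w += 4;
//     for (k = 0; k * 4 < kc; k++) {        // kc is in bytes
//       for m, n in 0..3: acc[m][n] += a<m>[k] * w[n];
//       w += 4;
//     }
//     store acc to the c row pointers (partial store when nc < 4),
//     advance each c by cn_stride, rewind each a by kc bytes, nc -= 4;
//   } while (nc != 0);
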
// d8-d15, r4-r11, r14 (lr) need to be preserved if used. r13 (sp), r15 (pc) are reserved.

// Register usage

// A0   r3  s0-s1  d0
// A1  r12  s2-s3  d1
// A2  r10  s4-s5  d2
// A3   r0  s6-s7  d3

// B    r9   s8,  s9, s10, s11 d4-d5
// B        s12, s13, s14, s15 d6-d7

// C0   r6 s16-s17  d8  s18-s19  d9
// C1   r4 s20-s21 d10  s22-s23 d11
// C2   r8 s24-s25 d12  s26-s27 d13
// C3   r7 s28-s29 d14  s30-s31 d15

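// Note: in VFP, s2n and s2n+1 overlay dn (e.g. s16-s17 are the two halves of
// d8), so the accumulators written as s16-s31 by VMLA.F32 are the same
// registers loaded/stored as d8-d15 by VLDM/VSTM and preserved by VPUSH/VPOP.
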
BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64
        .arm
#ifndef __APPLE__
        .arch   armv6
        .fpu    vfp
#endif
        # Push 96 bytes
        PUSH    {r4, r5, r6, r7, r8, r9, r10, r11}  // 32
        VPUSH   {d8-d15}                            // +64 = 96

        LDR     r11, [sp, 96]           // Load a_stride
        LDRD    r6, r7, [sp, 104]       // Load c and cm_stride

        # Clamp A and C pointers
        CMP     r0, 2                   // if mr >= 2
        ADD     r12, r3, r11            //   a1 = a0 + a_stride
        ADD     r4, r6, r7              //   c1 = c0 + cm_stride
        MOVLO   r12, r3                 // a1
        MOVLO   r4, r6                  // c1

        LDR     r9, [sp, 100]           // Load w

                                        // if mr > 2
        ADD     r10, r12, r11           //   a2 = a1 + a_stride
        ADD     r8, r4, r7              //   c2 = c1 + cm_stride
        MOVLS   r10, r12                // a2
        MOVLS   r8, r4                  // c2

        CMP     r0, 4                   // if mr >= 4
        ADD     r0, r10, r11            //   a3 = a2 + a_stride
        ADD     r7, r8, r7              //   c3 = c2 + cm_stride
        LDR     r11, [sp, 112]          // Load cn_stride
        MOVLO   r0, r10                 // a3
        MOVLO   r7, r8                  // c3
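        # The ADDs above always execute; when mr is too small, the conditional
        # MOVs (LO after CMP mr, 2 for row 1, LS for row 2, LO after CMP mr, 4
        # for row 3) fall back to the previous row's A and C pointers, so no
        # branches are needed and excess rows simply alias a valid row.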
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d8-d9}            // Bias
        SUBS    r5, r2, 8               // k = kc - 8 (bytes)
        VMOV.F64 d10, d8
        VMOV.F64 d12, d8
        VMOV.F64 d14, d8
        VMOV.F64 d11, d9
        VMOV.F64 d13, d9
        VMOV.F64 d15, d9
        BLO     3f                      // less than 2 channels?

        # Main loop - 2 floats of A (8 bytes)
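        # Each iteration consumes 2 floats of K: one d register from each A row
        # and two groups of 4 B values (d4-d5, then d6-d7), issuing one VMLA.F32
        # per accumulator per k step (32 in total).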
1:
        VLDM    r3!, {d0}               // A0
        VLDM    r9!, {d4-d5}            // B0
        VLDM    r12!, {d1}              // A1
        VLDM    r10!, {d2}              // A2
        VLDM    r0!, {d3}               // A3

        VMLA.F32 s16, s8, s0
        VMLA.F32 s17, s9, s0
        VMLA.F32 s20, s8, s2
        VMLA.F32 s21, s9, s2
        VMLA.F32 s24, s8, s4
        VMLA.F32 s25, s9, s4
        VMLA.F32 s28, s8, s6
        VMLA.F32 s29, s9, s6

        VLDM    r9!, {d6-d7}            // B1

        VMLA.F32 s18, s10, s0
        VMLA.F32 s19, s11, s0
        VMLA.F32 s22, s10, s2
        VMLA.F32 s23, s11, s2
        VMLA.F32 s26, s10, s4
        VMLA.F32 s27, s11, s4
        VMLA.F32 s30, s10, s6
        VMLA.F32 s31, s11, s6

        VMLA.F32 s16, s12, s1
        VMLA.F32 s17, s13, s1
        VMLA.F32 s20, s12, s3
        VMLA.F32 s21, s13, s3
        VMLA.F32 s24, s12, s5
        VMLA.F32 s25, s13, s5
        VMLA.F32 s28, s12, s7
        VMLA.F32 s29, s13, s7

        SUBS    r5, r5, 8               // k -= 8 bytes (2 floats)

        VMLA.F32 s18, s14, s1
        VMLA.F32 s19, s15, s1
        VMLA.F32 s22, s14, s3
        VMLA.F32 s23, s15, s3
        VMLA.F32 s26, s14, s5
        VMLA.F32 s27, s15, s5
        VMLA.F32 s30, s14, s7
        VMLA.F32 s31, s15, s7

        BHS     1b                      // while k >= 0

        # Is there a remainder? - 1 float of A (4 bytes)
        TST     r5, 4
        BNE     3f

2:

        SUBS    r1, r1, 4               // nc -= 4
        BLO     4f                      // fewer than 4 columns remain

        # Store full 4 x 4
        VSTM    r6, {d8-d9}
        SUB     r0, r0, r2              // a3 -= kc
        ADD     r6, r11                 // c0 += cn_stride
        VSTM    r4, {d10-d11}
        SUB     r10, r10, r2            // a2 -= kc
        ADD     r4, r11                 // c1 += cn_stride
        VSTM    r8, {d12-d13}
        SUB     r12, r12, r2            // a1 -= kc
        ADD     r8, r11                 // c2 += cn_stride
        VSTM    r7, {d14-d15}
        SUB     r3, r3, r2              // a0 -= kc
        ADD     r7, r11                 // c3 += cn_stride
        BHI     0b                      // if nc > 0, process next group of 4 columns

        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

3:
        # Remainder - 1 float of A (4 bytes)
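        # One final k step: B is loaded into d6-d7 (s12-s15), matching the
        # second half of the main loop, then this rejoins the store code at 2:.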
        VLDM    r3!, {s0}               // A0
        VLDM    r9!, {d6-d7}            // B
        VLDM    r12!, {s1}              // A1
        VLDM    r10!, {s2}              // A2
        VLDM    r0!, {s3}               // A3

        VMLA.F32 s16, s12, s0
        VMLA.F32 s17, s13, s0
        VMLA.F32 s18, s14, s0
        VMLA.F32 s19, s15, s0

        VMLA.F32 s20, s12, s1
        VMLA.F32 s21, s13, s1
        VMLA.F32 s22, s14, s1
        VMLA.F32 s23, s15, s1

        VMLA.F32 s24, s12, s2
        VMLA.F32 s25, s13, s2
        VMLA.F32 s26, s14, s2
        VMLA.F32 s27, s15, s2

        VMLA.F32 s28, s12, s3
        VMLA.F32 s29, s13, s3
        VMLA.F32 s30, s14, s3
        VMLA.F32 s31, s15, s3

        B       2b

        # Store odd width
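        # nc here is 1 to 3 columns: store pairs first (nc & 2), moving the next
        # column into the low lane, then store a single column (nc & 1).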
4:
        TST     r1, 2                   // nc & 2?
        BEQ     5f
        VSTM    r6!, {d8}               // store 2 columns of row 0
        VMOV.F32 s16, s18               // move column 2 into s16
        VSTM    r4!, {d10}              // row 1
        VMOV.F32 s20, s22
        VSTM    r8!, {d12}              // row 2
        VMOV.F32 s24, s26
        VSTM    r7!, {d14}              // row 3
        VMOV.F32 s28, s30

5:
        TST     r1, 1                   // nc & 1?
        BEQ     6f
        VSTR    s16, [r6]               // store the remaining column of each row
        VSTR    s20, [r4]
        VSTR    s24, [r8]
        VSTR    s28, [r7]

6:
        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

END_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif