xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2017-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifdef __aarch64__
25 
26 #include <arm_neon.h>
27 
28 #include "../../asmlib.hpp"
29 
30 // Kernel implementation.
31 //
32 // Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
33 // Assume that "Bpanel" points to a chunk of B blocks (each size 6xK) in read-order.
34 // Assume that "Cpanel" points to a chunk of C output blocks (each size
35 // 6x8), the chunks being arranged in a row major fashion.
36 //
37 // Note that the intent of this is that either ablocks or bblocks will be 1
38 // - this construction allows the output loop to proceed in either order.
39 
40 namespace arm_gemm {
41 
// Single-precision GEMM micro-kernel implemented with AArch64 Advanced SIMD
// inline assembly.
//
// For every A block (outer loop, "ablocks" of them) and every B block (inner
// loop, "bblocks" of them) the kernel accumulates products over K steps and
// then writes one output block of 48 floats (192 bytes) to the C panel.
//
// Per K step the assembly consumes:
//   - 8 consecutive floats from the A panel, each broadcast to both lanes of a
//     64-bit vector with ld1r;
//   - 6 consecutive floats from the B panel, fetched as three 64-bit loads.
//
// Output blocks are overwritten, not accumulated into: the accumulators are
// zeroed before each block and the results are written with plain stores.
//
// Parameters:
//   Apanel  - start of the packed A data.  The A pointer is rewound to the
//             start of the current A block for every B block and is left just
//             past that block by the assembly, which is where the next A block
//             is taken from.
//   Bpanel  - start of the packed B data; re-read from the beginning for every
//             A block.
//   Cpanel  - destination; advanced by 192 bytes per (A block, B block) pair.
//   ablocks - number of A blocks to process.
//   bblocks - number of B blocks to process.
//   K       - number of accumulation steps per block.
//
// NOTE(review): K <= 0 is not guarded against.  For K == 0 the iteration count
// below evaluates to -1, so the "cbz" skip is not taken and the main loop
// would run with a wrapping counter.  Presumably callers guarantee K >= 1 -
// confirm.
a64_sgemm_asimd_8x6(const float * Apanel,const float * Bpanel,float * Cpanel,int ablocks,int bblocks,int K)42 void a64_sgemm_asimd_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
43     const float *a_ptr = Apanel;
44     float *c_ptr = Cpanel;
45 
46     for (int yb=0; yb<ablocks; yb++) {
        // Remember where this A block starts so it can be replayed against
        // every B block; B always restarts from the beginning of its panel.
47         const float *a_ptr0 = a_ptr;
48         const float *b_ptr = Bpanel;
49 
50         for (int xb=0; xb<bblocks; xb++) {
51             a_ptr = a_ptr0;
52             // Fix up for odd lengths - set a flag if K is odd, but make
53             // sure we round up the iteration count.
            // The main loop handles two K steps per pass; the last one (odd K)
            // or two (even K) steps are always done by the detached tail, hence
            // the "- 1".
54             int oddk = (K & 1);
55             int k = ((K+1)/2) - 1;
56 
            // Operand registers are pinned to v0-v6 so they cannot collide with
            // the accumulators v8-v31 that the assembly names directly.  They
            // are passed as "+w" without being initialised here; every one of
            // them is written by a load inside the assembly before it is read.
57             register float32x4_t a0  asm("v0");
58             register float32x4_t a1  asm("v1");
59             register float32x4_t a2  asm("v2");
60             register float32x4_t a3  asm("v3");
61             register float32x4_t b0  asm("v4");
62             register float32x4_t b1  asm("v5");
63             register float32x4_t b2  asm("v6");
64 
65             __asm __volatile (
                // Accumulator map (only the low two float lanes of each
                // register are used):
                //   v8 -v15 : b0 (B values 0,1) times A values 0..7
                //   v16-v23 : b1 (B values 2,3) times A values 0..7
                //   v24-v31 : b2 (B values 4,5) times A values 0..7
                // a0..a3 are reused: they hold A values 0..3 first and A values
                // 4..7 afterwards within the same K step.
                //
                // ASM_PREFETCH / ASM_PREFETCHU come from asmlib.hpp, which is
                // not visible here; presumably they emit prefetch hints for
                // the given address - confirm.
66                 // Initialize result registers, load initial operands, prime prefetches.
67                 "movi	v8.2s, #0x0\n"
68                 "ld1r	{ %[a0].2s }, [%[a_ptr]], #4\n"
69                 "movi	v9.2s, #0x0\n"
70                 "movi	v10.2s, #0x0\n"
71                 "ld1r	{ %[a1].2s }, [%[a_ptr]], #4\n"
72                 "movi	v11.2s, #0x0\n"
73                 "movi	v12.2s, #0x0\n"
74                 "movi	v13.2s, #0x0\n"
75                 "movi	v14.2s, #0x0\n"
76                 ASM_PREFETCH("[%[b_ptr], #64]")
77                 ASM_PREFETCHU("[%[a_ptr], #52]")
78                 ASM_PREFETCHU("[%[a_ptr], #116]")
79                 ASM_PREFETCH("[%[b_ptr], #128]")
80                 "movi	v15.2s, #0x0\n"
81                 "movi	v16.2s, #0x0\n"
82                 "movi	v17.2s, #0x0\n"
83                 "movi	v18.2s, #0x0\n"
84                 "movi	v19.2s, #0x0\n"
85                 "movi	v20.2s, #0x0\n"
86                 "movi	v21.2s, #0x0\n"
87                 "movi	v22.2s, #0x0\n"
88                 "movi	v23.2s, #0x0\n"
89                 "movi	v24.2s, #0x0\n"
90                 "movi	v25.2s, #0x0\n"
91                 "movi	v26.2s, #0x0\n"
92                 "movi	v27.2s, #0x0\n"
93                 "movi	v28.2s, #0x0\n"
94                 "movi	v29.2s, #0x0\n"
95                 "movi	v30.2s, #0x0\n"
96                 "movi	v31.2s, #0x0\n"
97 
98                 // Skip loop if we are doing zero iterations of it.
99                 "cbz	%w[k], 4f\n"
100 
101                 // Loop proper
                // Each pass covers two K steps: 16 floats of A and 48 bytes
                // (12 floats) of B.  On entry a0/a1 already hold the first two
                // A values of the step; on exit they hold the first two A
                // values of the step that follows.
102                 "1:\n"
                // First K step of the pass (B at offsets 0/8/16).
103                 "ldr	%d[b0], [%[b_ptr], #0]\n"
104                 "ld1r	{ %[a2].2s }, [%[a_ptr]], #4\n"
105                 "ldr	%d[b1], [%[b_ptr], #8]\n"
106                 "fmla 	v8.2s , %[b0].2s, %[a0].2s\n"
107                 "fmla  	v9.2s , %[b0].2s, %[a1].2s\n"
108                 "fmla	v10.2s, %[b0].2s, %[a2].2s\n"
109 
110                 "ld1r	{ %[a3].2s }, [%[a_ptr]], #4\n"
111                 "fmla	v16.2s, %[b1].2s, %[a0].2s\n"
112                 "fmla	v17.2s, %[b1].2s, %[a1].2s\n"
113                 "fmla	v11.2s, %[b0].2s, %[a3].2s\n"
114 
115                 "ldr	%d[b2], [%[b_ptr], #16]\n"
116                 "fmla	v18.2s, %[b1].2s, %[a2].2s\n"
117                 "fmla	v19.2s, %[b1].2s, %[a3].2s\n"
118                 "fmla	v24.2s, %[b2].2s, %[a0].2s\n"
119 
120                 "ld1r	{ %[a0].2s }, [%[a_ptr]], #4\n"
121                 "fmla	v25.2s, %[b2].2s, %[a1].2s\n"
122                 "fmla	v26.2s, %[b2].2s, %[a2].2s\n"
123                 "fmla	v27.2s, %[b2].2s, %[a3].2s\n"
124 
125                 "ld1r	{ %[a1].2s }, [%[a_ptr]], #4\n"
126                 "fmla 	v12.2s, %[b0].2s, %[a0].2s\n"
127                 "fmla	v20.2s, %[b1].2s, %[a0].2s\n"
128                 "fmla	v28.2s, %[b2].2s, %[a0].2s\n"
129 
130                 "ld1r	{ %[a2].2s }, [%[a_ptr]], #4\n"
131                 "fmla	v13.2s, %[b0].2s, %[a1].2s\n"
132                 "fmla	v21.2s, %[b1].2s, %[a1].2s\n"
133                 "fmla	v29.2s, %[b2].2s, %[a1].2s\n"
134 
135                 "ld1r	{ %[a3].2s }, [%[a_ptr]], #4\n"
136                 "fmla	v14.2s, %[b0].2s, %[a2].2s\n"
137                 "fmla	v22.2s, %[b1].2s, %[a2].2s\n"
138                 "fmla	v30.2s, %[b2].2s, %[a2].2s\n"
139 
140                 "ld1r	{ %[a0].2s }, [%[a_ptr]], #4\n"
141                 "fmla	v15.2s, %[b0].2s, %[a3].2s\n"
142                 "fmla	v23.2s, %[b1].2s, %[a3].2s\n"
143                 "fmla	v31.2s, %[b2].2s, %[a3].2s\n"
144 
                // Second K step of the pass (B at offsets 24/32/40).  The loop
                // counter is decremented here; the flags it sets are consumed
                // by the "bne" at the bottom, and no instruction in between
                // writes the flags.
145                 "ld1r	{ %[a1].2s }, [%[a_ptr]], #4\n"
146                  ASM_PREFETCH("[%[b_ptr], #128]")
147                 "subs	%w[k], %w[k], #1\n"
148                  ASM_PREFETCHU("[%[a_ptr], #156]")
149                 "ldr	%d[b0], [%[b_ptr], #24]\n"
150                 "ld1r	{ %[a2].2s }, [%[a_ptr]], #4\n"
151 
152                 "ldr	%d[b1], [%[b_ptr], #32]\n"
153                 "fmla 	v8.2s , %[b0].2s, %[a0].2s\n"
154                 "fmla  	v9.2s , %[b0].2s, %[a1].2s\n"
155                 "fmla	v10.2s, %[b0].2s, %[a2].2s\n"
156 
157                 "ld1r	{ %[a3].2s }, [%[a_ptr]], #4\n"
158                 "fmla	v16.2s, %[b1].2s, %[a0].2s\n"
159                 "fmla	v17.2s, %[b1].2s, %[a1].2s\n"
160                 "fmla	v11.2s, %[b0].2s, %[a3].2s\n"
161 
162                 "ldr	%d[b2], [%[b_ptr], #40]\n"
163                 "fmla	v18.2s, %[b1].2s, %[a2].2s\n"
164                 "fmla	v19.2s, %[b1].2s, %[a3].2s\n"
165                 "fmla	v24.2s, %[b2].2s, %[a0].2s\n"
166 
167                 "ld1r	{ %[a0].2s }, [%[a_ptr]], #4\n"
168                 "fmla	v25.2s, %[b2].2s, %[a1].2s\n"
169                 "fmla	v26.2s, %[b2].2s, %[a2].2s\n"
170                 "fmla	v27.2s, %[b2].2s, %[a3].2s\n"
171 
172                 "ld1r	{ %[a1].2s }, [%[a_ptr]], #4\n"
173                 "fmla 	v12.2s, %[b0].2s, %[a0].2s\n"
174                 "fmla	v20.2s, %[b1].2s, %[a0].2s\n"
175                 "fmla	v28.2s, %[b2].2s, %[a0].2s\n"
176 
177                 "ld1r	{ %[a2].2s }, [%[a_ptr]], #4\n"
178                 "fmla	v13.2s, %[b0].2s, %[a1].2s\n"
179                 "fmla	v21.2s, %[b1].2s, %[a1].2s\n"
180                 "fmla	v29.2s, %[b2].2s, %[a1].2s\n"
181 
182                 "ld1r	{ %[a3].2s }, [%[a_ptr]], #4\n"
183                 "fmla	v14.2s, %[b0].2s, %[a2].2s\n"
184                 "fmla	v22.2s, %[b1].2s, %[a2].2s\n"
185                 "fmla	v30.2s, %[b2].2s, %[a2].2s\n"
186 
187                 "ld1r	{ %[a0].2s }, [%[a_ptr]], #4\n"
188                 "fmla	v15.2s, %[b0].2s, %[a3].2s\n"
189                 "fmla	v23.2s, %[b1].2s, %[a3].2s\n"
190                 "fmla	v31.2s, %[b2].2s, %[a3].2s\n"
191 
192                 "ld1r	{ %[a1].2s }, [%[a_ptr]], #4\n"
193                 "add	%[b_ptr], %[b_ptr], #48\n"
194                  ASM_PREFETCHU("[%[a_ptr], #188]")
195                 "bne	1b\n"
196 
197                 // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
198                 "4:\n"
199                 ASM_PREFETCH("[%[c_ptr]]")
200                 ASM_PREFETCH("[%[c_ptr], #64]")
201 
                // These two loads are common to both tails, so they are issued
                // before the even/odd split.
202                 "ldr	%d[b0], [%[b_ptr]]\n"
203                 "ld1r	{ %[a2].2s }, [%[a_ptr]], #4\n"
204 
205                 // Branch to alternative tail for odd K
206                 "cbnz	%w[oddk], 2f\n"
207 
208                 // Detached final iteration (even K)
                // Two K steps remain.  Unlike the main loop, nothing is loaded
                // ahead for a following step, so a_ptr ends exactly at the end
                // of the A block.
209                 "ldr	%d[b1], [%[b_ptr], #8]\n"
210                 "fmla 	v8.2s , %[b0].2s, %[a0].2s\n"
211                 "fmla  	v9.2s , %[b0].2s, %[a1].2s\n"
212                 "fmla	v10.2s, %[b0].2s, %[a2].2s\n"
213 
214                 "ld1r	{ %[a3].2s }, [%[a_ptr]], #4\n"
215                 "fmla	v16.2s, %[b1].2s, %[a0].2s\n"
216                 "fmla	v17.2s, %[b1].2s, %[a1].2s\n"
217                 "fmla	v11.2s, %[b0].2s, %[a3].2s\n"
218 
219                 "ldr	%d[b2], [%[b_ptr], #16]\n"
220                 "fmla	v18.2s, %[b1].2s, %[a2].2s\n"
221                 "fmla	v19.2s, %[b1].2s, %[a3].2s\n"
222                 "fmla	v24.2s, %[b2].2s, %[a0].2s\n"
223 
224                 "ld1r	{ %[a0].2s }, [%[a_ptr]], #4\n"
225                 "fmla	v25.2s, %[b2].2s, %[a1].2s\n"
226                 "fmla	v26.2s, %[b2].2s, %[a2].2s\n"
227                 "fmla	v27.2s, %[b2].2s, %[a3].2s\n"
228 
229                 "ld1r	{ %[a1].2s }, [%[a_ptr]], #4\n"
230                 "fmla 	v12.2s, %[b0].2s, %[a0].2s\n"
231                 "fmla	v20.2s, %[b1].2s, %[a0].2s\n"
232                 "fmla	v28.2s, %[b2].2s, %[a0].2s\n"
233 
234                 "ld1r	{ %[a2].2s }, [%[a_ptr]], #4\n"
235                 "fmla	v13.2s, %[b0].2s, %[a1].2s\n"
236                 "fmla	v21.2s, %[b1].2s, %[a1].2s\n"
237                 "fmla	v29.2s, %[b2].2s, %[a1].2s\n"
238 
239                 "ld1r	{ %[a3].2s }, [%[a_ptr]], #4\n"
240                 "fmla	v14.2s, %[b0].2s, %[a2].2s\n"
241                 "fmla	v22.2s, %[b1].2s, %[a2].2s\n"
242                 "fmla	v30.2s, %[b2].2s, %[a2].2s\n"
243 
244                 "ld1r	{ %[a0].2s }, [%[a_ptr]], #4\n"
245                 "fmla	v15.2s, %[b0].2s, %[a3].2s\n"
246                 "fmla	v23.2s, %[b1].2s, %[a3].2s\n"
247                 "fmla	v31.2s, %[b2].2s, %[a3].2s\n"
248 
                // b_ptr is advanced by the full 48 bytes before the remaining B
                // loads of the second step, which is why those loads use the
                // negative offsets -16 and -8 (i.e. +32 and +40 from the old
                // pointer).
249                 "ldr	%d[b0], [%[b_ptr], #24]\n"
250                 "add	%[b_ptr], %[b_ptr], #48\n"
251                  ASM_PREFETCH("[%[b_ptr], #128]")
252                 "ld1r	{ %[a1].2s }, [%[a_ptr]], #4\n"
253                 "ld1r	{ %[a2].2s }, [%[a_ptr]], #4\n"
254 
255                 "ldr	%d[b1], [%[b_ptr], #-16]\n"
256                 "fmla 	v8.2s , %[b0].2s, %[a0].2s\n"
257                 "fmla  	v9.2s , %[b0].2s, %[a1].2s\n"
258                 "fmla	v10.2s, %[b0].2s, %[a2].2s\n"
259 
260                 "ld1r	{ %[a3].2s }, [%[a_ptr]], #4\n"
261                 "fmla	v16.2s, %[b1].2s, %[a0].2s\n"
262                 "fmla	v17.2s, %[b1].2s, %[a1].2s\n"
263                 "fmla	v11.2s, %[b0].2s, %[a3].2s\n"
264 
265                 "ldr	%d[b2], [%[b_ptr], #-8]\n"
266                 "fmla	v18.2s, %[b1].2s, %[a2].2s\n"
267                 "fmla	v19.2s, %[b1].2s, %[a3].2s\n"
268                 "fmla	v24.2s, %[b2].2s, %[a0].2s\n"
269 
270                 "ld1r	{ %[a0].2s }, [%[a_ptr]], #4\n"
271                 "fmla	v25.2s, %[b2].2s, %[a1].2s\n"
272                 "fmla	v26.2s, %[b2].2s, %[a2].2s\n"
273                 "fmla	v27.2s, %[b2].2s, %[a3].2s\n"
274 
275                 "ld1r	{ %[a1].2s }, [%[a_ptr]], #4\n"
276                 "fmla 	v12.2s, %[b0].2s, %[a0].2s\n"
277                 "fmla	v20.2s, %[b1].2s, %[a0].2s\n"
278                 "fmla	v28.2s, %[b2].2s, %[a0].2s\n"
279 
280                 "ld1r	{ %[a2].2s }, [%[a_ptr]], #4\n"
281                 "fmla	v13.2s, %[b0].2s, %[a1].2s\n"
282                 "fmla	v21.2s, %[b1].2s, %[a1].2s\n"
283                 "fmla	v29.2s, %[b2].2s, %[a1].2s\n"
284 
285                 "ld1r	{ %[a3].2s }, [%[a_ptr]], #4\n"
286                 "fmla	v14.2s, %[b0].2s, %[a2].2s\n"
287                 "fmla	v22.2s, %[b1].2s, %[a2].2s\n"
288                 "fmla	v30.2s, %[b2].2s, %[a2].2s\n"
289 
290                 "fmla	v15.2s, %[b0].2s, %[a3].2s\n"
291                 "fmla	v23.2s, %[b1].2s, %[a3].2s\n"
292                 "fmla	v31.2s, %[b2].2s, %[a3].2s\n"
293 
294                 "b	3f\n"
295 
296                 // Detached final iteration (odd K)
                // Exactly one K step remains, so only 24 bytes of B are
                // consumed and b_ptr is advanced by 24 at the end.
297                 "2:\n"
298                 "ldr	%d[b1], [%[b_ptr], #8]\n"
299                 "fmla 	v8.2s , %[b0].2s, %[a0].2s\n"
300                 "fmla  	v9.2s , %[b0].2s, %[a1].2s\n"
301                 "fmla	v10.2s, %[b0].2s, %[a2].2s\n"
302 
303                 "ld1r	{ %[a3].2s }, [%[a_ptr]], #4\n"
304                 "fmla	v16.2s, %[b1].2s, %[a0].2s\n"
305                 "fmla	v17.2s, %[b1].2s, %[a1].2s\n"
306                 "fmla	v11.2s, %[b0].2s, %[a3].2s\n"
307 
308                 "ldr	%d[b2], [%[b_ptr], #16]\n"
309                 "fmla	v18.2s, %[b1].2s, %[a2].2s\n"
310                 "fmla	v19.2s, %[b1].2s, %[a3].2s\n"
311                 "fmla	v24.2s, %[b2].2s, %[a0].2s\n"
312 
313                 "ld1r	{ %[a0].2s }, [%[a_ptr]], #4\n"
314                 "fmla	v25.2s, %[b2].2s, %[a1].2s\n"
315                 "fmla	v26.2s, %[b2].2s, %[a2].2s\n"
316                 "fmla	v27.2s, %[b2].2s, %[a3].2s\n"
317 
318                 "ld1r	{ %[a1].2s }, [%[a_ptr]], #4\n"
319                 "fmla 	v12.2s, %[b0].2s, %[a0].2s\n"
320                 "fmla	v20.2s, %[b1].2s, %[a0].2s\n"
321                 "fmla	v28.2s, %[b2].2s, %[a0].2s\n"
322 
323                 "ld1r	{ %[a2].2s }, [%[a_ptr]], #4\n"
324                 "fmla	v13.2s, %[b0].2s, %[a1].2s\n"
325                 "fmla	v21.2s, %[b1].2s, %[a1].2s\n"
326                 "fmla	v29.2s, %[b2].2s, %[a1].2s\n"
327 
328                 "ld1r	{ %[a3].2s }, [%[a_ptr]], #4\n"
329                 "fmla	v14.2s, %[b0].2s, %[a2].2s\n"
330                 "fmla	v22.2s, %[b1].2s, %[a2].2s\n"
331                 "fmla	v30.2s, %[b2].2s, %[a2].2s\n"
332 
333                 "fmla	v15.2s, %[b0].2s, %[a3].2s\n"
334                 "fmla	v23.2s, %[b1].2s, %[a3].2s\n"
335                 "fmla	v31.2s, %[b2].2s, %[a3].2s\n"
336 
337                 "add	%[b_ptr], %[b_ptr], #24\n"
338 
339                 // Common tail
                // Write the block out as 8 rows of 6 floats (24 bytes per row):
                // row r is the triple (v[8+r], v[16+r], v[24+r]), i.e. A value
                // r multiplied by B values 0..5.  c_ptr then moves on by the
                // 192 bytes just written.
340                 "3:\n"
341                 "str	d8, [%[c_ptr], #0]\n"
342                 "str	d16, [%[c_ptr], #8]\n"
343                 "str	d24, [%[c_ptr], #16]\n"
344                 "str	d9, [%[c_ptr], #24]\n"
345                 "str	d17, [%[c_ptr], #32]\n"
346                 "str	d25, [%[c_ptr], #40]\n"
347                 "str	d10, [%[c_ptr], #48]\n"
348                 "str	d18, [%[c_ptr], #56]\n"
349                 "str	d26, [%[c_ptr], #64]\n"
350                 "str	d11, [%[c_ptr], #72]\n"
351                 "str	d19, [%[c_ptr], #80]\n"
352                 "str	d27, [%[c_ptr], #88]\n"
353                 "str	d12, [%[c_ptr], #96]\n"
354                 "str	d20, [%[c_ptr], #104]\n"
355                 "str	d28, [%[c_ptr], #112]\n"
356                 "str	d13, [%[c_ptr], #120]\n"
357                 "str	d21, [%[c_ptr], #128]\n"
358                 "str	d29, [%[c_ptr], #136]\n"
359                 "str	d14, [%[c_ptr], #144]\n"
360                 "str	d22, [%[c_ptr], #152]\n"
361                 "str	d30, [%[c_ptr], #160]\n"
362                 "str	d15, [%[c_ptr], #168]\n"
363                 "str	d23, [%[c_ptr], #176]\n"
364                 "str	d31, [%[c_ptr], #184]\n"
365                 "add	%[c_ptr], %[c_ptr], #192\n"
366 
            // Outputs: the three panel pointers and the loop counter are
            // read-modify-write; the pinned vector operands are listed as "+w"
            // so the compiler keeps v0-v6 reserved for this statement.
367             :
368               [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
369               [a0] "+w" (a0), [a1] "+w" (a1), [a2] "+w" (a2), [a3] "+w" (a3),
370               [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
371             : [oddk] "r" (oddk)
            // Clobbers: every accumulator named directly in the assembly, the
            // condition flags (written by "subs") and memory (the C stores).
            // NOTE(review): x20 and x21 are listed but are not referenced by
            // any instruction string visible in this statement - presumably
            // left over from a related kernel; harmless, but confirm the
            // prefetch macros do not use them before removing.
372             : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
373               "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
374             );
375         }
376     }
377 }
378 
379 } // namespace arm_gemm
380 
381 #endif
382