xref: /aosp_15_r20/external/XNNPACK/src/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> x10
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0 x13  v0  v6
31# A1 x15  v1  v7
32# B   x5  v4  v5  v8  v9
33# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
34# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
35# temp0   v2 v10 v12 v14
36# temp1   v3 v11 v13 v15
37# x16, x17, x20, x21 temporary a53 gpr load data
38
39
40BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
        # Overview
        # --------
        # Signed 8-bit indirect GEMM micro-kernel. Each pass of the nc loop
        # (label 0) produces a tile of 2 rows x 8 columns of int8 output.
        # For every pair of A pointers fetched from the indirection buffer
        # (ks loop, label 1) it multiplies kc bytes of each A row (kc is rounded
        # up to a multiple of 8) with the packed weights at x5 and accumulates
        # in 32-bit lanes. Label 4 reduces the lanes, scales in fp32, converts
        # back to integer, adds a 16-bit offset and clamps to int8.
        #
        # Local labels:
        #   0  nc loop (one 2x8 tile)      4  reduce, requantize, store
        #   1  ks loop (2 A pointers)      5  remainder: 8 bytes of A
        #   2  main loop: 16 bytes of A    6-8  partial-width stores (4/2/1)
        #   3  epilogue: last 16 bytes     9  restore registers and return
41
42        # Clamp C pointers
        # The four stack arguments are read before the pre-indexed STP moves
        # SP, so [sp] and [sp, 16] below are relative to the SP at entry.
        # The STP sequence builds an 80-byte frame holding d8-d15 and x20,x21.
43        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
44        CMP     x0, 2                   // if mr < 2
45        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
46        ADD     x7, x6, x7              // c1 = c0 + cm_stride
47        STP     d8, d9, [sp, -80]!
48        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
49        STP     d10, d11, [sp, 16]
50        CSEL    x7, x6, x7, LO          //   c1 = c0
51        STP     d12, d13, [sp, 32]
52        BIC     x2, x2, 7
53        STP     d14, d15, [sp, 48]
54        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack
55
56        .p2align 3
570:
58        # Load initial bias from w into accumulators
        # Each 32-bit load lands in lane 0 of one row-0 accumulator (one
        # accumulator per output column); the MOVs copy the same starting
        # value into the matching row-1 accumulator. x5 advances by 32 bytes.
59        LDP     s16, s18, [x5], 8
60        MOV     v17.16b, v16.16b
61        MOV     v19.16b, v18.16b
62        LDP     s20, s22, [x5], 8
63        MOV     v21.16b, v20.16b
64        MOV     v23.16b, v22.16b
65        LDP     s24, s26, [x5], 8
66        MOV     v25.16b, v24.16b
67        MOV     v27.16b, v26.16b
68        LDP     s28, s30, [x5], 8
69        MOV     v29.16b, v28.16b
70        MOV     v31.16b, v30.16b
71        MOV     x9, x3                  // p = ks
72
73        .p2align 3
741:
75        # Load next 2 A pointers
        # A pointer equal to the zero pointer (x12) is used unchanged;
        # any other pointer has a_offset (x8) added to it.
76        LDP     x13, x15, [x4], 16
77        CMP     x13, x12                // if a0 == zero
78        ADD     x13, x13, x8            // a0 += a_offset
79        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
80        CMP     x15, x12                // if a1 == zero
81        ADD     x15, x15, x8            // a1 += a_offset
82        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 += a_offset
83
84        # Is there at least 16 bytes for epilogue?
85        SUBS    x0, x2, 16              // k = kc - 16
86        B.LO    5f
87
88        # Prologue: load A0, A1 and 2 B's
        # B is consumed in 128-byte groups: offsets 0-63 are paired with the
        # first 8 bytes of each A row (v0, v1) and offsets 64-127 with the
        # second 8 bytes (v6, v7). x17 and x16 hold B data that the first
        # block of the loop moves into v8 and v4 with INS.
89        LDP     d4, d5, [x5]            // Read B
90        LDP     d0, d6, [x13], 16
91        LDP     d1, d7, [x15], 16
92//        LDP     d8, d9, [x5, 64]
93        LDR     x17, [x5, 64]           // Read B
94        LDR     x16, [x5, 16]
95
96        # Is there at least 16 bytes for main loop?
97        SUBS    x0, x0, 16              // k = k - 16
98        B.LO    3f
99
100         # Main loop - 16 bytes of A
101         # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
102         # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        #
        # Every block handles two output columns for both rows:
        #   SMULL  multiplies the first 8 A bytes by B (v4/v5) into 16-bit lanes,
        #   SMLAL  adds the product of the second 8 A bytes and B (v8/v9),
        #   SADALP pairwise-adds the 16-bit lanes into the 32-bit accumulators.
        # Loads interleaved with the arithmetic fetch B for the following
        # block; half of them go through x16/x17 and are moved into v4/v8
        # with INS. Blocks 2 and 3 also fetch the B and A data consumed by
        # the next iteration, so those offsets are 128 or more.
        # NOTE(review): routing loads through general registers is presumably
        # a Cortex-A53 scheduling choice (see kernel name); the code does not
        # state the reason - confirm against the template before changing it.
103
104        .p2align 3
1052:
106        # BLOCK 0 - 18 cycles
        # NOTE(review): this comment used to say "includes prfm", but no PRFM
        # instruction is present in this block.
107        LDR     d9, [x5, 72]            // Read B
108        INS     v8.d[0], x17
109        SMULL   v2.8h, v4.8b, v0.8b
110        SMULL   v3.8h, v4.8b, v1.8b
111        LDR     x17, [x5, 80]
112        SMULL   v10.8h, v5.8b, v0.8b
113        SMULL   v11.8h, v5.8b, v1.8b
114        LDR     d5, [x5, 24]
115        INS     v4.d[0], x16
116        SMLAL   v2.8h, v8.8b, v6.8b
117        SMLAL   v3.8h, v8.8b, v7.8b
118        LDR     x16, [x5, 32]
119        SMLAL   v10.8h, v9.8b, v6.8b
120        SMLAL   v11.8h, v9.8b, v7.8b
121        SADALP  v16.4s,  v2.8h
122        SADALP  v17.4s,  v3.8h
123        SADALP  v18.4s, v10.8h
124        SADALP  v19.4s, v11.8h
125
126        # BLOCK 1- 18 cycles
127        LDR     d9, [x5, 88]
128        INS     v8.d[0], x17
129        SMULL   v12.8h, v4.8b, v0.8b
130        SMULL   v13.8h, v4.8b, v1.8b
131        LDR     x17, [x5, 96]
132        SMULL   v14.8h, v5.8b, v0.8b
133        SMULL   v15.8h, v5.8b, v1.8b
134        LDR     d5, [x5, 40]
135        INS     v4.d[0], x16
136        SMLAL   v12.8h, v8.8b, v6.8b
137        SMLAL   v13.8h, v8.8b, v7.8b
138        LDR     x16, [x5, 48]
139        SMLAL   v14.8h, v9.8b, v6.8b
140        SMLAL   v15.8h, v9.8b, v7.8b
141        SADALP  v20.4s, v12.8h
142        SADALP  v21.4s, v13.8h
143        SADALP  v22.4s, v14.8h
144        SADALP  v23.4s, v15.8h
145
146        # BLOCK 2 - 18 cycles
        # Also starts fetching the next 16 bytes of each A row into x20/x21
        # and decrements k; the flags set by SUBS are consumed by the B.HS
        # that closes BLOCK 3 (nothing in between writes the flags).
147        LDR     d9, [x5, 104]
148        INS     v8.d[0], x17
149        SMULL   v2.8h, v4.8b, v0.8b
150        SMULL   v3.8h, v4.8b, v1.8b
151        LDR     x17, [x5, 112]
152        SMULL   v10.8h, v5.8b, v0.8b
153        SMULL   v11.8h, v5.8b, v1.8b
154        LDR     d5, [x5, 56]
155        INS     v4.d[0], x16
156        SMLAL   v2.8h, v8.8b, v6.8b
157        SMLAL   v3.8h, v8.8b, v7.8b
158        LDR     x16, [x5, 128]
159        SMLAL   v10.8h, v9.8b, v6.8b
160        SMLAL   v11.8h, v9.8b, v7.8b
161        SADALP  v24.4s,  v2.8h
162        LDR     x20, [x13], 8           // Read A0
163        SADALP  v25.4s,  v3.8h
164        LDR     x21, [x15], 8           // Read A1
165        SADALP  v26.4s, v10.8h
166        SADALP  v27.4s, v11.8h
167        SUBS    x0, x0, 16
168
169        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        # Offsets 136, 144 and 192 read from the next 128-byte B group
        # (its offsets 8, 16 and 64), mirroring the prologue loads.
170        LDR     d9, [x5, 120]
171        INS     v8.d[0], x17
172        SMULL   v12.8h, v4.8b, v0.8b
173        SMULL   v13.8h, v4.8b, v1.8b
174        LDR     x17, [x5, 192]          // Read B
175        SMULL   v14.8h, v5.8b, v0.8b
176        SMULL   v15.8h, v5.8b, v1.8b
177        LDR     d5, [x5, 136]           // Read B
178        INS     v4.d[0], x16
179        SMLAL   v12.8h, v8.8b, v6.8b
180        SMLAL   v13.8h, v8.8b, v7.8b
181        LDR     x16, [x5, 144]
182        SMLAL   v14.8h, v9.8b, v6.8b
183        SMLAL   v15.8h, v9.8b, v7.8b
184        LDR     d6, [x13], 8            // Read A0
185        INS     v0.d[0], x20
186        LDR     d7, [x15], 8            // Read A1
187        INS     v1.d[0], x21
188        SADALP  v28.4s, v12.8h
189        SADALP  v29.4s, v13.8h
190        ADD     x5, x5, 128
191        SADALP  v30.4s, v14.8h
192        SADALP  v31.4s, v15.8h
193        B.HS    2b
194
195        # Epilogue
196        # Same as main loop except no loads at end of loop
197        .p2align 3
1983:
199        # BLOCK 0 - 18 cycles
200        LDR     d9, [x5, 72]            // Read B
201        INS     v8.d[0], x17
202        SMULL   v2.8h, v4.8b, v0.8b
203        SMULL   v3.8h, v4.8b, v1.8b
204        LDR     x17, [x5, 80]
205        SMULL   v10.8h, v5.8b, v0.8b
206        SMULL   v11.8h, v5.8b, v1.8b
207        LDR     d5, [x5, 24]
208        INS     v4.d[0], x16
209        SMLAL   v2.8h, v8.8b, v6.8b
210        SMLAL   v3.8h, v8.8b, v7.8b
211        LDR     x16, [x5, 32]
212        SMLAL   v10.8h, v9.8b, v6.8b
213        SMLAL   v11.8h, v9.8b, v7.8b
214        SADALP  v16.4s,  v2.8h
215        SADALP  v17.4s,  v3.8h
216        SADALP  v18.4s, v10.8h
217        SADALP  v19.4s, v11.8h
218
219        # BLOCK 1- 18 cycles
220        LDR     d9, [x5, 88]
221        INS     v8.d[0], x17
222        SMULL   v12.8h, v4.8b, v0.8b
223        SMULL   v13.8h, v4.8b, v1.8b
224        LDR     x17, [x5, 96]
225        SMULL   v14.8h, v5.8b, v0.8b
226        SMULL   v15.8h, v5.8b, v1.8b
227        LDR     d5, [x5, 40]
228        INS     v4.d[0], x16
229        SMLAL   v12.8h, v8.8b, v6.8b
230        SMLAL   v13.8h, v8.8b, v7.8b
231        LDR     x16, [x5, 48]
232        SMLAL   v14.8h, v9.8b, v6.8b
233        SMLAL   v15.8h, v9.8b, v7.8b
234        SADALP  v20.4s, v12.8h
235        SADALP  v21.4s, v13.8h
236        SADALP  v22.4s, v14.8h
237        SADALP  v23.4s, v15.8h
238
239        # BLOCK 2 - 18 cycles
240        LDR     d9, [x5, 104]
241        INS     v8.d[0], x17
242        SMULL   v2.8h, v4.8b, v0.8b
243        SMULL   v3.8h, v4.8b, v1.8b
244        LDR     x17, [x5, 112]
245        SMULL   v10.8h, v5.8b, v0.8b
246        SMULL   v11.8h, v5.8b, v1.8b
247        LDR     d5, [x5, 56]
248        INS     v4.d[0], x16
249        SMLAL   v2.8h, v8.8b, v6.8b
250        SMLAL   v3.8h, v8.8b, v7.8b
251        SMLAL   v10.8h, v9.8b, v6.8b
252        SMLAL   v11.8h, v9.8b, v7.8b
253        SADALP  v24.4s,  v2.8h
254        SADALP  v25.4s,  v3.8h
255        SADALP  v26.4s, v10.8h
256        SADALP  v27.4s, v11.8h
257
258        # BLOCK 3 - 17 cycles
259        LDR     d9, [x5, 120]
260        INS     v8.d[0], x17
261        SMULL   v12.8h, v4.8b, v0.8b
262        SMULL   v13.8h, v4.8b, v1.8b
263        SMULL   v14.8h, v5.8b, v0.8b
264        SMULL   v15.8h, v5.8b, v1.8b
265        SMLAL   v12.8h, v8.8b, v6.8b
266        SMLAL   v13.8h, v8.8b, v7.8b
267        SMLAL   v14.8h, v9.8b, v6.8b
268        SMLAL   v15.8h, v9.8b, v7.8b
269        SADALP  v28.4s, v12.8h
270        SADALP  v29.4s, v13.8h
271        ADD     x5, x5, 128
272        SADALP  v30.4s, v14.8h
273        SADALP  v31.4s, v15.8h
274
275        # Is there a remainder?- 8 bytes of A
        # Because kc is a multiple of 8, k (x0) is -16 or -8 at this point;
        # bit 3 is set only for -8, meaning 8 bytes of A are still unprocessed.
276        TBNZ    x0, 3, 5f
277
278        # ks loop
279        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
280        B.HI    1b
281
2824:
283        # Add columns
        # Pairwise adds collapse the four 32-bit lanes of each accumulator
        # into one sum per output column:
        #   v0 = row 0 columns 0-3, v1 = row 0 columns 4-7,
        #   v2 = row 1 columns 0-3, v3 = row 1 columns 4-7.
284        ADDP    v16.4s, v16.4s, v18.4s
285        ADDP    v20.4s, v20.4s, v22.4s
286        ADDP    v24.4s, v24.4s, v26.4s
287        ADDP    v28.4s, v28.4s, v30.4s
288        ADDP    v17.4s, v17.4s, v19.4s
289        ADDP    v21.4s, v21.4s, v23.4s
290        ADDP    v25.4s, v25.4s, v27.4s
291        ADDP    v29.4s, v29.4s, v31.4s
292        ADDP    v0.4s, v16.4s, v20.4s
293        ADDP    v1.4s, v24.4s, v28.4s
294        ADDP    v2.4s, v17.4s, v21.4s
295        ADDP    v3.4s, v25.4s, v29.4s
296
297        # Apply params - scale, bias and clamp
        # The params block is read sequentially through x11: one 32-bit value
        # used as the fp32 multiplier, one 16-bit value added with saturation,
        # then two bytes used as the lower and upper clamp. x11 is advanced by
        # 4 + 2 + 1 = 7 bytes in total and rewound by 7 for the next tile.
298        SCVTF   v0.4s, v0.4s
299        LD1R    {v4.4s}, [x11], 4
300        SCVTF   v1.4s, v1.4s
301        SCVTF   v2.4s, v2.4s
302        SCVTF   v3.4s, v3.4s
303        FMUL    v0.4s, v0.4s, v4.4s
304        FMUL    v1.4s, v1.4s, v4.4s
305        FMUL    v2.4s, v2.4s, v4.4s
306        FMUL    v3.4s, v3.4s, v4.4s
307
        # FCVTNS converts back to int32, rounding to nearest with ties to even.
308        FCVTNS  v0.4s, v0.4s
309        FCVTNS  v1.4s, v1.4s
310        FCVTNS  v2.4s, v2.4s
311        FCVTNS  v3.4s, v3.4s
312
        # Saturating narrow to 16 bits (v0 = row 0, v2 = row 1), add the
        # 16-bit param, then saturating narrow to 8 bits so that v0 holds
        # row 0 in bytes 0-7 and row 1 in bytes 8-15.
        # The flags from SUBS x1 below are not modified again before the
        # B.LO and B.HI branches that test them.
313        LD1R    {v5.8h}, [x11], 2
314        SQXTN   v0.4h, v0.4s
315        SQXTN   v2.4h, v2.4s
316        SQXTN2  v0.8h, v1.4s
317        SQXTN2  v2.8h, v3.4s
318        SUBS    x1, x1, 8
319        SQADD   v0.8h, v0.8h, v5.8h
320        SQADD   v1.8h, v2.8h, v5.8h
321        SQXTN   v0.8b, v0.8h
322        SQXTN2  v0.16b, v1.8h
323        LD1R    {v1.16b}, [x11], 1
324        LD1R    {v2.16b}, [x11]
325        SMAX    v0.16b, v0.16b, v1.16b
326        SUB     x11, x11, 7          // rewind params pointer
327        SMIN    v0.16b, v0.16b, v2.16b
328        B.LO    6f
329
330        # Store full 2 x 8
        # Row 1 (upper 8 bytes of v0) goes to c1, row 0 (lower 8 bytes) to c0;
        # both pointers then advance by cn_stride (x10).
331        ST1     {v0.d}[1], [x7], x10
332        ST1     {v0.8b}, [x6], x10
333
334        SUB     x4, x4, x3              // a -= ks
335
336        # nc loop
337        B.HI    0b
338
339        # Restore x20,x21 from stack
340        LDP     x20, x21, [sp, 64]
341
342        # Restore d8-d15 from stack
343        LDP     d14, d15, [sp, 48]
344        LDP     d12, d13, [sp, 32]
345        LDP     d10, d11, [sp, 16]
346        LDP     d8, d9, [sp], 80
347        RET
348
349        # Remainder - 8 bytes of A
        # Reached when kc is 8 or when 8 bytes are left after the epilogue.
        # Only 64 bytes of B are used, so every product is a plain SMULL
        # followed by SADALP; v6/v7 hold B here rather than A.
350        .p2align 3
3515:
352        LDR     d0, [x13], 8
353        LDP     d4, d5, [x5]
354        LDR     d1, [x15], 8
355        LDP     d6, d7, [x5, 16]
356        SMULL   v2.8h, v4.8b, v0.8b
357        SMULL   v3.8h, v4.8b, v1.8b
358        SMULL   v10.8h, v5.8b, v0.8b
359        SMULL   v11.8h, v5.8b, v1.8b
360        SMULL   v12.8h, v6.8b, v0.8b
361        SADALP  v16.4s,  v2.8h
362        SMULL   v13.8h, v6.8b, v1.8b
363        SADALP  v17.4s,  v3.8h
364        SMULL   v14.8h, v7.8b, v0.8b
365        SADALP  v18.4s, v10.8h
366        SMULL   v15.8h, v7.8b, v1.8b
367        SADALP  v19.4s, v11.8h
368        LDP     d4, d5, [x5, 32]
369        SMULL   v2.8h, v4.8b, v0.8b
370        SADALP  v20.4s, v12.8h
371        SMULL   v3.8h, v4.8b, v1.8b
372        SADALP  v21.4s, v13.8h
373        SMULL   v10.8h, v5.8b, v0.8b
374        SADALP  v22.4s, v14.8h
375        SMULL   v11.8h, v5.8b, v1.8b
376        SADALP  v23.4s, v15.8h
377        LDP     d6, d7, [x5, 48]
378        SMULL   v12.8h, v6.8b, v0.8b
379        SADALP  v24.4s,  v2.8h
380        SMULL   v13.8h, v6.8b, v1.8b
381        SADALP  v25.4s,  v3.8h
382        SMULL   v14.8h, v7.8b, v0.8b
383        SADALP  v26.4s, v10.8h
384        SMULL   v15.8h, v7.8b, v1.8b
385        SADALP  v27.4s, v11.8h
386        ADD     x5, x5, 64
387        SADALP  v28.4s, v12.8h
388        SADALP  v29.4s, v13.8h
389        SADALP  v30.4s, v14.8h
390        SADALP  v31.4s, v15.8h
391
392        # ks loop
393        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
394        B.HI    1b
395        B       4b
396
397        # Store odd width
        # Reached when fewer than 8 columns remain. x1 now holds nc - 8, whose
        # low three bits equal those of the remaining column count, so bits
        # 2, 1 and 0 select a 4-, 2- and 1-byte store. Row 0 is stored from
        # the bottom of v0 and row 1 from byte 8 upward; each EXT rotates v0
        # so the next unwritten bytes of both rows return to those positions.
398        .p2align 3
3996:
400        TBZ     x1, 2, 7f
401        ST1     {v0.s}[2], [x7], 4
402        STR     s0, [x6], 4
403        EXT     v0.16b, v0.16b, v0.16b, 4
404
4057:
406        TBZ     x1, 1, 8f
407        ST1     {v0.h}[4], [x7], 2
408        STR     h0, [x6], 2
409        EXT     v0.16b, v0.16b, v0.16b, 2
4108:
411        TBZ     x1, 0, 9f
412        ST1     {v0.b}[8], [x7]
413        STR     b0, [x6]
4149:
415        # Restore x20,x21 from stack
416        LDP     x20, x21, [sp, 64]
417
418        # Restore d8-d15 from stack
419        LDP     d14, d15, [sp, 48]
420        LDP     d12, d13, [sp, 32]
421        LDP     d10, d11, [sp, 16]
422        LDP     d8, d9, [sp], 80
423        RET
424
425END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53
426
427#ifdef __ELF__
428.section ".note.GNU-stack","",%progbits
429#endif
430