// xref: /aosp_15_r20/external/XNNPACK/src/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x13  v0  v6
# A1 x15  v1  v7
# B   x5  v4  v5  v8  v9
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data


BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53

        // Overview
        //   Computes a 2-row x 8-column tile of int8 outputs. For every output
        //   column there is one 4-lane int32 accumulator per row:
        //     row 0: v16 v18 v20 v22 v24 v26 v28 v30   (columns 0..7)
        //     row 1: v17 v19 v21 v23 v25 v27 v29 v31   (columns 0..7)
        //   The four lanes of an accumulator are partial sums that are only
        //   folded together at label 4.
        //
        // Local labels
        //   0: nc loop (one 8-column tile per pass)
        //   1: ks loop (one pair of A row pointers per pass)
        //   2: main loop, 16 bytes of each A row per pass
        //   3: epilogue, last 16-byte step without loads for a following step
        //   4: reduction, requantization and full-width store
        //   5: remainder, 8 bytes of each A row
        //   6: 7: 8: 9: partial-width store and return

        // Entry. The stack-passed arguments are read before the first STP moves
        // sp. d8-d15 and x20/x21 are overwritten below, so they are spilled into
        // an 80-byte frame and reloaded before each RET.

        # Clamp C pointers
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d8, d9, [sp, -80]!      // Allocate the 80-byte frame, save d8,d9
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        STP     d10, d11, [sp, 16]
        CSEL    x7, x6, x7, LO          //   c1 = c0
        STP     d12, d13, [sp, 32]
        BIC     x2, x2, 7
        STP     d14, d15, [sp, 48]
        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // Each scalar load writes lane 0 of its accumulator and clears the other
        // lanes, so a column bias is counted once when the lanes are summed.
        // Row 1 starts from a copy of the row 0 accumulators.
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 2 A pointers
        // A pointer equal to `zero` is used as is; any other pointer gets
        // a_offset added.
        LDP     x13, x15, [x4], 16
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x8            // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 += a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f                      // kc < 16: only the 8-byte remainder runs

        # Prologue: load A0, A1 and 2 B's
        // v0/v1 hold the first 8 bytes of A0/A1 and v6/v7 the second 8 bytes.
        // The B value destined for v8 is fetched into x17 and moved over by the
        // INS at the top of BLOCK 0 instead of the paired load kept below for
        // reference; x16 likewise carries the next value for v4.
        LDP     d4, d5, [x5]            // Read B
        LDP     d0, d6, [x13], 16
        LDP     d1, d7, [x15], 16
//        LDP     d8, d9, [x5, 64]
        LDR     x17, [x5, 64]           // Read B
        LDR     x16, [x5, 16]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f                      // kc < 32: go straight to the epilogue

        # Main loop - 16 bytes of A
        # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
        # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        //
        // One pass consumes 128 bytes of B: 8 columns x 8 bytes for the first
        // 8 bytes of A at [x5, 0..63] and the same for the second 8 bytes of A
        // at [x5, 64..127]. BLOCK n handles columns 2n and 2n+1:
        //   SMULL  : int8 x int8 products of the first A half (v4/v5 with v0/v1)
        //   SMLAL  : adds the products of the second A half (v8/v9 with v6/v7)
        //   SADALP : pairwise-adds the int16 sums into the int32 accumulators
        // Loads inside a block fetch the operands of the following block, either
        // directly into a d register or through x16/x17/x20/x21 followed by INS.
        // The instruction order is hand-scheduled; do not reorder.

        .p2align 3
2:
        # BLOCK 0 - 18 cycles - includes prfm
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 128]          // First B value of the next pass
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        LDR     x20, [x13], 8           // Read A0
        SADALP  v25.4s,  v3.8h
        LDR     x21, [x15], 8           // Read A1
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16              // Flags are consumed by B.HS at the end of BLOCK 3

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 192]          // Read B
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 144]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        LDR     d6, [x13], 8            // Read A0
        INS     v0.d[0], x20
        LDR     d7, [x15], 8            // Read A1
        INS     v1.d[0], x21
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128             // Advance w past this pass (ADD leaves the flags alone)
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HS    2b                      // Loop while the SUBS in BLOCK 2 did not borrow

        # Epilogue
        # Same as main loop except no loads at end of loop
        .p2align 3
3:
        # BLOCK 0 - 18 cycles
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        SADALP  v25.4s,  v3.8h
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder?- 8 bytes of A
        // kc is a multiple of 8 and only multiples of 16 were subtracted from
        // it, so bit 3 of x0 is set exactly when 8 bytes of A are still unread.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        // Pairwise adds fold the four lanes of every accumulator. Result:
        //   v0 = row 0 columns 0-3, v1 = row 0 columns 4-7,
        //   v2 = row 1 columns 0-3, v3 = row 1 columns 4-7.
        // The first two 32-bit parameters are loaded in between.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        LD1R    {v4.4s}, [x11], 4       // params word 0: shift operand for SQSHL
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        LD1R    {v7.4s}, [x11], 4       // params word 1: multiplier for SQDMULH
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - preshift, scale, postshift, bias and clamp
        LD1R    {v5.4s}, [x11], 4       // params word 2: shift operand for SRSHL
        SQSHL   v0.4s, v0.4s, v4.4s     // shift to upper bits
        SQSHL   v1.4s, v1.4s, v4.4s
        SQSHL   v2.4s, v2.4s, v4.4s
        SQSHL   v3.4s, v3.4s, v4.4s
        SQDMULH v0.4s, v0.4s, v7.4s     // scale without rounding
        SQDMULH v1.4s, v1.4s, v7.4s
        SQDMULH v2.4s, v2.4s, v7.4s
        SQDMULH v3.4s, v3.4s, v7.4s
        SRSHL   v0.4s, v0.4s, v5.4s     // signed rounding shift left
        SRSHL   v1.4s, v1.4s, v5.4s
        SRSHL   v2.4s, v2.4s, v5.4s
        SRSHL   v3.4s, v3.4s, v5.4s

        // Saturating narrow to int16, add the 16-bit parameter (the "bias" of
        // the comment above; presumably the output zero point - confirm against
        // the params struct), saturating narrow to int8, then clamp.
        // Afterwards v0 = row 0 in bytes 0-7 and row 1 in bytes 8-15.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1      // Lower clamp bound (applied with SMAX)
        LD1R    {v2.16b}, [x11]         // Upper clamp bound (applied with SMIN)
        SMAX    v0.16b, v0.16b, v1.16b
        SUB     x11, x11, 15          // rewind params pointer (4 + 4 + 4 + 2 + 1 bytes)
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    6f                      // Fewer than 8 columns were left

        # Store full 2 x 8
        // Row 1 is written first; if c1 aliases c0 (mr < 2) the row 0 data
        // written afterwards is what remains in memory.
        ST1     {v0.d}[1], [x7], x10
        ST1     {v0.8b}, [x6], x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b                      // Flags still come from SUBS x1 above

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

        # Remainder - 8 bytes of A
        // Uses 64 bytes of B (8 columns x 8 bytes). B is loaded two columns at
        // a time, alternating between v4/v5 and v6/v7, and each product goes
        // straight into its accumulator through SADALP.
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x15], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        // Bits 2, 1 and 0 of x1 select a 4-, 2- and 1-column store. Row 1 is
        // taken from the upper half of v0 and row 0 from the lower half; after
        // each store EXT rotates v0 so the next unwritten columns of both rows
        // move to the start of their halves.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        ST1     {v0.s}[2], [x7], 4
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        ST1     {v0.h}[4], [x7], 2
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        ST1     {v0.b}[8], [x7]
        STR     b0, [x6]
9:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53

// On ELF targets emit an empty, flag-less .note.GNU-stack section, the GNU
// toolchain marker that this object does not require an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif