// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11

# Notes on the argument registers:
#   x0  holds mr only until the row clamp; afterwards it is the K counter.
#   x2  holds kc rounded up to a multiple of 8 for the whole call.
#   x9  is the working copy of ks (x3 itself is kept to rewind a).
#   x7  becomes the row-1 output pointer c1 (c0 + cm_stride, or c0 when mr < 2).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Stack frame (80 bytes, allocated after the stack arguments are read):
#   [sp +  0] d8,  d9
#   [sp + 16] d10, d11
#   [sp + 32] d12, d13
#   [sp + 48] d14, d15
#   [sp + 64] x20, x21

# Register usage
# A0 x13  v0  v6
# A1 x15  v1  v7
# B   x5  v4  v5  v8  v9
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data
#   (x16/x17 carry B words into v4/v8, x20/x21 carry A words into v0/v1,
#    each moved across with INS)

# Accumulator layout: v16/v17 hold output column 0 for row 0 / row 1,
# v18/v19 column 1, ... v30/v31 column 7.  Each accumulator keeps four
# 32-bit partial sums that are added together at label 4.


BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

        # Clamp C pointers
        # The four stack arguments are read before sp is moved, so their
        # offsets are relative to the incoming sp.
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d8, d9, [sp, -80]!      // Allocate frame, save d8,d9
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        STP     d10, d11, [sp, 16]
        CSEL    x7, x6, x7, LO          //   c1 = c0
        STP     d12, d13, [sp, 32]
        BIC     x2, x2, 7
        STP     d14, d15, [sp, 48]
        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack

        # nc loop: one pass per group of 8 output columns
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each scalar load writes lane 0 of its vector register and clears the
        # other lanes, so every accumulator starts as {bias, 0, 0, 0}.  The
        # row-1 accumulators are copies of the row-0 ones.
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks

        # ks loop: one pass per pair of A row pointers
        .p2align 3
1:
        # Load next 2 A pointers
        LDP     x13, x15, [x4], 16
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x8            // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Prologue: load A0, A1 and 2 B's
        # d0/d1 are A bytes 0-7 of rows 0/1, d6/d7 are A bytes 8-15.
        # x17 and x16 are moved into v8 and v4 by the INS at the top of the
        # following block; the commented-out LDP below is the paired load
        # that the two LDRs (plus the later LDR d9) stand in for.
        LDP     d4, d5, [x5]            // Read B
        LDP     d0, d6, [x13], 16
        LDP     d1, d7, [x15], 16
//        LDP     d8, d9, [x5, 64]
        LDR     x17, [x5, 64]           // Read B
        LDR     x16, [x5, 16]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
        # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        #
        # Per iteration 128 bytes of B are consumed: bytes 0-63 are the 8
        # columns for A bytes 0-7 (v4/v5 x v0/v1), bytes 64-127 are the 8
        # columns for A bytes 8-15 (v8/v9 x v6/v7).  Each block handles two
        # columns for both rows and preloads the B data of the next block.

        .p2align 3
2:
        # BLOCK 0 - 18 cycles - includes prfm
        # Columns 0,1 -> v16..v19
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x5, 448]    // Prefetch B ahead
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        PRFM    PLDL1KEEP, [x5, 512]
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        # Columns 2,3 -> v20..v23
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x13, 128]   // Prefetch A0 ahead
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        PRFM    PLDL1KEEP, [x15, 128]   // Prefetch A1 ahead
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        # Columns 4,5 -> v24..v27.  Also starts loading the next 16 bytes of A
        # into x20/x21 and updates the K counter; the flags set by SUBS are
        # consumed by B.HS at the end of BLOCK 3 (nothing in between sets flags).
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 128]          // First B word of the next iteration
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        LDR     x20, [x13], 8           // Read A0
        SADALP  v25.4s,  v3.8h
        LDR     x21, [x15], 8           // Read A1
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        # Columns 6,7 -> v28..v31.  The B offsets 136/144/192 are relative to
        # the old x5; after ADD x5, x5, 128 they are offsets 8/16/64 of the
        # next iteration, matching what the prologue loads.
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 192]          // Read B
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 144]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        LDR     d6, [x13], 8            // Read A0
        INS     v0.d[0], x20
        LDR     d7, [x15], 8            // Read A1
        INS     v1.d[0], x21
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop
        .p2align 3
3:
        # BLOCK 0 - 18 cycles
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        SADALP  v25.4s,  v3.8h
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder?- 8 bytes of A
        # kc was rounded to a multiple of 8, so x0 is -16 (nothing left) or
        # -8 (8 bytes left) at this point; bit 3 is set only for -8.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        # Pairwise adds fold the four partial sums of every accumulator:
        #   v0 = row 0 columns 0-3, v1 = row 0 columns 4-7,
        #   v2 = row 1 columns 0-3, v3 = row 1 columns 4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - scale, bias and clamp
        # Fields are read from x11 in order: a 4-byte float multiplier, a
        # 2-byte value added with saturation after narrowing to 16 bits, a
        # 1-byte lower bound and a 1-byte upper bound.  x11 advances by
        # 4 + 2 + 1 = 7 bytes and is rewound below for the next nc pass.
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4
        SCVTF   v1.4s, v1.4s
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v4.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO 6f and B.HI 0b
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h            // v0 low 8 bytes  = row 0
        SQXTN2  v0.16b, v1.8h           // v0 high 8 bytes = row 1
        LD1R    {v1.16b}, [x11], 1
        LD1R    {v2.16b}, [x11]
        SMAX    v0.16b, v0.16b, v1.16b
        SUB     x11, x11, 7          // rewind params pointer
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    6f

        # Store full 2 x 8
        # Row 1 is written first; when mr < 2, c1 aliases c0 and the row 0
        # store that follows overwrites it.
        ST1     {v0.d}[1], [x7], x10
        ST1     {v0.8b}, [x6], x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

        # Remainder - 8 bytes of A
        # Reached directly from label 1 when the rounded kc is below 16, or
        # from the epilogue when 8 bytes are left.  Consumes 64 bytes of B
        # (8 columns x 8 bytes) against one 8-byte chunk of each A row.
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x15], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        # x1 is nc - 8 here; subtracting 8 leaves bits 0-2 unchanged, so they
        # still give the number of columns to write (4, 2 and 1 at a time).
        # After each partial store EXT rotates v0 so the next unwritten bytes
        # of row 0 sit at byte 0 and those of row 1 at byte 8.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        ST1     {v0.s}[2], [x7], 4
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        ST1     {v0.h}[4], [x7], 2
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        ST1     {v0.b}[8], [x7]
        STR     b0, [x6]
9:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif