// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x13  v0  v6
# A1 x15  v1  v7
# B   x5  v4  v5  v8  v9
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data

// Overview (derived from the code below)
//
// Each pass of the nc loop (label 0) produces up to 2 rows x 8 columns of
// int8 output. Accumulator vN (N = 16, 18, ... 30) holds four int32 partial
// sums for row 0 of column (N - 16) / 2; vN+1 holds the same column for row 1.
// The partial sums are folded with ADDP at label 4.
//
// Data consumed from w (x5) per group of 8 columns, in this order:
//   1. 8 x 32-bit initial accumulator values (32 bytes).
//   2. For every indirection step (ks loop) and every 8 bytes of kc:
//      8 columns x 8 int8 values (64 bytes).
//   3. 8 x float32 per-column scales (32 bytes).
//
// kc is rounded up to a multiple of 8 and A rows are read in 8-byte units,
// so up to 7 bytes past the original kc of each A row are loaded.
//
// Stack frame (80 bytes, pushed in the prologue):
//   [sp +  0] d8,  d9      [sp + 16] d10, d11
//   [sp + 32] d12, d13     [sp + 48] d14, d15
//   [sp + 64] x20, x21
//
// Local labels:
//   0  nc loop            1  ks loop (one pair of A pointers per pass)
//   2  main k loop        3  k epilogue (last full 16 bytes of A)
//   4  reduce, requantize and store
//   5  8-byte k remainder
//   6-8 odd-width store   9  restore registers and return

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

        # Clamp C pointers
        // The stack arguments are read relative to the incoming sp, before the
        // save area is pushed. The flags set by CMP stay live until the CSEL:
        // the LDP/ADD/STP in between do not write NZCV.
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d8, d9, [sp, -80]!      // Push 80-byte frame, save d8,d9
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        STP     d10, d11, [sp, 16]
        CSEL    x7, x6, x7, LO          //   c1 = c0
        STP     d12, d13, [sp, 32]
        BIC     x2, x2, 7
        STP     d14, d15, [sp, 48]
        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // Each 32-bit value lands in lane 0 of a row 0 accumulator and is then
        // copied to the matching row 1 accumulator.
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 2 A pointers
        // A pointer equal to zero is used as is; any other pointer has
        // a_offset added to it.
        LDP     x13, x15, [x4], 16
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x8            // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Prologue: load A0, A1 and 2 B's
        // v0/v1 = first 8 bytes of A0/A1, v6/v7 = second 8 bytes.
        // v4/v5 = B columns 0,1 for the first 8 k-bytes.
        // B for the second 8 k-bytes (offset 64) and for column 2 (offset 16)
        // is staged in x17/x16 and moved into v8/v4 with INS inside the blocks,
        // replacing what would otherwise be: LDP d8, d9, [x5, 64]
        // NOTE(review): presumably done for Cortex-A53 issue pairing; the
        // reason is not stated in this file.
        LDP     d4, d5, [x5]            // Read B
        LDP     d0, d6, [x13], 16
        LDP     d1, d7, [x15], 16
        LDR     x17, [x5, 64]           // Read B
        LDR     x16, [x5, 16]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
        # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        // Every block handles two B columns: SMULL on the first 8 k-bytes,
        // SMLAL on the second 8 k-bytes, then SADALP folds the 16-bit products
        // pairwise into the 32-bit accumulators. Loads inside a block fetch
        // operands for the following block; blocks 2 and 3 fetch A and B for
        // the next 16-byte step, which is consumed either by the next loop
        // pass or by the epilogue at label 3.

        .p2align 3
2:
        # BLOCK 0 - 18 cycles - includes prfm
        // Columns 0,1 -> v16-v19
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x5, 448]    // Prefetch B ahead of the loads
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        PRFM    PLDL1KEEP, [x5, 512]
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        // Columns 2,3 -> v20-v23
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x13, 128]   // Prefetch A0
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        PRFM    PLDL1KEEP, [x15, 128]   // Prefetch A1
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        // Columns 4,5 -> v24-v27
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 128]          // B column 0 of the next 16-byte step
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        LDR     x20, [x13], 8           // Read A0
        SADALP  v25.4s,  v3.8h
        LDR     x21, [x15], 8           // Read A1
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS at the end of BLOCK 3

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        // Columns 6,7 -> v28-v31. No instruction between the SUBS above and
        // the B.HS below writes NZCV.
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 192]          // Read B
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 144]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        LDR     d6, [x13], 8            // Read A0
        INS     v0.d[0], x20
        LDR     d7, [x15], 8            // Read A1
        INS     v1.d[0], x21
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128             // w += 16 k-bytes x 8 columns
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop
        .p2align 3
3:
        # BLOCK 0 - 18 cycles
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        SADALP  v25.4s,  v3.8h
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder?- 8 bytes of A
        // kc is a multiple of 8, so k (x0) is -16 or -8 here; bit 3 is set
        // only for -8, meaning one more 8-byte group is left.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        // First level merges the column pairs, second level leaves one int32
        // sum per column: v0,v1 = row 0 columns 0-3,4-7; v2,v3 = row 1.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Load per channel scale values from weights
        // int32 -> float, multiply by the 8 float scales that follow the
        // weights in w (v4 = columns 0-3, v5 = columns 4-7).
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v5.4s

        // float -> int32, rounding to nearest with ties to even
        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        // params is read as: 16-bit value at +0 (added after narrowing to
        // int16; presumably the output zero point), int8 lower bound at +2,
        // int8 upper bound at +3.
        // The flags from SUBS x1 are consumed by B.LO below and again by
        // B.HI at the end of the nc loop; nothing in between writes NZCV.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h            // bytes 0-7  = row 0
        SQXTN2  v0.16b, v1.8h           // bytes 8-15 = row 1
        LD1R    {v1.16b}, [x11], 1
        LD1R    {v2.16b}, [x11]
        SMAX    v0.16b, v0.16b, v1.16b  // clamp to lower bound
        SUB     x11, x11, 3          // rewind params pointer
        SMIN    v0.16b, v0.16b, v2.16b  // clamp to upper bound
        B.LO    6f                      // fewer than 8 columns left

        # Store full 2 x 8
        ST1     {v0.d}[1], [x7], x10    // row 1, c1 += cn_stride
        ST1     {v0.8b}, [x6], x10      // row 0, c0 += cn_stride

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

        # Remainder - 8 bytes of A
        // Reached when kc == 8 (straight from label 1) or when 8 bytes are
        // left after the epilogue. One 8-byte group of A is multiplied with
        // all 8 B columns (64 bytes of w); no SMLAL stage is needed.
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x15], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        // x1 holds nc - 8 here; subtracting 8 leaves bits 0-2 untouched, so
        // they still give the number of columns left (1-7). After each
        // partial store EXT rotates v0 so the next unwritten bytes of row 0
        // and row 1 sit at byte 0 and byte 8 again.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        ST1     {v0.s}[2], [x7], 4      // 4 columns of row 1
        STR     s0, [x6], 4             // 4 columns of row 0
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        ST1     {v0.h}[4], [x7], 2      // 2 columns of row 1
        STR     h0, [x6], 2             // 2 columns of row 0
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        ST1     {v0.b}[8], [x7]         // 1 column of row 1
        STR     b0, [x6]                // 1 column of row 0
9:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
