1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     const int8_t* restrict a,  x3
18#     size_t a_stride,           x4
19#     const void* restrict w,    x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> x10
23#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# Register usage
28# A0  x3  v0  v6
29# A1  x4  v1  v7
30# B   x5  v4  v5  v8  v9
31# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
32# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
33# temp0   v2 v10 v12 v14
34# temp1   v3 v11 v13 v15
35# x16, x17, x20, x21 temporary a53 gpr load data
36
37
38BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53
        # Overview of the code below:
        #   0:    per group of 8 output columns, seed 16 accumulators from eight
        #         32-bit values at w (the same seed is used for both rows)
        #   1:    main loop, 16 bytes of each A row per iteration; the loads for
        #         the following 16 bytes are issued inside the current iteration
        #   2:    epilogue, consumes the last 16 bytes that were already loaded
        #   4:    remainder, one 8-byte group of each A row
        #   3:    reduce accumulators, scale per column in float, add the 16-bit
        #         offset read from params, clamp, then store 8 columns or go to 5:
        #   5-8:  store the final nc & 7 columns and return
39
40        # Clamp A and C pointers
        # The callee-saved register spills are interleaved with the pointer setup.
        # The flags from CMP stay valid for both CSELs because STP, ADD and BIC do
        # not write the condition flags.
41        CMP     x0, 2                   // if mr < 2
42        STP     d8, d9, [sp, -80]!      // allocate an 80-byte frame, save d8-d9
43        ADD     x4, x3, x4              // a1 = a0 + a_stride
44        STP     d10, d11, [sp, 16]
45        ADD     x7, x6, x7              // c1 = c0 + cm_stride
46        STP     d12, d13, [sp, 32]
47        CSEL    x4, x3, x4, LO          //   a1 = a0
48        STP     d14, d15, [sp, 48]
49        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
50        CSEL    x7, x6, x7, LO          //   c1 = c0
51        BIC     x2, x2, 7
52        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack
53
54        .p2align 3
550:
56        # Load initial bias from w into accumulators
        # Outer loop entry: executed once per group of 8 output columns.
        # Each LDP reads two 32-bit values into the S view of an even-numbered
        # (row 0) accumulator; a scalar S-register load clears the remaining bits
        # of the vector register. The MOVs copy the seeded register to the
        # odd-numbered (row 1) accumulator.
        # The flags written by the SUBS below are consumed by B.LO 4f; the LDP and
        # MOV instructions in between leave them unchanged.
57        SUBS    x0, x2, 16              // k = kc - 16
58        LDP     s16, s18, [x5], 8
59        MOV     v17.16b, v16.16b
60        MOV     v19.16b, v18.16b
61        LDP     s20, s22, [x5], 8
62        MOV     v21.16b, v20.16b
63        MOV     v23.16b, v22.16b
64        LDP     s24, s26, [x5], 8
65        MOV     v25.16b, v24.16b
66        MOV     v27.16b, v26.16b
67        LDP     s28, s30, [x5], 8
68        MOV     v29.16b, v28.16b
        # The stack arguments sit just above the 80-byte frame. x11 is reloaded on
        # every pass because the post-indexed LD1R loads at 3: advance it.
69        LDP     x10, x11, [sp, 80]       // cn_stride, params
70        MOV     v31.16b, v30.16b
71        # Is there at least 16 bytes for epilogue?
72        B.LO    4f                      // kc < 16: only the 8-byte remainder
73
74        # Prologue: load A0, A1 and 2 B's
        # v0/v1 receive the first 8 bytes of each A row and v6/v7 the second 8.
        # v4/v5 receive the B bytes at offsets 0 and 8. The B data at offsets 16
        # and 64 is fetched into x16/x17 and moved into v4/v8 with INS inside the
        # first compute block.
75        LDP     d4, d5, [x5]            // Read B
76        LDP     d0, d6, [x3], 16        // Read A0
77        LDR     x17, [x5, 64]           // Read B
78        LDP     d1, d7, [x4], 16        // Read A1
79        LDR     x16, [x5, 16]
80
81        # Is there at least 16 bytes for main loop?
82        SUBS    x0, x0, 16              // k = k - 16
83        B.LO    2f                      // no further 16 bytes: go to epilogue
84
85         # Main loop - 16 bytes of A
86         # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
87         # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        #
        # Each block handles two adjacent output columns for both rows:
        #   v4/v5  8 bytes of B per column, multiplied with the first 8 bytes
        #          of A (v0 row 0, v1 row 1)
        #   v8/v9  the B bytes 64 further on, multiplied with the second
        #          8 bytes of A (v6 row 0, v7 row 1)
        # SMULL followed by SMLAL builds 16-bit sums of two products; SADALP adds
        # adjacent 16-bit pairs into the 32-bit accumulators.
        # B data for the next block is loaded between the multiplies, partly via
        # x16/x17 followed by INS into the low 64 bits of v4/v8.
88
89        .p2align 3
901:
91        # BLOCK 0 - 18 cycles - includes prfm
92        LDR     d9, [x5, 72]            // Read B
93        INS     v8.d[0], x17
94        SMULL   v2.8h, v4.8b, v0.8b
95        SMULL   v3.8h, v4.8b, v1.8b
96        LDR     x17, [x5, 80]
97        SMULL   v10.8h, v5.8b, v0.8b
98        SMULL   v11.8h, v5.8b, v1.8b
99        LDR     d5, [x5, 24]
100        INS     v4.d[0], x16
101        SMLAL   v2.8h, v8.8b, v6.8b
102        SMLAL   v3.8h, v8.8b, v7.8b
103        LDR     x16, [x5, 32]
104        SMLAL   v10.8h, v9.8b, v6.8b
105        SMLAL   v11.8h, v9.8b, v7.8b
106        PRFM    PLDL1KEEP, [x5, 448]    // prefetch B ahead of the current position
107        SADALP  v16.4s,  v2.8h
108        SADALP  v17.4s,  v3.8h
109        PRFM    PLDL1KEEP, [x5, 512]
110        SADALP  v18.4s, v10.8h
111        SADALP  v19.4s, v11.8h
112
113        # BLOCK 1- 18 cycles
114        LDR     d9, [x5, 88]
115        INS     v8.d[0], x17
116        SMULL   v12.8h, v4.8b, v0.8b
117        SMULL   v13.8h, v4.8b, v1.8b
118        LDR     x17, [x5, 96]
119        SMULL   v14.8h, v5.8b, v0.8b
120        SMULL   v15.8h, v5.8b, v1.8b
121        LDR     d5, [x5, 40]
122        INS     v4.d[0], x16
123        SMLAL   v12.8h, v8.8b, v6.8b
124        SMLAL   v13.8h, v8.8b, v7.8b
125        LDR     x16, [x5, 48]
126        SMLAL   v14.8h, v9.8b, v6.8b
127        SMLAL   v15.8h, v9.8b, v7.8b
128        PRFM    PLDL1KEEP, [x3, 128]    // prefetch A0 ahead of the current position
129        SADALP  v20.4s, v12.8h
130        SADALP  v21.4s, v13.8h
131        PRFM    PLDL1KEEP, [x4, 128]    // prefetch A1 ahead of the current position
132        SADALP  v22.4s, v14.8h
133        SADALP  v23.4s, v15.8h
134
135        # BLOCK 2 - 18 cycles
        # Also starts fetching data for the next 16 bytes of K: x16 takes the B
        # bytes at offset 128 (offset 0 once x5 has advanced by 128 in BLOCK 3),
        # x20/x21 take the next first 8 bytes of A0/A1. The loop counter is
        # updated here; its flags are consumed by B.HS at the end of BLOCK 3,
        # and no instruction in between writes the condition flags.
136        LDR     d9, [x5, 104]
137        INS     v8.d[0], x17
138        SMULL   v2.8h, v4.8b, v0.8b
139        SMULL   v3.8h, v4.8b, v1.8b
140        LDR     x17, [x5, 112]
141        SMULL   v10.8h, v5.8b, v0.8b
142        SMULL   v11.8h, v5.8b, v1.8b
143        LDR     d5, [x5, 56]
144        INS     v4.d[0], x16
145        SMLAL   v2.8h, v8.8b, v6.8b
146        SMLAL   v3.8h, v8.8b, v7.8b
147        LDR     x16, [x5, 128]
148        SMLAL   v10.8h, v9.8b, v6.8b
149        SMLAL   v11.8h, v9.8b, v7.8b
150        SADALP  v24.4s,  v2.8h
151        LDR     x20, [x3], 8            // Read A0
152        SADALP  v25.4s,  v3.8h
153        LDR     x21, [x4], 8            // Read A1
154        SADALP  v26.4s, v10.8h
155        SADALP  v27.4s, v11.8h
156        SUBS    x0, x0, 16              // k -= 16, flags used by B.HS 1b below
157
158        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        # The B loads at offsets 192, 136 and 144 are the next iteration's offsets
        # 64, 8 and 16 (x5 advances by 128 below). v6/v7 and v0/v1 are refilled
        # only after the last multiply that reads them.
159        LDR     d9, [x5, 120]
160        INS     v8.d[0], x17
161        SMULL   v12.8h, v4.8b, v0.8b
162        SMULL   v13.8h, v4.8b, v1.8b
163        LDR     x17, [x5, 192]          // Read B
164        SMULL   v14.8h, v5.8b, v0.8b
165        SMULL   v15.8h, v5.8b, v1.8b
166        LDR     d5, [x5, 136]           // Read B
167        INS     v4.d[0], x16
168        SMLAL   v12.8h, v8.8b, v6.8b
169        SMLAL   v13.8h, v8.8b, v7.8b
170        LDR     x16, [x5, 144]
171        SMLAL   v14.8h, v9.8b, v6.8b
172        SMLAL   v15.8h, v9.8b, v7.8b
173        LDR     d6, [x3], 8             // Read A0
174        INS     v0.d[0], x20
175        LDR     d7, [x4], 8             // Read A1
176        INS     v1.d[0], x21
177        SADALP  v28.4s, v12.8h
178        SADALP  v29.4s, v13.8h
179        ADD     x5, x5, 128             // advance w past 16 K bytes x 8 columns
180        SADALP  v30.4s, v14.8h
181        SADALP  v31.4s, v15.8h
182        B.HS    1b                      // loop while another 16 bytes remain after these
183
184        # Epilogue
185        # Same as main loop except no loads at end of loop
        # It consumes the A and B data already held in v0/v1, v6/v7, v4/v5 and
        # x16/x17, and issues no prefetches and no A loads.
186
187        .p2align 3
1882:
189        # BLOCK 0 - 18 cycles
190        LDR     d9, [x5, 72]            // Read B
191        INS     v8.d[0], x17
192        SMULL   v2.8h, v4.8b, v0.8b
193        SMULL   v3.8h, v4.8b, v1.8b
194        LDR     x17, [x5, 80]
195        SMULL   v10.8h, v5.8b, v0.8b
196        SMULL   v11.8h, v5.8b, v1.8b
197        LDR     d5, [x5, 24]
198        INS     v4.d[0], x16
199        SMLAL   v2.8h, v8.8b, v6.8b
200        SMLAL   v3.8h, v8.8b, v7.8b
201        LDR     x16, [x5, 32]
202        SMLAL   v10.8h, v9.8b, v6.8b
203        SMLAL   v11.8h, v9.8b, v7.8b
204        SADALP  v16.4s,  v2.8h
205        SADALP  v17.4s,  v3.8h
206        SADALP  v18.4s, v10.8h
207        SADALP  v19.4s, v11.8h
208
209        # BLOCK 1- 18 cycles
210        LDR     d9, [x5, 88]
211        INS     v8.d[0], x17
212        SMULL   v12.8h, v4.8b, v0.8b
213        SMULL   v13.8h, v4.8b, v1.8b
214        LDR     x17, [x5, 96]
215        SMULL   v14.8h, v5.8b, v0.8b
216        SMULL   v15.8h, v5.8b, v1.8b
217        LDR     d5, [x5, 40]
218        INS     v4.d[0], x16
219        SMLAL   v12.8h, v8.8b, v6.8b
220        SMLAL   v13.8h, v8.8b, v7.8b
221        LDR     x16, [x5, 48]
222        SMLAL   v14.8h, v9.8b, v6.8b
223        SMLAL   v15.8h, v9.8b, v7.8b
224        SADALP  v20.4s, v12.8h
225        SADALP  v21.4s, v13.8h
226        SADALP  v22.4s, v14.8h
227        SADALP  v23.4s, v15.8h
228
229        # BLOCK 2 - 18 cycles
230        LDR     d9, [x5, 104]
231        INS     v8.d[0], x17
232        SMULL   v2.8h, v4.8b, v0.8b
233        SMULL   v3.8h, v4.8b, v1.8b
234        LDR     x17, [x5, 112]
235        SMULL   v10.8h, v5.8b, v0.8b
236        SMULL   v11.8h, v5.8b, v1.8b
237        LDR     d5, [x5, 56]
238        INS     v4.d[0], x16
239        SMLAL   v2.8h, v8.8b, v6.8b
240        SMLAL   v3.8h, v8.8b, v7.8b
241        SMLAL   v10.8h, v9.8b, v6.8b
242        SMLAL   v11.8h, v9.8b, v7.8b
243        SADALP  v24.4s,  v2.8h
244        SADALP  v25.4s,  v3.8h
245        SADALP  v26.4s, v10.8h
246        SADALP  v27.4s, v11.8h
247
248        # BLOCK 3 - 17 cycles
249        LDR     d9, [x5, 120]
250        INS     v8.d[0], x17
251        SMULL   v12.8h, v4.8b, v0.8b
252        SMULL   v13.8h, v4.8b, v1.8b
253        SMULL   v14.8h, v5.8b, v0.8b
254        SMULL   v15.8h, v5.8b, v1.8b
255        SMLAL   v12.8h, v8.8b, v6.8b
256        SMLAL   v13.8h, v8.8b, v7.8b
257        SMLAL   v14.8h, v9.8b, v6.8b
258        SMLAL   v15.8h, v9.8b, v7.8b
259        SADALP  v28.4s, v12.8h
260        SADALP  v29.4s, v13.8h
261        ADD     x5, x5, 128             // advance w past the 16 K bytes just consumed
262        SADALP  v30.4s, v14.8h
263        SADALP  v31.4s, v15.8h
264
265        # Is there a remainder?- 8 bytes of A
        # kc was rounded to a multiple of 8 and only multiples of 16 have been
        # subtracted from it, so bit 3 of k still equals bit 3 of kc.
266        TBNZ    x0, 3, 4f
267
268        .p2align 3
2693:
270        # Add columns
        # Three levels of pairwise adds collapse the 4 lanes of each accumulator.
        # Result: v0 = row 0 columns 0-3, v1 = row 0 columns 4-7,
        #         v2 = row 1 columns 0-3, v3 = row 1 columns 4-7.
271        ADDP    v16.4s, v16.4s, v18.4s
272        ADDP    v20.4s, v20.4s, v22.4s
273        ADDP    v24.4s, v24.4s, v26.4s
274        ADDP    v28.4s, v28.4s, v30.4s
275        ADDP    v17.4s, v17.4s, v19.4s
276        ADDP    v21.4s, v21.4s, v23.4s
277        ADDP    v25.4s, v25.4s, v27.4s
278        ADDP    v29.4s, v29.4s, v31.4s
279        ADDP    v0.4s, v16.4s, v20.4s
280        ADDP    v1.4s, v24.4s, v28.4s
281        ADDP    v2.4s, v17.4s, v21.4s
282        ADDP    v3.4s, v25.4s, v29.4s
283
284        # Load per channel scale values from weights
        # The eight 32-bit scales follow the B data in w. After the two loads x5
        # points at the data read by the next pass through 0:.
285        SCVTF   v0.4s, v0.4s
286        LDR     q4, [x5], 16            // scales for columns 0-3
287        SCVTF   v1.4s, v1.4s
288        LDR     q5, [x5], 16            // scales for columns 4-7
289        SCVTF   v2.4s, v2.4s
290        SCVTF   v3.4s, v3.4s
291        FMUL    v0.4s, v0.4s, v4.4s
292        FMUL    v1.4s, v1.4s, v5.4s
293        FMUL    v2.4s, v2.4s, v4.4s
294        FMUL    v3.4s, v3.4s, v5.4s
295
        # Convert back to 32-bit integers, rounding to nearest with ties to even.
296        FCVTNS  v0.4s, v0.4s
297        FCVTNS  v1.4s, v1.4s
298        FCVTNS  v2.4s, v2.4s
299        FCVTNS  v3.4s, v3.4s
300
        # params is read as: one 16-bit value added with saturation (presumably
        # the output zero point - confirm against the params struct), then one
        # byte used as lower bound (SMAX) and one byte used as upper bound (SMIN).
        # After the narrowing, v0 holds row 0 in bytes 0-7 and row 1 in bytes 8-15.
        # SUBS x1 sets the flags for both B.LO 5f and B.HI 0b; nothing in between
        # writes the condition flags.
301        LD1R    {v5.8h}, [x11], 2
302        SQXTN   v0.4h, v0.4s
303        SQXTN   v2.4h, v2.4s
304        SQXTN2  v0.8h, v1.4s
305        SQXTN2  v2.8h, v3.4s
306        SUBS    x1, x1, 8               // nc -= 8
307        SQADD   v0.8h, v0.8h, v5.8h
308        SQADD   v1.8h, v2.8h, v5.8h
309        SQXTN   v0.8b, v0.8h
310        SQXTN2  v0.16b, v1.8h
311        LD1R    {v1.16b}, [x11], 1
312        LD1R    {v2.16b}, [x11]
313        SMAX    v0.16b, v0.16b, v1.16b
314        SMIN    v0.16b, v0.16b, v2.16b
315        B.LO    5f                      // fewer than 8 columns were left
316
317        # Store full 2 x 8
318        ST1     {v0.8b}, [x6], x10      // row 0, c0 += cn_stride
319        SUB     x3, x3, x2              // a0 -= kc
320        ST1     {v0.d}[1], [x7], x10    // row 1, c1 += cn_stride
321        SUB     x4, x4, x2              // a1 -= kc
322        B.HI    0b                      // more columns remain
323
324        # Restore x20,x21 from stack
325        LDP     x20, x21, [sp, 64]
326
327        # Restore d8-d15 from stack
328        LDP     d14, d15, [sp, 48]
329        LDP     d12, d13, [sp, 32]
330        LDP     d10, d11, [sp, 16]
331        LDP     d8, d9, [sp], 80        // also releases the 80-byte frame
332        RET
333
334        # Remainder - 8 bytes of A
        # Reached with 8 bytes of each A row left. Reads 64 bytes of B (8 bytes
        # for each of the 8 columns); v6/v7 hold B data here, not A. Each product
        # vector is added straight into its accumulator with SADALP.
335        .p2align 3
3364:
337        LDR     d0, [x3], 8
338        LDP     d4, d5, [x5]
339        LDR     d1, [x4], 8
340        LDP     d6, d7, [x5, 16]
341        SMULL   v2.8h, v4.8b, v0.8b
342        SMULL   v3.8h, v4.8b, v1.8b
343        SMULL   v10.8h, v5.8b, v0.8b
344        SMULL   v11.8h, v5.8b, v1.8b
345        SMULL   v12.8h, v6.8b, v0.8b
346        SADALP  v16.4s,  v2.8h
347        SMULL   v13.8h, v6.8b, v1.8b
348        SADALP  v17.4s,  v3.8h
349        SMULL   v14.8h, v7.8b, v0.8b
350        SADALP  v18.4s, v10.8h
351        SMULL   v15.8h, v7.8b, v1.8b
352        SADALP  v19.4s, v11.8h
353        LDP     d4, d5, [x5, 32]
354        SMULL   v2.8h, v4.8b, v0.8b
355        SADALP  v20.4s, v12.8h
356        SMULL   v3.8h, v4.8b, v1.8b
357        SADALP  v21.4s, v13.8h
358        SMULL   v10.8h, v5.8b, v0.8b
359        SADALP  v22.4s, v14.8h
360        SMULL   v11.8h, v5.8b, v1.8b
361        SADALP  v23.4s, v15.8h
362        LDP     d6, d7, [x5, 48]
363        SMULL   v12.8h, v6.8b, v0.8b
364        SADALP  v24.4s,  v2.8h
365        SMULL   v13.8h, v6.8b, v1.8b
366        SADALP  v25.4s,  v3.8h
367        SMULL   v14.8h, v7.8b, v0.8b
368        SADALP  v26.4s, v10.8h
369        SMULL   v15.8h, v7.8b, v1.8b
370        SADALP  v27.4s, v11.8h
371        ADD     x5, x5, 64              // advance w past 8 K bytes x 8 columns
372        SADALP  v28.4s, v12.8h
373        SADALP  v29.4s, v13.8h
374        SADALP  v30.4s, v14.8h
375        SADALP  v31.4s, v15.8h
376        B       3b
377
378        # Store odd width
        # x1 holds nc - 8 here; its low three bits equal those of the remaining
        # column count. Bits 2, 1 and 0 select stores of 4, 2 and 1 columns.
        # Row 0 is stored from the start of v0 and row 1 from byte 8; each EXT
        # rotates v0 so the next unstored columns move to those two positions.
379        .p2align 3
3805:
381        TBZ     x1, 2, 6f
382        STR     s0, [x6], 4             // 4 columns of row 0
383        ST1     {v0.s}[2], [x7], 4      // 4 columns of row 1
384        EXT     v0.16b, v0.16b, v0.16b, 4
385
3866:
387        TBZ     x1, 1, 7f
388        STR     h0, [x6], 2             // 2 columns of row 0
389        ST1     {v0.h}[4], [x7], 2      // 2 columns of row 1
390        EXT     v0.16b, v0.16b, v0.16b, 2
3917:
392        TBZ     x1, 0, 8f
393        STR     b0, [x6]                // 1 column of row 0
394        ST1     {v0.b}[8], [x7]         // 1 column of row 1
3958:
396        # Restore x20,x21 from stack
397        LDP     x20, x21, [sp, 64]
398
399        # Restore d8-d15 from stack
400        LDP     d14, d15, [sp, 48]
401        LDP     d12, d13, [sp, 32]
402        LDP     d10, d11, [sp, 16]
403        LDP     d8, d9, [sp], 80        // also releases the 80-byte frame
404        RET
405
406END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53
407
408#ifdef __ELF__
409.section ".note.GNU-stack","",%progbits
410#endif
411
412