xref: /aosp_15_r20/external/XNNPACK/src/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qu8-igemm/4x8c4-aarch64-neondot-cortex-a55.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const uint8_t**restrict a, x4
19#     const void* restrict w,    x5
20#     uint8_t* restrict c,       x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> (x10)
23#     size_t a_offset,           [sp + 8] -> x8
24#     const uint8_t* zero,       [sp + 16] -> x12
25#     const union xnn_qu8_conv_minmax_params* params  [sp + 24] -> (x11)
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0  v4
31# A1  x14  v1  v5
32# A2  x15  v2  v6
33# A3  x10  v3  v7
34# B    x5 v28 v29 v30 v31
35# C0   x6 v16 v20
36# C1  x16 v17 v21
37# C2  x17 v18 v22
38# C3   x7 v19 v23
39# zero_point v8 v24 v25 v26 v27
40# unused v9 v10 v11 v12 v13 v14 v15
41
42# x11 temp for Cortex-A55 loads: carries the upper 64 bits of each B vector (LDR x11 + INS)
43
# QU8 indirect-GEMM micro-kernel producing a tile of 4 rows x 8 columns.
# Per nc iteration (label 0): seed the 32-bit accumulators from w, walk the
# A-pointer array 4 pointers at a time (label 1) accumulating UDOT products,
# subtract the accumulated zero-point term, requantize, clamp and store.
# All 4 rows are always computed; rows at or beyond mr reuse a lower row's
# C pointer.
44BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55
45
46        # Clamp C pointers: a row at or beyond mr aliases the previous row's pointer
47        CMP     x0, 2                   // if mr < 2
48        LDR     x8, [sp, 8]             // Load a_offset
49        ADD     x16, x6, x7             // c1 = c0 + cm_stride
50        CSEL    x16, x6,  x16, LO       //   c1 = c0
51        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
52        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
53        ADD     x17, x16, x7            // c2 = c1 + cm_stride
54        STR     d8, [sp, -16]!          // Save d8 on stack; stack arguments are now 16 bytes higher
55                                        // if mr <= 2 (flags still from CMP x0, 2)
56        CSEL    x17, x16, x17, LS       //   c2 = c1
57        BIC     x2, x2, 3               // completes kc = (kc + 3) & ~3
58        CMP     x0, 4                   // if mr < 4
59        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point, broadcast to every 32-bit lane
60        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
61        CSEL    x7,  x17, x7, LO        //   c3 = c2
62
63        .p2align 3
640:
65        # Load initial bias from w into accumulators
        # (the same 8 bias values seed all 4 rows)
66        LDP     q16, q20, [x5], 32
67        MOV     v17.16b, v16.16b
68        MOV     v18.16b, v16.16b
69        MOV     v19.16b, v16.16b
70        MOV     v21.16b, v20.16b
71        MOV     v22.16b, v20.16b
72        MOV     v23.16b, v20.16b
73        MOVI    v24.16b, 0              // v24-v27 = 0: per-row zero-point sums
74        MOVI    v25.16b, 0
75        MOVI    v26.16b, 0
76        MOVI    v27.16b, 0
77        MOV     x9, x3                  // p = ks
78
79        .p2align 3
801:
81        # Load next 4 A pointers
82        LDP     x13, x14, [x4], 16
83        LDP     x15, x10, [x4], 16
84
        # a_offset is applied to every pointer except the shared zero buffer
85        CMP     x13, x12                // if a0 == zero
86        ADD     x13, x13, x8            // a0 += a_offset
87        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
88        CMP     x14, x12                // if a1 == zero
89        ADD     x14, x14, x8            // a1 += a_offset
90        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
91        CMP     x15, x12                // if a2 == zero
92        ADD     x15, x15, x8            // a2 += a_offset
93        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
94        CMP     x10, x12                // if a3 == zero
95        ADD     x10, x10, x8            // a3 += a_offset
96        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset
97
98        # Are there at least 16 bytes for prologue/epilogue?
99        SUBS    x0, x2, 16              // k = kc - 16
100        B.LO    5f
101
102        # prologue - read A and B values for block 0 and 1
103        LDR     d0, [x13], 8
104        LDR     q28, [x5], 16
105        LDR     d1, [x14], 8
106        LDR     d2, [x15], 8
107        LDR     d3, [x10], 8
108        SUBS    x0, x0, 16              // is there 16 for main loop?
109        LDR     d29, [x5], 8
110        LDR     x11, [x5], 8            // upper half of v29, merged by INS below
111        # Are there at least 16 bytes for main loop?
112        B.LO    3f
113
114        # Main loop - 16 bytes of A in 4 groups of 2 blocks
115        # 4 rows of 2 vectors wide = 8 UDOT instructions for 4 channels
116        # 4 LD64 for A
117        # 4 LD128 for W. = 2 LD64 + INS.
118        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.
        # BLOCK 0 accumulates into v16-v19, BLOCK 1 into v20-v23.
        # The upper 64 bits of each B vector arrive through x11 and are merged
        # with INS one block before that vector is used.
119
120        .p2align 3
1212:
122        # BLOCK 0
123        UDOT    v16.4s,  v28.16b, v0.4b[0]
124        LDR     d30,  [x5], 8
125        UDOT    v17.4s,  v28.16b, v1.4b[0]
126        INS     v29.d[1], x11
127        UDOT    v18.4s,  v28.16b, v2.4b[0]
128        LDR     x11,  [x5], 8
129        UDOT    v19.4s,  v28.16b, v3.4b[0]
130        LDR     d4, [x13], 8
131
132        # BLOCK 1
133        UDOT    v20.4s,  v29.16b, v0.4b[0]
134        LDR     d31,  [x5], 8
135        UDOT    v21.4s,  v29.16b, v1.4b[0]
136        INS     v30.d[1], x11
137        UDOT    v22.4s,  v29.16b, v2.4b[0]
138        LDR     x11,  [x5], 8
139        UDOT    v23.4s,  v29.16b, v3.4b[0]
140        LDR     d5, [x14], 8
141
142        # BLOCK 0
143        UDOT    v16.4s, v30.16b, v0.4b[1]
144        LDR     d28,  [x5], 8
145        UDOT    v17.4s, v30.16b, v1.4b[1]
146        INS     v31.d[1], x11
147        UDOT    v18.4s, v30.16b, v2.4b[1]
148        LDR     x11,  [x5], 8
149        UDOT    v19.4s, v30.16b, v3.4b[1]
150        LDR     d6, [x15], 8
151
152        # BLOCK 1
153        UDOT    v20.4s, v31.16b, v0.4b[1]
154        LDR     d29,  [x5], 8
155        UDOT    v21.4s, v31.16b, v1.4b[1]
156        INS     v28.d[1], x11
157        UDOT    v22.4s, v31.16b, v2.4b[1]
158        LDR     x11,  [x5], 8
159        UDOT    v23.4s, v31.16b, v3.4b[1]
160        LDR     d7, [x10], 8
161
        # Zero-point term: dot v8 with the 8 A bytes just consumed from each row
162        UDOT    v24.2s, v8.8b, v0.8b
163        UDOT    v25.2s, v8.8b, v1.8b
164        UDOT    v26.2s, v8.8b, v2.8b
165        UDOT    v27.2s, v8.8b, v3.8b
166
        # Second 8 bytes of A (v4-v7) while v0-v3 are refilled for the next pass
167        # BLOCK 0
168        UDOT    v16.4s,  v28.16b, v4.4b[0]
169        LDR     d30,  [x5], 8
170        UDOT    v17.4s,  v28.16b, v5.4b[0]
171        INS     v29.d[1], x11
172        UDOT    v18.4s,  v28.16b, v6.4b[0]
173        LDR     x11,  [x5], 8
174        UDOT    v19.4s,  v28.16b, v7.4b[0]
175        LDR     d0, [x13], 8
176
177        # BLOCK 1
178        UDOT    v20.4s,  v29.16b, v4.4b[0]
179        LDR     d31,  [x5], 8
180        UDOT    v21.4s,  v29.16b, v5.4b[0]
181        INS     v30.d[1], x11
182        UDOT    v22.4s,  v29.16b, v6.4b[0]
183        LDR     x11,  [x5], 8
184        UDOT    v23.4s,  v29.16b, v7.4b[0]
185        LDR     d1, [x14], 8
186
187        # BLOCK 0
188        UDOT    v16.4s, v30.16b, v4.4b[1]
189        LDR     d28,  [x5], 8
190        UDOT    v17.4s, v30.16b, v5.4b[1]
191        INS     v31.d[1], x11
192        UDOT    v18.4s, v30.16b, v6.4b[1]
193        LDR     x11,  [x5], 8
194        UDOT    v19.4s, v30.16b, v7.4b[1]
195        LDR     d2, [x15], 8
196
197        # BLOCK 1
198        UDOT    v20.4s, v31.16b, v4.4b[1]
199        LDR     d29,  [x5], 8
200        UDOT    v21.4s, v31.16b, v5.4b[1]
201        INS     v28.d[1], x11
202        UDOT    v22.4s, v31.16b, v6.4b[1]
203        LDR     x11,  [x5], 8
204        UDOT    v23.4s, v31.16b, v7.4b[1]
205        LDR     d3, [x10], 8
206
        # Zero-point term for the second 8 bytes; SUBS sets the loop condition
207        UDOT    v24.2s, v8.8b, v4.8b
208        UDOT    v25.2s, v8.8b, v5.8b
209        SUBS    x0, x0, 16
210        UDOT    v26.2s, v8.8b, v6.8b
211        UDOT    v27.2s, v8.8b, v7.8b
212
213        B.HS    2b
214
215        # Epilogue.  Same as main loop but no preloads in final group
2163:
217        # BLOCK 0
218        UDOT    v16.4s,  v28.16b, v0.4b[0]
219        LDR     d30,  [x5], 8
220        UDOT    v17.4s,  v28.16b, v1.4b[0]
221        INS     v29.d[1], x11
222        UDOT    v18.4s,  v28.16b, v2.4b[0]
223        LDR     x11,  [x5], 8
224        UDOT    v19.4s,  v28.16b, v3.4b[0]
225        LDR     d4, [x13], 8
226
227        # BLOCK 1
228        UDOT    v20.4s,  v29.16b, v0.4b[0]
229        LDR     d31,  [x5], 8
230        UDOT    v21.4s,  v29.16b, v1.4b[0]
231        INS     v30.d[1], x11
232        UDOT    v22.4s,  v29.16b, v2.4b[0]
233        LDR     x11,  [x5], 8
234        UDOT    v23.4s,  v29.16b, v3.4b[0]
235        LDR     d5, [x14], 8
236
237        # BLOCK 0
238        UDOT    v16.4s, v30.16b, v0.4b[1]
239        LDR     d28,  [x5], 8
240        UDOT    v17.4s, v30.16b, v1.4b[1]
241        INS     v31.d[1], x11
242        UDOT    v18.4s, v30.16b, v2.4b[1]
243        LDR     x11,  [x5], 8
244        UDOT    v19.4s, v30.16b, v3.4b[1]
245        LDR     d6, [x15], 8
246
247        # BLOCK 1
248        UDOT    v20.4s, v31.16b, v0.4b[1]
249        LDR     d29,  [x5], 8
250        UDOT    v21.4s, v31.16b, v1.4b[1]
251        INS     v28.d[1], x11
252        UDOT    v22.4s, v31.16b, v2.4b[1]
253        LDR     x11,  [x5], 8
254        UDOT    v23.4s, v31.16b, v3.4b[1]
255        LDR     d7, [x10], 8
256
        # Zero-point term for the first 8 bytes of each row
257        UDOT    v24.2s, v8.8b, v0.8b
258        UDOT    v25.2s, v8.8b, v1.8b
259        UDOT    v26.2s, v8.8b, v2.8b
260        UDOT    v27.2s, v8.8b, v3.8b
261
        # Final 8 bytes of A: only the B loads still needed are issued, no A loads
262        # BLOCK 0
263        UDOT    v16.4s,  v28.16b, v4.4b[0]
264        LDR     d30,  [x5], 8
265        UDOT    v17.4s,  v28.16b, v5.4b[0]
266        INS     v29.d[1], x11
267        UDOT    v18.4s,  v28.16b, v6.4b[0]
268        LDR     x11,  [x5], 8
269        UDOT    v19.4s,  v28.16b, v7.4b[0]
270
271        # BLOCK 1
272        UDOT    v20.4s,  v29.16b, v4.4b[0]
273        LDR     d31,  [x5], 8
274        UDOT    v21.4s,  v29.16b, v5.4b[0]
275        INS     v30.d[1], x11
276        UDOT    v22.4s,  v29.16b, v6.4b[0]
277        LDR     x11,  [x5], 8
278        UDOT    v23.4s,  v29.16b, v7.4b[0]
279
280        # BLOCK 0
281        UDOT    v16.4s, v30.16b, v4.4b[1]
282        UDOT    v17.4s, v30.16b, v5.4b[1]
283        INS     v31.d[1], x11
284        UDOT    v18.4s, v30.16b, v6.4b[1]
285        UDOT    v19.4s, v30.16b, v7.4b[1]
286
287        # BLOCK 1
288        UDOT    v20.4s, v31.16b, v4.4b[1]
289        UDOT    v21.4s, v31.16b, v5.4b[1]
290        UDOT    v22.4s, v31.16b, v6.4b[1]
291        UDOT    v23.4s, v31.16b, v7.4b[1]
292
293        AND     x0, x2, 15              // kc remainder 0 to 12
294
        # Zero-point term for the final 8 bytes of each row
295        UDOT    v24.2s, v8.8b, v4.8b
296        UDOT    v25.2s, v8.8b, v5.8b
297        UDOT    v26.2s, v8.8b, v6.8b
298        UDOT    v27.2s, v8.8b, v7.8b
299
300        # Is there a remainder? - 4 to 12 bytes of A
301        CBNZ    x0, 5f
302
303        .p2align 3
3044:
305        # ks loop
306        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
307        B.HI    1b
308
        # Fold the two partial zero-point sums of each row into one value and
        # broadcast it across that row's vector (v24 = row 0 ... v27 = row 3)
309        ADDP    v0.2s, v24.2s, v25.2s
310        ADDP    v1.2s, v26.2s, v27.2s
311        LDR     x11, [sp, 40]           // reload params pointer (x11 was a temp); [sp + 24] plus 16 for saved d8
312        DUP     v24.4s, v0.s[0]
313        DUP     v25.4s, v0.s[1]
314        DUP     v26.4s, v1.s[0]
315        DUP     v27.4s, v1.s[1]
316        ADD     x11, x11, 4             // skip kernel_zero_point, already held in v8
317
318        # Subtract zero point from accumulators
319        SUB     v16.4s, v16.4s, v24.4s
320        SUB     v17.4s, v17.4s, v25.4s
321        SUB     v18.4s, v18.4s, v26.4s
322        SUB     v19.4s, v19.4s, v27.4s
323        SUB     v20.4s, v20.4s, v24.4s
324        SUB     v21.4s, v21.4s, v25.4s
325        SUB     v22.4s, v22.4s, v26.4s
326        SUB     v23.4s, v23.4s, v27.4s
327
328        # Apply params - preshift, scale, postshift, bias and clamp
        # Parameters are read sequentially from x11 in exactly that order
329        LD1R    {v4.4s}, [x11], 4       // preshift
330        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
331        SSHL    v17.4s, v17.4s, v4.4s
332        SSHL    v18.4s, v18.4s, v4.4s
333        SSHL    v19.4s, v19.4s, v4.4s
334        LD1R    {v5.4s}, [x11], 4       // scale
335        SSHL    v20.4s, v20.4s, v4.4s
336        SSHL    v21.4s, v21.4s, v4.4s
337        SSHL    v22.4s, v22.4s, v4.4s
338        SSHL    v23.4s, v23.4s, v4.4s
339        LD1R    {v6.4s}, [x11], 4       // postshift
340        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
341        SQDMULH v17.4s, v17.4s, v5.4s
342        SQDMULH v18.4s, v18.4s, v5.4s
343        SQDMULH v19.4s, v19.4s, v5.4s
344        SQDMULH v20.4s, v20.4s, v5.4s
345        SQDMULH v21.4s, v21.4s, v5.4s
346        SQDMULH v22.4s, v22.4s, v5.4s
347        SQDMULH v23.4s, v23.4s, v5.4s
348        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
349        SRSHL   v17.4s, v17.4s, v6.4s
350        SRSHL   v18.4s, v18.4s, v6.4s
351        SRSHL   v19.4s, v19.4s, v6.4s
352        SRSHL   v20.4s, v20.4s, v6.4s
353        SRSHL   v21.4s, v21.4s, v6.4s
354        SRSHL   v22.4s, v22.4s, v6.4s
355        SRSHL   v23.4s, v23.4s, v6.4s
356
        # Narrow 32 -> 16 bits with signed saturation; v20-v23 fill the upper halves
357        SQXTN   v16.4h, v16.4s
358        SQXTN   v17.4h, v17.4s
359        SQXTN   v18.4h, v18.4s
360        SQXTN   v19.4h, v19.4s
361        LD1R    {v6.8h}, [x11], 2        // 16-bit bias, added below (v6 postshift is dead here)
362
363        SQXTN2  v16.8h, v20.4s
364        SQXTN2  v17.8h, v21.4s
365        SQXTN2  v18.8h, v22.4s
366        SQXTN2  v19.8h, v23.4s
367
368        SQADD   v16.8h, v16.8h, v6.8h
369        SQADD   v17.8h, v17.8h, v6.8h
370        LDR     x10, [sp, 16]            // Load cn_stride ([sp] plus 16 for saved d8); x10 no longer holds a3
371        SQADD   v18.8h, v18.8h, v6.8h
372        SQADD   v19.8h, v19.8h, v6.8h
373        LD1R    {v4.16b}, [x11], 1       // clamp min value
374
        # Narrow signed 16 -> unsigned 8 bits with saturation:
        # v0 = row 0 (low half) | row 1 (high half), v1 = row 2 | row 3
375        SQXTUN  v0.8b, v16.8h
376        SQXTUN  v1.8b, v18.8h
377        LD1R    {v5.16b}, [x11]          // clamp max value
378        SQXTUN2 v0.16b, v17.8h
379        SQXTUN2 v1.16b, v19.8h
380
381        UMAX    v0.16b, v0.16b, v4.16b
382        UMAX    v1.16b, v1.16b, v4.16b
383        SUBS    x1, x1, 8                // nc -= 8; flags feed both B.LO and B.HI below
384        UMIN    v0.16b, v0.16b, v5.16b
385        UMIN    v1.16b, v1.16b, v5.16b
386        B.LO    7f                       // fewer than 8 columns left: partial store
387
388        # Store full 4 x 8
        # Rows are stored from c3 down to c0; each C pointer advances by cn_stride
389        ST1     {v1.d}[1],  [x7], x10
390        ST1     {v1.8b}, [x17], x10
391        ST1     {v0.d}[1], [x16], x10
392        ST1     {v0.8b},  [x6], x10
393        SUB     x4, x4, x3              // a -= ks
394
395        # nc loop
        # (flags are still those of SUBS x1, x1, 8: the stores and SUB leave them untouched)
396        B.HI    0b
397
398        # Restore d8 from stack
399        LDR     d8, [sp], 16
400        RET
401
402        # Remainder - 4 to 12 bytes of A
        # Bit 3 of x0 selects an 8-byte group, bit 2 a trailing 4-byte group
403        .p2align 3
4045:
405        TBZ     x0, 3, 6f               // no 8-byte group: go to the 4-byte tail
406
407        LDR     d0, [x13], 8
408        LDR     q4, [x5], 16
409        LDR     d1, [x14], 8
410        LDR     d2, [x15], 8
411        LDR     d3, [x10], 8
412        LDR     q5, [x5], 16
413        UDOT    v24.2s, v8.8b, v0.8b
414        UDOT    v25.2s, v8.8b, v1.8b
415        UDOT    v26.2s, v8.8b, v2.8b
416        UDOT    v27.2s, v8.8b, v3.8b
417        UDOT    v16.4s, v4.16b, v0.4b[0]
418        UDOT    v17.4s, v4.16b, v1.4b[0]
419        UDOT    v18.4s, v4.16b, v2.4b[0]
420        UDOT    v19.4s, v4.16b, v3.4b[0]
421        LDR     q6, [x5], 16
422        UDOT    v20.4s, v5.16b, v0.4b[0]
423        UDOT    v21.4s, v5.16b, v1.4b[0]
424        UDOT    v22.4s, v5.16b, v2.4b[0]
425        UDOT    v23.4s, v5.16b, v3.4b[0]
426        LDR     q4, [x5], 16
427        UDOT    v16.4s, v6.16b, v0.4b[1]
428        UDOT    v17.4s, v6.16b, v1.4b[1]
429        UDOT    v18.4s, v6.16b, v2.4b[1]
430        UDOT    v19.4s, v6.16b, v3.4b[1]
431        UDOT    v20.4s, v4.16b, v0.4b[1]
432        UDOT    v21.4s, v4.16b, v1.4b[1]
433        UDOT    v22.4s, v4.16b, v2.4b[1]
434        UDOT    v23.4s, v4.16b, v3.4b[1]
435        TBZ     x0, 2, 4b               // no 4-byte tail: back to the ks loop
4366:
        # Final 4 bytes of A per row. LDR s clears the rest of the register,
        # so the second lane of each zero-point UDOT adds 0.
437        LDR     s0, [x13], 4
438        LDR     q4, [x5], 16
439        LDR     s1, [x14], 4
440        LDR     s2, [x15], 4
441        LDR     s3, [x10], 4
442        LDR     q5, [x5], 16
443        UDOT    v24.2s, v8.8b, v0.8b
444        UDOT    v25.2s, v8.8b, v1.8b
445        UDOT    v26.2s, v8.8b, v2.8b
446        UDOT    v27.2s, v8.8b, v3.8b
447        UDOT    v16.4s, v4.16b, v0.4b[0]
448        UDOT    v17.4s, v4.16b, v1.4b[0]
449        UDOT    v18.4s, v4.16b, v2.4b[0]
450        UDOT    v19.4s, v4.16b, v3.4b[0]
451        UDOT    v20.4s, v5.16b, v0.4b[0]
452        UDOT    v21.4s, v5.16b, v1.4b[0]
453        UDOT    v22.4s, v5.16b, v2.4b[0]
454        UDOT    v23.4s, v5.16b, v3.4b[0]
455        B       4b
456
457        # Store odd width
        # x1 is nc - 8 here, so its low 3 bits equal those of the remaining nc.
        # Each row occupies one 64-bit half of v0/v1; after each partial store
        # EXT rotates both halves so the next columns sit at the start of each half.
458        .p2align 3
4597:
460        TBZ     x1, 2, 8f               // 4 columns?
461        ST1     {v1.s}[2], [x7], 4
462        STR     s1, [x17], 4
463        ST1     {v0.s}[2], [x16], 4
464        STR     s0, [x6], 4
465        EXT     v0.16b, v0.16b, v0.16b, 4
466        EXT     v1.16b, v1.16b, v1.16b, 4
4678:
468        TBZ     x1, 1, 9f               // 2 columns?
469        ST1     {v1.h}[4], [x7], 2
470        STR     h1, [x17], 2
471        ST1     {v0.h}[4], [x16], 2
472        STR     h0, [x6], 2
473        EXT     v0.16b, v0.16b, v0.16b, 2
474        EXT     v1.16b, v1.16b, v1.16b, 2
4759:
476        TBZ     x1, 0, 10f              // 1 column?
477        ST1     {v1.b}[8], [x7]
478        STR     b1, [x17]
479        ST1     {v0.b}[8], [x16]
480        STR     b0, [x6]
48110:
482        # Restore d8 from stack
483        LDR     d8, [sp], 16
484        RET
485
486END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55
487
488#ifdef __ELF__
489.section ".note.GNU-stack","",%progbits
490#endif
491