// xref: /aosp_15_r20/external/XNNPACK/src/qu8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
// Template guard: the requantization code below only has FP32 and RNDNU variants.
$assert REQUANTIZATION in ["FP32", "RNDNU"]

#include <xnnpack/assembly.h>
9
# void xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x0)
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qu8_conv_minmax_params params) [sp + 24] -> (x11)
#
# NOTE(review): the element types above are written as int8_t, but the code
# treats A, the weights and the outputs as unsigned bytes (UDOT, SQXTUN,
# UMAX/UMIN). Confirm against the C declaration of this kernel.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel uses v8-v15 and saves/restores d8-d15. It uses no x19-x30.

# Register usage
# A0  x13  v0  v4
# A1  x14  v1  v5
# A2  x15  v2  v6
# A3  x10  v3  (v0)
# B    x5  v8  v9 v10 v11
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# zero point v7 v12 v13 v14 v15

# x11 temp for Cortex-A55 loads

# Overview
#   Label 0  (nc loop)  one 4 row x 16 column output tile per pass. The 16
#                       32-bit accumulators of each row start from the 16
#                       32-bit bias values read from w.
#   Label 1  (ks loop)  reads 4 A row pointers per step. A pointer equal to
#                       zero is used unchanged, any other gets a_offset added.
#   Label 2, 3          software pipelined main loop and epilogue, 16 bytes of
#                       A per row per pass.
#   Label 5, 6          remainder of 8 and/or 4 bytes of A per row.
#   Label 4             end of one ks step. After the last step: zero point
#                       correction, requantization, clamp and store.
#   Label 7 to 11       store of a partial tile (fewer than 16 columns).
#
#   Zero point handling: v7 holds the first 4 bytes of params replicated
#   across the vector (labelled kernel_zero_point below; presumably all 4
#   bytes carry the same value). v12-v15 accumulate the UDOT of v7 with the A
#   bytes of rows 0-3, and that sum is subtracted from every accumulator of
#   the row after the ks loop.

BEGIN_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp C pointers
        # Rows at or beyond mr alias the previous row pointer, so their stores
        # land on a valid row. The register saves are interleaved with this.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6,  x16, LO       //   c1 = c0
        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        # Save d8-d15 to stack
        # After this 64 byte push the stack arguments sit 64 bytes higher:
        # cn_stride at [sp, 64] and params at [sp, 88].
        STP     d8, d9, [sp, -64]!

        CSEL    x17, x16, x17, LS       //   c2 = c1 (flags still from CMP x0, 2)
        BIC     x2, x2, 3               // completes the round up of kc to 4
        STP     d10, d11, [sp, 16]
        CMP     x0, 4                   // if mr < 4
        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
        STP     d12, d13, [sp, 32]
        CSEL    x7,  x17, x7, LO        //   c3 = c2
        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point
        STP     d14, d15, [sp, 48]

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32      // columns 0-3 and 4-7 of row 0

        # Clear the per row zero point sums
        MOVI    v12.4s, 0
        MOVI    v13.4s, 0
        MOVI    v14.4s, 0
        MOVI    v15.4s, 0

        # Rows 1-3 start from the same bias as row 0
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32      // columns 8-11 and 12-15 of row 0
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        # The ADD does not touch the flags, so each CSEL still sees the
        # result of the CMP two instructions earlier.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 16 bytes for prologue/epilogue?
        # x0 goes negative on the short path, but kc is a multiple of 4 and
        # 16 is subtracted each time, so bits 3 and 2 of x0 still give the
        # 8 and 4 byte remainders tested at labels 5 and 6.
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # prologue - read A and B values for block 0 and 1
        # v8 is loaded whole. v9 gets its low half here and its high half
        # waits in x11 until the first INS of the main loop or epilogue.
        LDR     q8,  [x5], 16
        LDR     d0, [x13], 8
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9,  [x5], 8
        LDR     x11, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    3f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 rows of 4 vectors wide = 16 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.
        #
        # BLOCK n covers output columns 4n to 4n+3 and uses B vector v(8+n).
        # Each 128-bit B vector is built from a 64-bit LDR into the low half
        # and a 64-bit load into x11 that INS moves into the high half. The
        # loads run two blocks ahead of the UDOTs that use them (presumably
        # so that they can issue alongside the UDOTs on Cortex-A55).

        .p2align 3
2:
        # Group 1 of 4 - A bytes 0-3 of each row (lane 0 of v0-v3)
        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v19.4s,  v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v23.4s,  v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        # Zero point sums for A bytes 0-7 of rows 0-3
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # Group 2 of 4 - A bytes 4-7 (lane 1 of v0-v3). A bytes 8-15 are
        # loaded meanwhile into v4, v5, v6 and, for row 3, into v0.
        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v19.4s,  v8.16b, v3.4b[1]
        LDR     d4, [x13], 8

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v23.4s,  v9.16b, v3.4b[1]
        LDR     d5, [x14], 8

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x15], 8

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x10], 8            // row 3 bytes 8-15 reuse v0

        # Group 3 of 4 - A bytes 8-11 (lane 0 of v4, v5, v6 and v0 for row 3)
        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v19.4s,  v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v23.4s,  v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        # Group 4 of 4 - A bytes 12-15 (lane 1). The first 8 A bytes of the
        # next pass are loaded into v1, v2, v3 here and into v0 once its last
        # use has been issued.
        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v19.4s,  v8.16b, v0.4b[1]
        LDR     d1, [x14], 8

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v23.4s,  v9.16b, v0.4b[1]
        LDR     d2, [x15], 8

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8,  [x5], 8            // First B values for block 0 and 1
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[1]
        LDR     d3, [x10], 8

        # BLOCK 3 special
        # Row 3 goes first, together with its zero point sum, so that v0 can
        # be overwritten with row 0 of the next pass.
        UDOT    v31.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        UDOT    v15.2s, v7.8b, v0.8b    // free up v0 early
        INS     v8.d[1], x11
        UDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[1]
        LDR     d0, [x13], 8
        UDOT    v30.4s, v11.16b, v6.4b[1]
        SUBS    x0, x0, 16

        # Zero point sums for A bytes 8-15 of rows 0-2 (row 3 was done above)
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        B.HS    2b

        # Epilogue.  Same as main loop but no preloads in final group
3:
        # Group 1 of 4 - A bytes 0-3
        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v19.4s,  v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v23.4s,  v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        # Zero point sums for A bytes 0-7 of rows 0-3
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # Group 2 of 4 - A bytes 4-7, loading A bytes 8-15
        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v19.4s,  v8.16b, v3.4b[1]
        LDR     d4, [x13], 8

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v23.4s,  v9.16b, v3.4b[1]
        LDR     d5, [x14], 8

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x15], 8

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x10], 8            // row 3 bytes 8-15 reuse v0

        # Group 3 of 4 - A bytes 8-11
        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v19.4s,  v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v23.4s,  v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        # Group 4 of 4 - A bytes 12-15. No A loads here, and the B loads
        # stop once v10 and v11 are complete.
        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v19.4s,  v8.16b, v0.4b[1]

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x11,  [x5], 8
        UDOT    v23.4s,  v9.16b, v0.4b[1]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[1]
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[1]
        UDOT    v27.4s, v10.16b, v0.4b[1]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[1]
        UDOT    v29.4s, v11.16b, v5.4b[1]
        UDOT    v30.4s, v11.16b, v6.4b[1]
        UDOT    v31.4s, v11.16b, v0.4b[1]

        # Zero point sums for A bytes 8-15 of rows 0-3
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        UDOT    v15.2s, v7.8b, v0.8b

        # Is there a remainder?- 4 to 12 bytes of A
        TST     x0, 15
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Each of v12-v15 holds two partial zero point sums for one row.
        # Add the pairs, then broadcast one 32-bit total per row.
        ADDP    v0.2s, v12.2s, v13.2s   // v0 = totals of rows 0 and 1
        ADDP    v1.2s, v14.2s, v15.2s   // v1 = totals of rows 2 and 3
        LDR     x11, [sp, 88]           // Reload params
        DUP     v12.4s, v0.s[0]
        DUP     v13.4s, v0.s[1]
        DUP     v14.4s, v1.s[0]
        DUP     v15.4s, v1.s[1]
        ADD     x11, x11, 4             // skip the 4 bytes already held in v7

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v12.4s
        SUB     v17.4s, v17.4s, v13.4s
        SUB     v18.4s, v18.4s, v14.4s
        SUB     v19.4s, v19.4s, v15.4s
        SUB     v20.4s, v20.4s, v12.4s
        SUB     v21.4s, v21.4s, v13.4s
        SUB     v22.4s, v22.4s, v14.4s
        SUB     v23.4s, v23.4s, v15.4s
        SUB     v24.4s, v24.4s, v12.4s
        SUB     v25.4s, v25.4s, v13.4s
        SUB     v26.4s, v26.4s, v14.4s
        SUB     v27.4s, v27.4s, v15.4s
        SUB     v28.4s, v28.4s, v12.4s
        SUB     v29.4s, v29.4s, v13.4s
        SUB     v30.4s, v30.4s, v14.4s
        SUB     v31.4s, v31.4s, v15.4s

        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          # v4, v5 and v6 take three consecutive 32-bit params broadcast to
          # all lanes: the shift for SSHL, the multiplier for SQDMULH and the
          # shift for SRSHL.
          LD1R    {v4.4s}, [x11], 4
          SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
          SSHL    v17.4s, v17.4s, v4.4s
          SSHL    v18.4s, v18.4s, v4.4s
          SSHL    v19.4s, v19.4s, v4.4s
          SSHL    v20.4s, v20.4s, v4.4s
          SSHL    v21.4s, v21.4s, v4.4s
          SSHL    v22.4s, v22.4s, v4.4s
          SSHL    v23.4s, v23.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4
          SSHL    v24.4s, v24.4s, v4.4s
          SSHL    v25.4s, v25.4s, v4.4s
          SSHL    v26.4s, v26.4s, v4.4s
          SSHL    v27.4s, v27.4s, v4.4s
          SSHL    v28.4s, v28.4s, v4.4s
          SSHL    v29.4s, v29.4s, v4.4s
          SSHL    v30.4s, v30.4s, v4.4s
          SSHL    v31.4s, v31.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SQDMULH v24.4s, v24.4s, v5.4s
          SQDMULH v25.4s, v25.4s, v5.4s
          SQDMULH v26.4s, v26.4s, v5.4s
          SQDMULH v27.4s, v27.4s, v5.4s
          SQDMULH v28.4s, v28.4s, v5.4s
          SQDMULH v29.4s, v29.4s, v5.4s
          SQDMULH v30.4s, v30.4s, v5.4s
          SQDMULH v31.4s, v31.4s, v5.4s
          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
          SRSHL   v24.4s, v24.4s, v6.4s
          SRSHL   v25.4s, v25.4s, v6.4s
          SRSHL   v26.4s, v26.4s, v6.4s
          SRSHL   v27.4s, v27.4s, v6.4s
          SRSHL   v28.4s, v28.4s, v6.4s
          SRSHL   v29.4s, v29.4s, v6.4s
          SRSHL   v30.4s, v30.4s, v6.4s
          SRSHL   v31.4s, v31.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          # Convert the 32-bit integer accumulators to float
          SCVTF   v16.4s, v16.4s
          SCVTF   v17.4s, v17.4s
          # Apply params - scale, bias and clamp
          LD1R    {v4.4s}, [x11], 4       // 32-bit scale, broadcast
          SCVTF   v18.4s, v18.4s
          SCVTF   v19.4s, v19.4s
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s
          SCVTF   v24.4s, v24.4s
          SCVTF   v25.4s, v25.4s
          SCVTF   v26.4s, v26.4s
          SCVTF   v27.4s, v27.4s
          SCVTF   v28.4s, v28.4s
          SCVTF   v29.4s, v29.4s
          SCVTF   v30.4s, v30.4s
          SCVTF   v31.4s, v31.4s

          FMUL    v16.4s, v16.4s, v4.4s
          FMUL    v17.4s, v17.4s, v4.4s
          FMUL    v18.4s, v18.4s, v4.4s
          FMUL    v19.4s, v19.4s, v4.4s
          FMUL    v20.4s, v20.4s, v4.4s
          FMUL    v21.4s, v21.4s, v4.4s
          FMUL    v22.4s, v22.4s, v4.4s
          FMUL    v23.4s, v23.4s, v4.4s
          FMUL    v24.4s, v24.4s, v4.4s
          FMUL    v25.4s, v25.4s, v4.4s
          FMUL    v26.4s, v26.4s, v4.4s
          FMUL    v27.4s, v27.4s, v4.4s
          FMUL    v28.4s, v28.4s, v4.4s
          FMUL    v29.4s, v29.4s, v4.4s
          FMUL    v30.4s, v30.4s, v4.4s
          FMUL    v31.4s, v31.4s, v4.4s

          # Back to 32-bit integers, rounding to nearest with ties to even
          FCVTNS  v16.4s, v16.4s
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s
          FCVTNS  v24.4s, v24.4s
          FCVTNS  v25.4s, v25.4s
          FCVTNS  v26.4s, v26.4s
          FCVTNS  v27.4s, v27.4s
          FCVTNS  v28.4s, v28.4s
          FCVTNS  v29.4s, v29.4s
          FCVTNS  v30.4s, v30.4s
          FCVTNS  v31.4s, v31.4s

        # Narrow 32-bit to 16-bit with signed saturation. Columns 0-3 and 8-11
        # fill the low halves here, columns 4-7 and 12-15 the high halves below.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2        // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        # Saturating add of the 16-bit value loaded into v6 above
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Narrow 16-bit to unsigned 8-bit with saturation. v0-v3 end up
        # holding the 16 output bytes of rows 0-3.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h
        LDR     x0, [sp, 64]            // Load cn_stride

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16, flags reused by B.HI below
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    7f                      // fewer than 16 columns left

        # Store full 4 x 16
        ST1     {v3.16b},  [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b},  [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # ST1 and SUB leave the flags alone, so this still tests the
        # SUBS x1, x1, 16 above: loop while columns remain.
        B.HI    0b

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Remainder- 4 to 12 bytes of A
        # Bit 3 of x0 selects the 8 byte step and bit 2 the 4 byte step.
        # B is read with plain 128-bit pair loads here.
        .p2align 3
5:
        TBZ     x0, 3, 6f

        # 8 bytes of A per row
        LDR     d0, [x13], 8
        LDP     q8,  q9,  [x5], 32
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s,  v8.16b, v0.4b[0]
        UDOT    v17.4s,  v8.16b, v1.4b[0]
        UDOT    v18.4s,  v8.16b, v2.4b[0]
        UDOT    v19.4s,  v8.16b, v3.4b[0]
        UDOT    v20.4s,  v9.16b, v0.4b[0]
        UDOT    v21.4s,  v9.16b, v1.4b[0]
        UDOT    v22.4s,  v9.16b, v2.4b[0]
        UDOT    v23.4s,  v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        LDP     q8,  q9,  [x5], 32
        LDP     q10, q11, [x5], 32
        UDOT    v16.4s,  v8.16b, v0.4b[1]
        UDOT    v17.4s,  v8.16b, v1.4b[1]
        UDOT    v18.4s,  v8.16b, v2.4b[1]
        UDOT    v19.4s,  v8.16b, v3.4b[1]
        UDOT    v20.4s,  v9.16b, v0.4b[1]
        UDOT    v21.4s,  v9.16b, v1.4b[1]
        UDOT    v22.4s,  v9.16b, v2.4b[1]
        UDOT    v23.4s,  v9.16b, v3.4b[1]
        UDOT    v24.4s, v10.16b, v0.4b[1]
        UDOT    v25.4s, v10.16b, v1.4b[1]
        UDOT    v26.4s, v10.16b, v2.4b[1]
        UDOT    v27.4s, v10.16b, v3.4b[1]
        UDOT    v28.4s, v11.16b, v0.4b[1]
        UDOT    v29.4s, v11.16b, v1.4b[1]
        UDOT    v30.4s, v11.16b, v2.4b[1]
        UDOT    v31.4s, v11.16b, v3.4b[1]
        TBZ     x0, 2, 4b
6:
        # 4 bytes of A per row. LDR of an S register clears the rest of the
        # vector, so the 8 byte wide zero point UDOTs only add these 4 bytes.
        LDR     s0, [x13], 4
        LDP     q8,  q9,  [x5], 32
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s,  v8.16b, v0.4b[0]
        UDOT    v17.4s,  v8.16b, v1.4b[0]
        UDOT    v18.4s,  v8.16b, v2.4b[0]
        UDOT    v19.4s,  v8.16b, v3.4b[0]
        UDOT    v20.4s,  v9.16b, v0.4b[0]
        UDOT    v21.4s,  v9.16b, v1.4b[0]
        UDOT    v22.4s,  v9.16b, v2.4b[0]
        UDOT    v23.4s,  v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        B       4b

        # Store odd width
        # x1 is nc - 16 here and its low 4 bits equal the remaining column
        # count. Each step stores 8, 4, 2 or 1 bytes per row when the matching
        # bit is set, then moves the next unstored bytes to the bottom of
        # v0-v3.
        .p2align 3
7:
        TBZ     x1, 3, 8f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
8:
        TBZ     x1, 2, 9f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
9:
        TBZ     x1, 1, 10f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
10:
        TBZ     x1, 0, 11f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
11:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
790
// On ELF targets, emit an empty .note.GNU-stack section so that the linker
// does not mark the stack as executable because of this object.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
794