// xref: /aosp_15_r20/external/XNNPACK/src/qu8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]

#include <xnnpack/assembly.h>

// REWIND_DECREMENT is the number of bytes the params pointer (x11) advances
// while the requantization parameters are loaded for one pass over 16 columns.
// The kernel subtracts it again so that the next pass re-reads the same fields.
//   RNDNU: 4 + 4 + 4 + 2 + 1 = 15  (pre-shift, multiplier, post-shift, zero point, min)
//   FP32:  4 + 2 + 1 = 7           (scale, zero point, min)
// The final load (output_max) does not post-increment x11, and the leading
// kernel_zero_point load happens once, outside the loop, so neither is counted.
$REWIND_DECREMENT = {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
# void xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11
#
# Produces a tile of up to 4 rows by 16 columns of 8-bit outputs per pass and
# repeats over nc in steps of 16 columns.
#
# A is consumed 4 bytes at a time with UDOT (unsigned dot product), so kc is
# rounded up to a multiple of 4 on entry. For every pass the kernel reads 16
# 32-bit initial accumulator values from w, followed by 64 bytes of w for each
# 4 bytes of A.
#
# Rows beyond mr are aliased onto the previous row (both the A pointer and the
# C pointer), so all 4 rows are always computed and stored.
#
# The kernel zero point contribution of a row, the sum over k of
# kernel_zero_point[k % 4] * a[k], is accumulated in v12-v15 and subtracted
# from every accumulator of that row before requantization.
#
# The tail of a pass stores 8, 4, 2 and 1 columns according to the low 4 bits
# of nc and then returns.

$if REQUANTIZATION == "RNDNU":
  # params structure is 20 bytes
  #  struct {
  #    uint8_t kernel_zero_point[4];
  #    int32_t right_pre_shift;
  #    int32_t multiplier;
  #    int32_t right_post_shift;
  #    int16_t output_zero_point;
  #    uint8_t output_min;
  #    uint8_t output_max;
  #  } rndnu_neon;
$elif REQUANTIZATION == "FP32":
  # params structure is 12 bytes
  #  struct {
  #    uint8_t kernel_zero_point[4];
  #    float scale;
  #    int16_t output_zero_point;
  #    uint8_t output_min;
  #    uint8_t output_max;
  #  } fp32_neonv8;

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel uses v8-v15 and therefore saves and restores d8-d15.

# Register usage
# A0  x3  v0  v4
# A1 x15  v1  v5
# A2 x13  v2  v6
# A3  x4  v3  (v0)
# B   x5  v8  v9 v10 v11
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# zero point v7 v12 v13 v14 v15

# x14 temp for Cortex-A55 loads
# Each 128-bit B vector is assembled from a 64-bit LDR into the low half, a
# 64-bit LDR into x14 and an INS of x14 into the high half.

BEGIN_FUNCTION xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers
        # The stack arguments are read before sp is moved by the d8-d15 save.
        # The flags of CMP x0, 2 stay live until the LS selects below; none of
        # the interleaved ADD, STP or BIC instructions write flags.
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // cn_stride, params
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride

        # Save d8-d15 to stack
        STP     d8, d9, [sp, -64]!
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0
        BIC     x2, x2, 3
        STP     d10, d11, [sp, 16]

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
        STP     d12, d13, [sp, 32]
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1
        STP     d14, d15, [sp, 48]

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride

        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point

        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2


        # Start of one pass over 16 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32

        # Clear the per-row zero point accumulators
        MOVI    v12.4s, 0
        MOVI    v13.4s, 0
        MOVI    v14.4s, 0
        MOVI    v15.4s, 0

        # Rows 1-3 start from the same bias values as row 0
        LDP     q24, q28, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        # Is there at least 16 bytes for prologue/epilogue?
        B.LO    4f

        # prologue - read A and B values for block 0 and 1
        LDR     d0,  [x3], 8
        LDR     q8,  [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3,  [x4], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9,  [x5], 8
        LDR     x14, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    2f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.
        #
        # On entry v0-v3 hold the first 8 bytes of A0-A3, v8 holds the first B
        # vector, v9 holds the low half of the second and x14 its high half.
        # The loop leaves the registers in the same state for its next
        # iteration or for the epilogue.

        .p2align 3
1:
        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        # Zero point sums for the first 8 bytes of each row
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v8.16b, v3.4b[1]
        LDR     d4,  [x3], 8            // A0, second 8 bytes

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v9.16b, v3.4b[1]
        LDR     d5, [x15], 8            // A1, second 8 bytes

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x13], 8            // A2, second 8 bytes

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0,  [x4], 8            // A3, second 8 bytes (v0 is free again)

        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v8.16b, v0.4b[1]
        LDR     d1, [x15], 8            // A1 for the next iteration

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v9.16b, v0.4b[1]
        LDR     d2, [x13], 8            // A2 for the next iteration

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8,  [x5], 8            // First B values for block 0 and 1
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[1]
        LDR     d3,  [x4], 8            // A3 for the next iteration

        # BLOCK 3 special
        # The last uses of v0 (as A3) are issued first so that v0 can be
        # reloaded with A0 for the next iteration inside this block.
        UDOT    v31.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        UDOT    v15.2s, v7.8b, v0.8b    // free up v0 early
        INS     v8.d[1], x14
        UDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[1]
        LDR     d0,  [x3], 8            // A0 for the next iteration
        UDOT    v30.4s, v11.16b, v6.4b[1]
        SUBS    x0, x0, 16

        # Zero point sums for the second 8 bytes of rows 0-2 (row 3 done above)
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        B.HS    1b

        # Epilogue.  Same as main loop but no preloads in final group
2:
        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        # Zero point sums for the first 8 bytes of each row
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v8.16b, v3.4b[1]
        LDR     d4,  [x3], 8            // A0, second 8 bytes

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v9.16b, v3.4b[1]
        LDR     d5, [x15], 8            // A1, second 8 bytes

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x13], 8            // A2, second 8 bytes

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0,  [x4], 8            // A3, second 8 bytes (v0 is free again)

        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8,  [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9,  [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        # BLOCK 0
        UDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        UDOT    v17.4s,  v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v19.4s,  v8.16b, v0.4b[1]

        # BLOCK 1
        UDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        UDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        UDOT    v23.4s,  v9.16b, v0.4b[1]

        # BLOCK 2
        # No further A or B loads from here on, only the last INS for v11.
        UDOT    v24.4s, v10.16b, v4.4b[1]
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[1]
        UDOT    v27.4s, v10.16b, v0.4b[1]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[1]
        UDOT    v29.4s, v11.16b, v5.4b[1]
        UDOT    v30.4s, v11.16b, v6.4b[1]
        UDOT    v31.4s, v11.16b, v0.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        # Zero point sums for the second 8 bytes of each row
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        UDOT    v15.2s, v7.8b, v0.8b

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 4f

3:
        # Reduce the two partial zero point sums of each row to one value and
        # broadcast it: v12 = row 0, v13 = row 1, v14 = row 2, v15 = row 3.
        ADDP    v0.2s, v12.2s, v13.2s
        ADDP    v1.2s, v14.2s, v15.2s
        DUP     v12.4s, v0.s[0]
        DUP     v13.4s, v0.s[1]
        DUP     v14.4s, v1.s[0]
        DUP     v15.4s, v1.s[1]

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v12.4s
        SUB     v17.4s, v17.4s, v13.4s
        SUB     v18.4s, v18.4s, v14.4s
        SUB     v19.4s, v19.4s, v15.4s
        SUB     v20.4s, v20.4s, v12.4s
        SUB     v21.4s, v21.4s, v13.4s
        SUB     v22.4s, v22.4s, v14.4s
        SUB     v23.4s, v23.4s, v15.4s
        SUB     v24.4s, v24.4s, v12.4s
        SUB     v25.4s, v25.4s, v13.4s
        SUB     v26.4s, v26.4s, v14.4s
        SUB     v27.4s, v27.4s, v15.4s
        SUB     v28.4s, v28.4s, v12.4s
        SUB     v29.4s, v29.4s, v13.4s
        SUB     v30.4s, v30.4s, v14.4s
        SUB     v31.4s, v31.4s, v15.4s

        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          LD1R    {v4.4s}, [x11], 4       // right_pre_shift
          SSHL    v16.4s, v16.4s, v4.4s   // apply right_pre_shift (SSHL shifts right for negative counts)
          SSHL    v17.4s, v17.4s, v4.4s
          SSHL    v18.4s, v18.4s, v4.4s
          SSHL    v19.4s, v19.4s, v4.4s
          SSHL    v20.4s, v20.4s, v4.4s
          SSHL    v21.4s, v21.4s, v4.4s
          SSHL    v22.4s, v22.4s, v4.4s
          SSHL    v23.4s, v23.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4       // multiplier
          SSHL    v24.4s, v24.4s, v4.4s
          SSHL    v25.4s, v25.4s, v4.4s
          SSHL    v26.4s, v26.4s, v4.4s
          SSHL    v27.4s, v27.4s, v4.4s
          SSHL    v28.4s, v28.4s, v4.4s
          SSHL    v29.4s, v29.4s, v4.4s
          SSHL    v30.4s, v30.4s, v4.4s
          SSHL    v31.4s, v31.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4       // right_post_shift
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SQDMULH v24.4s, v24.4s, v5.4s
          SQDMULH v25.4s, v25.4s, v5.4s
          SQDMULH v26.4s, v26.4s, v5.4s
          SQDMULH v27.4s, v27.4s, v5.4s
          SQDMULH v28.4s, v28.4s, v5.4s
          SQDMULH v29.4s, v29.4s, v5.4s
          SQDMULH v30.4s, v30.4s, v5.4s
          SQDMULH v31.4s, v31.4s, v5.4s
          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift by right_post_shift
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
          SRSHL   v24.4s, v24.4s, v6.4s
          SRSHL   v25.4s, v25.4s, v6.4s
          SRSHL   v26.4s, v26.4s, v6.4s
          SRSHL   v27.4s, v27.4s, v6.4s
          SRSHL   v28.4s, v28.4s, v6.4s
          SRSHL   v29.4s, v29.4s, v6.4s
          SRSHL   v30.4s, v30.4s, v6.4s
          SRSHL   v31.4s, v31.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          # Convert the int32 accumulators to float
          SCVTF   v16.4s, v16.4s
          SCVTF   v17.4s, v17.4s
          # Apply params - scale, bias and clamp
          LD1R    {v4.4s}, [x11], 4       // scale
          SCVTF   v18.4s, v18.4s
          SCVTF   v19.4s, v19.4s
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s
          SCVTF   v24.4s, v24.4s
          SCVTF   v25.4s, v25.4s
          SCVTF   v26.4s, v26.4s
          SCVTF   v27.4s, v27.4s
          SCVTF   v28.4s, v28.4s
          SCVTF   v29.4s, v29.4s
          SCVTF   v30.4s, v30.4s
          SCVTF   v31.4s, v31.4s

          # Multiply by scale
          FMUL    v16.4s, v16.4s, v4.4s
          FMUL    v17.4s, v17.4s, v4.4s
          FMUL    v18.4s, v18.4s, v4.4s
          FMUL    v19.4s, v19.4s, v4.4s
          FMUL    v20.4s, v20.4s, v4.4s
          FMUL    v21.4s, v21.4s, v4.4s
          FMUL    v22.4s, v22.4s, v4.4s
          FMUL    v23.4s, v23.4s, v4.4s
          FMUL    v24.4s, v24.4s, v4.4s
          FMUL    v25.4s, v25.4s, v4.4s
          FMUL    v26.4s, v26.4s, v4.4s
          FMUL    v27.4s, v27.4s, v4.4s
          FMUL    v28.4s, v28.4s, v4.4s
          FMUL    v29.4s, v29.4s, v4.4s
          FMUL    v30.4s, v30.4s, v4.4s
          FMUL    v31.4s, v31.4s, v4.4s

          # Convert back to int32, rounding to nearest with ties to even
          FCVTNS  v16.4s, v16.4s
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s
          FCVTNS  v24.4s, v24.4s
          FCVTNS  v25.4s, v25.4s
          FCVTNS  v26.4s, v26.4s
          FCVTNS  v27.4s, v27.4s
          FCVTNS  v28.4s, v28.4s
          FCVTNS  v29.4s, v29.4s
          FCVTNS  v30.4s, v30.4s
          FCVTNS  v31.4s, v31.4s

        # Narrow int32 to int16 with saturation.
        # v16-v19 receive columns 0-7 of rows 0-3, v24-v27 columns 8-15.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // output_zero_point

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        # Add output_zero_point with saturation
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Narrow int16 to uint8 with unsigned saturation; v0-v3 = rows 0-3
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h

        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        # Each C pointer advances by cn_stride and each A pointer is moved
        # back by the rounded kc to the start of its row. The ST1 and SUB
        # instructions do not write flags, so B.NE still tests SUBS x1 above.
        ST1     {v0.16b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        B.NE    0b

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Remainder- 4 to 12 bytes of A
        # x0 is kc - 16 when entered from the start of a pass (kc < 16), or
        # kc & 15 when entered after the epilogue. Either way bit 3 selects
        # an 8 byte step and bit 2 a 4 byte step.
        .p2align 3
4:
        TBZ     x0, 3, 5f

        # 8 bytes of A per row, 128 bytes of B
        LDR     d0,  [x3], 8
        LDP     q8,  q9,  [x5], 32
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3,  [x4], 8
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s,  v8.16b, v0.4b[0]
        UDOT    v17.4s,  v8.16b, v1.4b[0]
        UDOT    v18.4s,  v8.16b, v2.4b[0]
        UDOT    v19.4s,  v8.16b, v3.4b[0]
        UDOT    v20.4s,  v9.16b, v0.4b[0]
        UDOT    v21.4s,  v9.16b, v1.4b[0]
        UDOT    v22.4s,  v9.16b, v2.4b[0]
        UDOT    v23.4s,  v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        LDP     q8,  q9,  [x5], 32
        LDP     q10, q11, [x5], 32
        UDOT    v16.4s,  v8.16b, v0.4b[1]
        UDOT    v17.4s,  v8.16b, v1.4b[1]
        UDOT    v18.4s,  v8.16b, v2.4b[1]
        UDOT    v19.4s,  v8.16b, v3.4b[1]
        UDOT    v20.4s,  v9.16b, v0.4b[1]
        UDOT    v21.4s,  v9.16b, v1.4b[1]
        UDOT    v22.4s,  v9.16b, v2.4b[1]
        UDOT    v23.4s,  v9.16b, v3.4b[1]
        UDOT    v24.4s, v10.16b, v0.4b[1]
        UDOT    v25.4s, v10.16b, v1.4b[1]
        UDOT    v26.4s, v10.16b, v2.4b[1]
        UDOT    v27.4s, v10.16b, v3.4b[1]
        UDOT    v28.4s, v11.16b, v0.4b[1]
        UDOT    v29.4s, v11.16b, v1.4b[1]
        UDOT    v30.4s, v11.16b, v2.4b[1]
        UDOT    v31.4s, v11.16b, v3.4b[1]
        TBZ     x0, 2, 3b
5:
        # 4 bytes of A per row, 64 bytes of B. LDR s zeroes the rest of the
        # register, so the 8 byte zero point UDOT only adds these 4 bytes.
        LDR     s0,  [x3], 4
        LDP     q8,  q9,  [x5], 32
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3,  [x4], 4
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s,  v8.16b, v0.4b[0]
        UDOT    v17.4s,  v8.16b, v1.4b[0]
        UDOT    v18.4s,  v8.16b, v2.4b[0]
        UDOT    v19.4s,  v8.16b, v3.4b[0]
        UDOT    v20.4s,  v9.16b, v0.4b[0]
        UDOT    v21.4s,  v9.16b, v1.4b[0]
        UDOT    v22.4s,  v9.16b, v2.4b[0]
        UDOT    v23.4s,  v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        B       3b

        # Store odd width
        # x1 holds nc - 16 here; its low 4 bits equal the low 4 bits of the
        # remaining nc. After each partial store the unwritten columns are
        # moved down to the bottom of v0-v3.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
10:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

// Emit an empty .note.GNU-stack section on ELF targets so that the linker
// does not mark the stack as executable because of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
