xref: /aosp_15_r20/external/XNNPACK/src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qu8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2021 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const uint8_t**restrict a, x4
19#     const void* restrict w,    x5
20#     uint8_t* restrict c,       x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,          [sp] -> (x0)
23#     size_t a_offset,           [sp + 8] -> x8
24#     const uint8_t* zero,       [sp + 16] -> x12
25#     const union xnn_qu8_conv_minmax_params* params) [sp + 24] -> (x11)
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0  v4
31# A1  x14  v1  v5
32# A2  x15  v2  v6
33# A3  x10  v3  (v0)
34# B    x5  v8  v9 v10 v11
35# C0   x6 v16 v20 v24 v28
36# C1  x16 v17 v21 v25 v29
37# C2  x17 v18 v22 v26 v30
38# C3   x7 v19 v23 v27 v31
39# zero point v7 (kernel zero point); v12 v13 v14 v15 (zero point * A sums for rows 0-3)
40
41# x11 temp for Cortex-A55 loads
42
43BEGIN_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
44
        # Overview (as implemented below):
        #   loop 0: one 4 x 16 output tile per pass; nc is consumed 16 columns at a time.
        #   loop 1: one group of 4 A row pointers per pass; ks is consumed 32 bytes at a time.
        #   loop 2: 16 bytes of K per row per pass, accumulated with UDOT.
        # v16-v31 hold the 32-bit accumulators of the tile. v12-v15 accumulate
        # UDOT(kernel_zero_point, A) for rows 0-3; those sums are subtracted from
        # the accumulators before the results are converted to float and scaled.

45        # Clamp C pointers
        # Rows at or beyond mr reuse the previous row's pointer (c1 = c0, c2 = c1, c3 = c2).
46        CMP     x0, 2                   // if mr < 2
47        LDR     x8, [sp, 8]             // Load a_offset
48        ADD     x16, x6, x7             // c1 = c0 + cm_stride
49        CSEL    x16, x6,  x16, LO       //   c1 = c0
50        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
51        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
52        ADD     x17, x16, x7            // c2 = c1 + cm_stride
53                                        // if mr <= 2
54        # Save d8-d15 to stack
        # The pre-indexed store lowers sp by 64, so the stack arguments are
        # found 64 bytes higher from here on (cn_stride at [sp, 64], params at [sp, 88]).
55        STP     d8, d9, [sp, -64]!
56
        # The CSEL below still consumes the flags of "CMP x0, 2"; nothing in between sets flags.
57        CSEL    x17, x16, x17, LS       //   c2 = c1
58        BIC     x2, x2, 3               // completes rounding kc up to a multiple of 4
59        STP     d10, d11, [sp, 16]
60        CMP     x0, 4                   // if mr < 4
61        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
62        STP     d12, d13, [sp, 32]
63        CSEL    x7,  x17, x7, LO        //   c3 = c2
64        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point, first 32-bit word of params broadcast to v7
65        STP     d14, d15, [sp, 48]
66
67        .p2align 3
680:
69        # Load initial bias from w into accumulators
        # 16 bias words are loaded for row 0 (v16, v20, v24, v28) and copied to rows 1-3.
70        LDP     q16, q20, [x5], 32
71
        # Clear the zero-point sums for this tile
72        MOVI    v12.4s, 0
73        MOVI    v13.4s, 0
74        MOVI    v14.4s, 0
75        MOVI    v15.4s, 0
76
77        MOV     v17.16b, v16.16b
78        MOV     v18.16b, v16.16b
79        LDP     q24, q28, [x5], 32
80        MOV     v19.16b, v16.16b
81        MOV     v21.16b, v20.16b
82        MOV     v22.16b, v20.16b
83        MOV     v23.16b, v20.16b
84        MOV     v25.16b, v24.16b
85        MOV     v26.16b, v24.16b
86        MOV     v27.16b, v24.16b
87        MOV     v29.16b, v28.16b
88        MOV     v30.16b, v28.16b
89        MOV     v31.16b, v28.16b
90
91        MOV     x9, x3                  // p = ks
92
93        .p2align 3
941:
95        # Load next 4 A pointers
96        LDP     x13, x14, [x4], 16
97        LDP     x15, x10, [x4], 16
98
        # A pointer equal to the "zero" pointer is used as is; any other pointer
        # gets a_offset added. ADD does not set flags, so each CSEL sees its CMP.
99        CMP     x13, x12                // if a0 == zero
100        ADD     x13, x13, x8            // a0 += a_offset
101        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
102        CMP     x14, x12                // if a1 == zero
103        ADD     x14, x14, x8            // a1 += a_offset
104        CSEL    x14, x12, x14, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
105        CMP     x15, x12                // if a2 == zero
106        ADD     x15, x15, x8            // a2 += a_offset
107        CSEL    x15, x12, x15, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
108        CMP     x10, x12                // if a3 == zero
109        ADD     x10, x10, x8            // a3 += a_offset
110        CSEL    x10, x12, x10, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset
111
112        # Is there at least 16 bytes for prologue/epilogue?
113        SUBS    x0, x2, 16              // k = kc - 16
114        B.LO    5f                      // kc < 16: go straight to the remainder code
115
116        # prologue - read A and B values for block 0 and 1
        # v8 is loaded whole; v9 is loaded as a low half (d9) plus x11, which
        # the first BLOCK 0 inserts as the high half.
117        LDR     q8,  [x5], 16
118        LDR     d0, [x13], 8
119        LDR     d1, [x14], 8
120        LDR     d2, [x15], 8
121        LDR     d3, [x10], 8
122        SUBS    x0, x0, 16              // is there 16 for main loop?
123        LDR     d9,  [x5], 8
124        LDR     x11, [x5], 8
125        # Is there at least 16 bytes for main loop?
126        B.LO    3f
127
128        # Main loop - 16 bytes of A in 4 groups.
129        # 4 row of 4 vectors wide = 16 UDOT instructions for 4 channels
130        # 4 LD64 for A
131        # 4 LD128 for W. = 2 LD64 + INS.
132        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.
        #
        # Each 128-bit B vector is built from two 64-bit loads: "LDR dN" writes
        # the low half and "LDR x11" followed by "INS vN.d[1], x11" writes the
        # high half. The vector being filled is always one that the current
        # BLOCK does not read.
        # In the first two groups A3 is in v3; in the last two groups rows 0-2
        # are in v4-v6 and A3 is in v0 (loaded by "LDR d0, [x10]").
133
134        .p2align 3
1352:
136        # BLOCK 0
137        UDOT    v16.4s,  v8.16b, v0.4b[0]
138        LDR     d10,  [x5], 8
139        UDOT    v17.4s,  v8.16b, v1.4b[0]
140        INS     v9.d[1], x11
141        UDOT    v18.4s,  v8.16b, v2.4b[0]
142        LDR     x11,  [x5], 8
143        UDOT    v19.4s,  v8.16b, v3.4b[0]
144
145        # BLOCK 1
146        UDOT    v20.4s,  v9.16b, v0.4b[0]
147        LDR     d11,  [x5], 8
148        UDOT    v21.4s,  v9.16b, v1.4b[0]
149        INS     v10.d[1], x11
150        UDOT    v22.4s,  v9.16b, v2.4b[0]
151        LDR     x11,  [x5], 8
152        UDOT    v23.4s,  v9.16b, v3.4b[0]
153
154        # BLOCK 2
155        UDOT    v24.4s, v10.16b, v0.4b[0]
156        LDR     d8,  [x5], 8
157        UDOT    v25.4s, v10.16b, v1.4b[0]
158        INS     v11.d[1], x11
159        UDOT    v26.4s, v10.16b, v2.4b[0]
160        LDR     x11,  [x5], 8
161        UDOT    v27.4s, v10.16b, v3.4b[0]
162
163        # BLOCK 3
164        UDOT    v28.4s, v11.16b, v0.4b[0]
165        LDR     d9,  [x5], 8
166        UDOT    v29.4s, v11.16b, v1.4b[0]
167        INS     v8.d[1], x11
168        UDOT    v30.4s, v11.16b, v2.4b[0]
169        LDR     x11,  [x5], 8
170        UDOT    v31.4s, v11.16b, v3.4b[0]
171
        # Zero-point sums for the 8 A bytes of each row held in v0-v3
172        UDOT    v12.2s, v7.8b, v0.8b
173        UDOT    v13.2s, v7.8b, v1.8b
174        UDOT    v14.2s, v7.8b, v2.8b
175        UDOT    v15.2s, v7.8b, v3.8b
176
177        # BLOCK 0
178        UDOT    v16.4s,  v8.16b, v0.4b[1]
179        LDR     d10,  [x5], 8
180        UDOT    v17.4s,  v8.16b, v1.4b[1]
181        INS     v9.d[1], x11
182        UDOT    v18.4s,  v8.16b, v2.4b[1]
183        LDR     x11,  [x5], 8
184        UDOT    v19.4s,  v8.16b, v3.4b[1]
185        LDR     d4, [x13], 8
186
187        # BLOCK 1
188        UDOT    v20.4s,  v9.16b, v0.4b[1]
189        LDR     d11,  [x5], 8
190        UDOT    v21.4s,  v9.16b, v1.4b[1]
191        INS     v10.d[1], x11
192        UDOT    v22.4s,  v9.16b, v2.4b[1]
193        LDR     x11,  [x5], 8
194        UDOT    v23.4s,  v9.16b, v3.4b[1]
195        LDR     d5, [x14], 8
196
197        # BLOCK 2
198        UDOT    v24.4s, v10.16b, v0.4b[1]
199        LDR     d8,  [x5], 8
200        UDOT    v25.4s, v10.16b, v1.4b[1]
201        INS     v11.d[1], x11
202        UDOT    v26.4s, v10.16b, v2.4b[1]
203        LDR     x11,  [x5], 8
204        UDOT    v27.4s, v10.16b, v3.4b[1]
205        LDR     d6, [x15], 8
206
207        # BLOCK 3
208        UDOT    v28.4s, v11.16b, v0.4b[1]
209        LDR     d9,  [x5], 8
210        UDOT    v29.4s, v11.16b, v1.4b[1]
211        INS     v8.d[1], x11
212        UDOT    v30.4s, v11.16b, v2.4b[1]
213        LDR     x11,  [x5], 8
214        UDOT    v31.4s, v11.16b, v3.4b[1]
215        LDR     d0, [x10], 8            // v0 now carries row 3 (A3) for the next two groups
216
217        # BLOCK 0
218        UDOT    v16.4s,  v8.16b, v4.4b[0]
219        LDR     d10,  [x5], 8
220        UDOT    v17.4s,  v8.16b, v5.4b[0]
221        INS     v9.d[1], x11
222        UDOT    v18.4s,  v8.16b, v6.4b[0]
223        LDR     x11,  [x5], 8
224        UDOT    v19.4s,  v8.16b, v0.4b[0]
225
226        # BLOCK 1
227        UDOT    v20.4s,  v9.16b, v4.4b[0]
228        LDR     d11,  [x5], 8
229        UDOT    v21.4s,  v9.16b, v5.4b[0]
230        INS     v10.d[1], x11
231        UDOT    v22.4s,  v9.16b, v6.4b[0]
232        LDR     x11,  [x5], 8
233        UDOT    v23.4s,  v9.16b, v0.4b[0]
234
235        # BLOCK 2
236        UDOT    v24.4s, v10.16b, v4.4b[0]
237        LDR     d8,  [x5], 8
238        UDOT    v25.4s, v10.16b, v5.4b[0]
239        INS     v11.d[1], x11
240        UDOT    v26.4s, v10.16b, v6.4b[0]
241        LDR     x11,  [x5], 8
242        UDOT    v27.4s, v10.16b, v0.4b[0]
243
244        # BLOCK 3
245        UDOT    v28.4s, v11.16b, v4.4b[0]
246        LDR     d9,  [x5], 8
247        UDOT    v29.4s, v11.16b, v5.4b[0]
248        INS     v8.d[1], x11
249        UDOT    v30.4s, v11.16b, v6.4b[0]
250        LDR     x11,  [x5], 8
251        UDOT    v31.4s, v11.16b, v0.4b[0]
252
        # Last group: A for the next iteration is preloaded into v1, v2, v3 and,
        # once v0 has been fully consumed, v0.
253        # BLOCK 0
254        UDOT    v16.4s,  v8.16b, v4.4b[1]
255        LDR     d10,  [x5], 8
256        UDOT    v17.4s,  v8.16b, v5.4b[1]
257        INS     v9.d[1], x11
258        UDOT    v18.4s,  v8.16b, v6.4b[1]
259        LDR     x11,  [x5], 8
260        UDOT    v19.4s,  v8.16b, v0.4b[1]
261        LDR     d1, [x14], 8
262
263        # BLOCK 1
264        UDOT    v20.4s,  v9.16b, v4.4b[1]
265        LDR     d11,  [x5], 8
266        UDOT    v21.4s,  v9.16b, v5.4b[1]
267        INS     v10.d[1], x11
268        UDOT    v22.4s,  v9.16b, v6.4b[1]
269        LDR     x11,  [x5], 8
270        UDOT    v23.4s,  v9.16b, v0.4b[1]
271        LDR     d2, [x15], 8
272
273        # BLOCK 2
274        UDOT    v24.4s, v10.16b, v4.4b[1]
275        LDR     d8,  [x5], 8            // First B values for block 0 and 1
276        UDOT    v25.4s, v10.16b, v5.4b[1]
277        INS     v11.d[1], x11
278        UDOT    v26.4s, v10.16b, v6.4b[1]
279        LDR     x11,  [x5], 8
280        UDOT    v27.4s, v10.16b, v0.4b[1]
281        LDR     d3, [x10], 8
282
283        # BLOCK 3 special
        # The two instructions that read v0 (row 3) are issued first so that
        # v0 can be reloaded with row 0 before the end of the block.
284        UDOT    v31.4s, v11.16b, v0.4b[1]
285        LDR     d9,  [x5], 8
286        UDOT    v15.2s, v7.8b, v0.8b    // free up v0 early
287        INS     v8.d[1], x11
288        UDOT    v28.4s, v11.16b, v4.4b[1]
289        LDR     x11,  [x5], 8
290        UDOT    v29.4s, v11.16b, v5.4b[1]
291        LDR     d0, [x13], 8
292        UDOT    v30.4s, v11.16b, v6.4b[1]
293        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below (UDOT does not set flags)
294
295        UDOT    v12.2s, v7.8b, v4.8b
296        UDOT    v13.2s, v7.8b, v5.8b
297        UDOT    v14.2s, v7.8b, v6.8b
298        B.HS    2b
299
300        # Epilogue.  Same as main loop but no preloads in final group
3013:
302        # BLOCK 0
303        UDOT    v16.4s,  v8.16b, v0.4b[0]
304        LDR     d10,  [x5], 8
305        UDOT    v17.4s,  v8.16b, v1.4b[0]
306        INS     v9.d[1], x11
307        UDOT    v18.4s,  v8.16b, v2.4b[0]
308        LDR     x11,  [x5], 8
309        UDOT    v19.4s,  v8.16b, v3.4b[0]
310
311        # BLOCK 1
312        UDOT    v20.4s,  v9.16b, v0.4b[0]
313        LDR     d11,  [x5], 8
314        UDOT    v21.4s,  v9.16b, v1.4b[0]
315        INS     v10.d[1], x11
316        UDOT    v22.4s,  v9.16b, v2.4b[0]
317        LDR     x11,  [x5], 8
318        UDOT    v23.4s,  v9.16b, v3.4b[0]
319
320        # BLOCK 2
321        UDOT    v24.4s, v10.16b, v0.4b[0]
322        LDR     d8,  [x5], 8
323        UDOT    v25.4s, v10.16b, v1.4b[0]
324        INS     v11.d[1], x11
325        UDOT    v26.4s, v10.16b, v2.4b[0]
326        LDR     x11,  [x5], 8
327        UDOT    v27.4s, v10.16b, v3.4b[0]
328
329        # BLOCK 3
330        UDOT    v28.4s, v11.16b, v0.4b[0]
331        LDR     d9,  [x5], 8
332        UDOT    v29.4s, v11.16b, v1.4b[0]
333        INS     v8.d[1], x11
334        UDOT    v30.4s, v11.16b, v2.4b[0]
335        LDR     x11,  [x5], 8
336        UDOT    v31.4s, v11.16b, v3.4b[0]
337
338        UDOT    v12.2s, v7.8b, v0.8b
339        UDOT    v13.2s, v7.8b, v1.8b
340        UDOT    v14.2s, v7.8b, v2.8b
341        UDOT    v15.2s, v7.8b, v3.8b
342
343        # BLOCK 0
344        UDOT    v16.4s,  v8.16b, v0.4b[1]
345        LDR     d10,  [x5], 8
346        UDOT    v17.4s,  v8.16b, v1.4b[1]
347        INS     v9.d[1], x11
348        UDOT    v18.4s,  v8.16b, v2.4b[1]
349        LDR     x11,  [x5], 8
350        UDOT    v19.4s,  v8.16b, v3.4b[1]
351        LDR     d4, [x13], 8
352
353        # BLOCK 1
354        UDOT    v20.4s,  v9.16b, v0.4b[1]
355        LDR     d11,  [x5], 8
356        UDOT    v21.4s,  v9.16b, v1.4b[1]
357        INS     v10.d[1], x11
358        UDOT    v22.4s,  v9.16b, v2.4b[1]
359        LDR     x11,  [x5], 8
360        UDOT    v23.4s,  v9.16b, v3.4b[1]
361        LDR     d5, [x14], 8
362
363        # BLOCK 2
364        UDOT    v24.4s, v10.16b, v0.4b[1]
365        LDR     d8,  [x5], 8
366        UDOT    v25.4s, v10.16b, v1.4b[1]
367        INS     v11.d[1], x11
368        UDOT    v26.4s, v10.16b, v2.4b[1]
369        LDR     x11,  [x5], 8
370        UDOT    v27.4s, v10.16b, v3.4b[1]
371        LDR     d6, [x15], 8
372
373        # BLOCK 3
374        UDOT    v28.4s, v11.16b, v0.4b[1]
375        LDR     d9,  [x5], 8
376        UDOT    v29.4s, v11.16b, v1.4b[1]
377        INS     v8.d[1], x11
378        UDOT    v30.4s, v11.16b, v2.4b[1]
379        LDR     x11,  [x5], 8
380        UDOT    v31.4s, v11.16b, v3.4b[1]
381        LDR     d0, [x10], 8            // v0 now carries row 3 (A3)
382
383        # BLOCK 0
384        UDOT    v16.4s,  v8.16b, v4.4b[0]
385        LDR     d10,  [x5], 8
386        UDOT    v17.4s,  v8.16b, v5.4b[0]
387        INS     v9.d[1], x11
388        UDOT    v18.4s,  v8.16b, v6.4b[0]
389        LDR     x11,  [x5], 8
390        UDOT    v19.4s,  v8.16b, v0.4b[0]
391
392        # BLOCK 1
393        UDOT    v20.4s,  v9.16b, v4.4b[0]
394        LDR     d11,  [x5], 8
395        UDOT    v21.4s,  v9.16b, v5.4b[0]
396        INS     v10.d[1], x11
397        UDOT    v22.4s,  v9.16b, v6.4b[0]
398        LDR     x11,  [x5], 8
399        UDOT    v23.4s,  v9.16b, v0.4b[0]
400
401        # BLOCK 2
402        UDOT    v24.4s, v10.16b, v4.4b[0]
403        LDR     d8,  [x5], 8
404        UDOT    v25.4s, v10.16b, v5.4b[0]
405        INS     v11.d[1], x11
406        UDOT    v26.4s, v10.16b, v6.4b[0]
407        LDR     x11,  [x5], 8
408        UDOT    v27.4s, v10.16b, v0.4b[0]
409
410        # BLOCK 3
411        UDOT    v28.4s, v11.16b, v4.4b[0]
412        LDR     d9,  [x5], 8
413        UDOT    v29.4s, v11.16b, v5.4b[0]
414        INS     v8.d[1], x11
415        UDOT    v30.4s, v11.16b, v6.4b[0]
416        LDR     x11,  [x5], 8
417        UDOT    v31.4s, v11.16b, v0.4b[0]
418
        # Final group: only the B halves still needed (v10, v11) are loaded;
        # no A values and no further B vectors are fetched.
419        # BLOCK 0
420        UDOT    v16.4s,  v8.16b, v4.4b[1]
421        LDR     d10,  [x5], 8
422        UDOT    v17.4s,  v8.16b, v5.4b[1]
423        INS     v9.d[1], x11
424        UDOT    v18.4s,  v8.16b, v6.4b[1]
425        LDR     x11,  [x5], 8
426        UDOT    v19.4s,  v8.16b, v0.4b[1]
427
428        # BLOCK 1
429        UDOT    v20.4s,  v9.16b, v4.4b[1]
430        LDR     d11,  [x5], 8
431        UDOT    v21.4s,  v9.16b, v5.4b[1]
432        INS     v10.d[1], x11
433        UDOT    v22.4s,  v9.16b, v6.4b[1]
434        LDR     x11,  [x5], 8
435        UDOT    v23.4s,  v9.16b, v0.4b[1]
436
437        # BLOCK 2
438        UDOT    v24.4s, v10.16b, v4.4b[1]
439        UDOT    v25.4s, v10.16b, v5.4b[1]
440        INS     v11.d[1], x11
441        UDOT    v26.4s, v10.16b, v6.4b[1]
442        UDOT    v27.4s, v10.16b, v0.4b[1]
443
444        # BLOCK 3
445        UDOT    v28.4s, v11.16b, v4.4b[1]
446        UDOT    v29.4s, v11.16b, v5.4b[1]
447        UDOT    v30.4s, v11.16b, v6.4b[1]
448        UDOT    v31.4s, v11.16b, v0.4b[1]
449
450        UDOT    v12.2s, v7.8b, v4.8b
451        UDOT    v13.2s, v7.8b, v5.8b
452        UDOT    v14.2s, v7.8b, v6.8b
453        UDOT    v15.2s, v7.8b, v0.8b
454
455        # Is there a remainder?- 4 to 12 bytes of A
        # Only multiples of 16 were subtracted from x0, so its low 4 bits still equal kc & 15.
456        TST     x0, 15
457        B.NE    5f
458
4594:
460        # ks loop
461        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
462        B.HI    1b
463
        # Add the two 32-bit partial sums of each row (v0 = rows 0,1; v1 = rows 2,3)
        # and broadcast each row's total across a full vector.
464        ADDP    v0.2s, v12.2s, v13.2s
465        ADDP    v1.2s, v14.2s, v15.2s
466        LDR     x11, [sp, 88]           // Reload params (x11 was used as a load temp; 24 + 64 bytes of saved d8-d15)
467        DUP     v12.4s, v0.s[0]
468        DUP     v13.4s, v0.s[1]
469        DUP     v14.4s, v1.s[0]
470        DUP     v15.4s, v1.s[1]
471        ADD     x11, x11, 4             // skip the kernel_zero_point word already loaded into v7
472
473        # Subtract zero point from accumulators
474        SUB     v16.4s, v16.4s, v12.4s
475        SUB     v17.4s, v17.4s, v13.4s
476        SUB     v18.4s, v18.4s, v14.4s
477        SUB     v19.4s, v19.4s, v15.4s
478        SUB     v20.4s, v20.4s, v12.4s
479        SUB     v21.4s, v21.4s, v13.4s
480        SUB     v22.4s, v22.4s, v14.4s
481        SUB     v23.4s, v23.4s, v15.4s
482        SUB     v24.4s, v24.4s, v12.4s
483        SUB     v25.4s, v25.4s, v13.4s
484        SUB     v26.4s, v26.4s, v14.4s
485        SUB     v27.4s, v27.4s, v15.4s
486        SUB     v28.4s, v28.4s, v12.4s
487        SUB     v29.4s, v29.4s, v13.4s
488        SUB     v30.4s, v30.4s, v14.4s
489        SUB     v31.4s, v31.4s, v15.4s
490
        # Convert the signed 32-bit accumulators to single-precision float
491        SCVTF   v16.4s, v16.4s
492        SCVTF   v17.4s, v17.4s
493        # Apply params - scale, bias and clamp
494        LD1R    {v4.4s}, [x11], 4       // v4 = 32-bit value broadcast from params, used as the FMUL multiplier (scale)
495        SCVTF   v18.4s, v18.4s
496        SCVTF   v19.4s, v19.4s
497        SCVTF   v20.4s, v20.4s
498        SCVTF   v21.4s, v21.4s
499        SCVTF   v22.4s, v22.4s
500        SCVTF   v23.4s, v23.4s
501        SCVTF   v24.4s, v24.4s
502        SCVTF   v25.4s, v25.4s
503        SCVTF   v26.4s, v26.4s
504        SCVTF   v27.4s, v27.4s
505        SCVTF   v28.4s, v28.4s
506        SCVTF   v29.4s, v29.4s
507        SCVTF   v30.4s, v30.4s
508        SCVTF   v31.4s, v31.4s
509
        # Multiply every element by the same scale
510        FMUL    v16.4s, v16.4s, v4.4s
511        FMUL    v17.4s, v17.4s, v4.4s
512        FMUL    v18.4s, v18.4s, v4.4s
513        FMUL    v19.4s, v19.4s, v4.4s
514        FMUL    v20.4s, v20.4s, v4.4s
515        FMUL    v21.4s, v21.4s, v4.4s
516        FMUL    v22.4s, v22.4s, v4.4s
517        FMUL    v23.4s, v23.4s, v4.4s
518        FMUL    v24.4s, v24.4s, v4.4s
519        FMUL    v25.4s, v25.4s, v4.4s
520        FMUL    v26.4s, v26.4s, v4.4s
521        FMUL    v27.4s, v27.4s, v4.4s
522        FMUL    v28.4s, v28.4s, v4.4s
523        FMUL    v29.4s, v29.4s, v4.4s
524        FMUL    v30.4s, v30.4s, v4.4s
525        FMUL    v31.4s, v31.4s, v4.4s
526
        # Convert back to signed 32-bit integers, rounding to nearest with ties to even
527        FCVTNS  v16.4s, v16.4s
528        FCVTNS  v17.4s, v17.4s
529        FCVTNS  v18.4s, v18.4s
530        FCVTNS  v19.4s, v19.4s
531        FCVTNS  v20.4s, v20.4s
532        FCVTNS  v21.4s, v21.4s
533        FCVTNS  v22.4s, v22.4s
534        FCVTNS  v23.4s, v23.4s
535        FCVTNS  v24.4s, v24.4s
536        FCVTNS  v25.4s, v25.4s
537        FCVTNS  v26.4s, v26.4s
538        FCVTNS  v27.4s, v27.4s
539        FCVTNS  v28.4s, v28.4s
540        FCVTNS  v29.4s, v29.4s
541        FCVTNS  v30.4s, v30.4s
542        FCVTNS  v31.4s, v31.4s
543
        # Saturating narrow 32 -> 16 bits. Columns 0-3 and 8-11 fill the low
        # halves here; SQXTN2 below adds columns 4-7 and 12-15 as the high halves.
544        SQXTN   v16.4h, v16.4s
545        SQXTN   v17.4h, v17.4s
546        SQXTN   v18.4h, v18.4s
547        SQXTN   v19.4h, v19.4s
548        SQXTN   v24.4h, v24.4s
549        SQXTN   v25.4h, v25.4s
550        SQXTN   v26.4h, v26.4s
551        SQXTN   v27.4h, v27.4s
552        LD1R    {v6.8h}, [x11], 2        // add bias (16-bit value added by SQADD below; presumably the output zero point)
553
554        SQXTN2  v16.8h, v20.4s
555        SQXTN2  v17.8h, v21.4s
556        SQXTN2  v18.8h, v22.4s
557        SQXTN2  v19.8h, v23.4s
558        SQXTN2  v24.8h, v28.4s
559        SQXTN2  v25.8h, v29.4s
560        SQXTN2  v26.8h, v30.4s
561        SQXTN2  v27.8h, v31.4s
562
563        SQADD   v16.8h, v16.8h, v6.8h
564        SQADD   v17.8h, v17.8h, v6.8h
565        SQADD   v18.8h, v18.8h, v6.8h
566        SQADD   v19.8h, v19.8h, v6.8h
567        SQADD   v24.8h, v24.8h, v6.8h
568        SQADD   v25.8h, v25.8h, v6.8h
569        SQADD   v26.8h, v26.8h, v6.8h
570        SQADD   v27.8h, v27.8h, v6.8h
571        LD1R    {v4.16b}, [x11], 1      // clamp min value
572
        # Saturating narrow signed 16 -> unsigned 8 bits; v0-v3 end up holding
        # the 16 output bytes of rows 0-3.
573        SQXTUN  v0.8b, v16.8h
574        SQXTUN  v1.8b, v17.8h
575        SQXTUN  v2.8b, v18.8h
576        SQXTUN  v3.8b, v19.8h
577        LD1R    {v5.16b}, [x11]         // clamp max value
578        SQXTUN2 v0.16b, v24.8h
579        SQXTUN2 v1.16b, v25.8h
580        SQXTUN2 v2.16b, v26.8h
581        SQXTUN2 v3.16b, v27.8h
582        LDR     x0, [sp, 64]            // Load cn_stride
583
584        UMAX    v0.16b, v0.16b, v4.16b
585        UMAX    v1.16b, v1.16b, v4.16b
586        UMAX    v2.16b, v2.16b, v4.16b
587        UMAX    v3.16b, v3.16b, v4.16b
588        SUBS    x1, x1, 16              // nc -= 16; flags drive both B.LO 7f and B.HI 0b below
589        UMIN    v0.16b, v0.16b, v5.16b
590        UMIN    v1.16b, v1.16b, v5.16b
591        UMIN    v2.16b, v2.16b, v5.16b
592        UMIN    v3.16b, v3.16b, v5.16b
593        B.LO    7f
594
595        # Store full 4 x 16
        # Rows are written highest first, so when C pointers were clamped to
        # alias a lower row, the lower row's store is the last one to land.
        # Each pointer is post-incremented by cn_stride (x0).
596        ST1     {v3.16b},  [x7], x0
597        ST1     {v2.16b}, [x17], x0
598        ST1     {v1.16b}, [x16], x0
599        ST1     {v0.16b},  [x6], x0
600
601        SUB     x4, x4, x3              // a -= ks
602
603        # nc loop
        # ST1 and SUB do not set flags; B.HI still tests the result of "SUBS x1, x1, 16".
604        B.HI    0b
605
606        # Restore d8-d15 from stack
607        LDP     d14, d15, [sp, 48]
608        LDP     d12, d13, [sp, 32]
609        LDP     d10, d11, [sp, 16]
610        LDP     d8, d9, [sp], 64
611        RET
612
613         # Remainder- 4 to 12 bytes of A
        # Reached either directly when kc < 16 or after the epilogue when
        # kc & 15 is non-zero. Bit 3 of x0 selects an 8-byte step, bit 2 a 4-byte step.
614        .p2align 3
6155:
616        TBZ     x0, 3, 6f               // bit 3 clear: skip the 8-byte step
617
618        LDR     d0, [x13], 8
619        LDP     q8,  q9,  [x5], 32
620        LDR     d1, [x14], 8
621        LDR     d2, [x15], 8
622        LDR     d3, [x10], 8
623        LDP     q10, q11, [x5], 32
624        UDOT    v12.2s, v7.8b, v0.8b
625        UDOT    v13.2s, v7.8b, v1.8b
626        UDOT    v14.2s, v7.8b, v2.8b
627        UDOT    v15.2s, v7.8b, v3.8b
628        UDOT    v16.4s,  v8.16b, v0.4b[0]
629        UDOT    v17.4s,  v8.16b, v1.4b[0]
630        UDOT    v18.4s,  v8.16b, v2.4b[0]
631        UDOT    v19.4s,  v8.16b, v3.4b[0]
632        UDOT    v20.4s,  v9.16b, v0.4b[0]
633        UDOT    v21.4s,  v9.16b, v1.4b[0]
634        UDOT    v22.4s,  v9.16b, v2.4b[0]
635        UDOT    v23.4s,  v9.16b, v3.4b[0]
636        UDOT    v24.4s, v10.16b, v0.4b[0]
637        UDOT    v25.4s, v10.16b, v1.4b[0]
638        UDOT    v26.4s, v10.16b, v2.4b[0]
639        UDOT    v27.4s, v10.16b, v3.4b[0]
640        UDOT    v28.4s, v11.16b, v0.4b[0]
641        UDOT    v29.4s, v11.16b, v1.4b[0]
642        UDOT    v30.4s, v11.16b, v2.4b[0]
643        UDOT    v31.4s, v11.16b, v3.4b[0]
644        LDP     q8,  q9,  [x5], 32
645        LDP     q10, q11, [x5], 32
646        UDOT    v16.4s,  v8.16b, v0.4b[1]
647        UDOT    v17.4s,  v8.16b, v1.4b[1]
648        UDOT    v18.4s,  v8.16b, v2.4b[1]
649        UDOT    v19.4s,  v8.16b, v3.4b[1]
650        UDOT    v20.4s,  v9.16b, v0.4b[1]
651        UDOT    v21.4s,  v9.16b, v1.4b[1]
652        UDOT    v22.4s,  v9.16b, v2.4b[1]
653        UDOT    v23.4s,  v9.16b, v3.4b[1]
654        UDOT    v24.4s, v10.16b, v0.4b[1]
655        UDOT    v25.4s, v10.16b, v1.4b[1]
656        UDOT    v26.4s, v10.16b, v2.4b[1]
657        UDOT    v27.4s, v10.16b, v3.4b[1]
658        UDOT    v28.4s, v11.16b, v0.4b[1]
659        UDOT    v29.4s, v11.16b, v1.4b[1]
660        UDOT    v30.4s, v11.16b, v2.4b[1]
661        UDOT    v31.4s, v11.16b, v3.4b[1]
662        TBZ     x0, 2, 4b               // bit 2 clear: no 4-byte step left, back to the ks loop
6636:
        # 4-byte step. "LDR sN" writes 4 bytes and zeroes the rest of the
        # register, so the 8-byte zero-point UDOTs below only add these 4 bytes.
664        LDR     s0, [x13], 4
665        LDP     q8,  q9,  [x5], 32
666        LDR     s1, [x14], 4
667        LDR     s2, [x15], 4
668        LDR     s3, [x10], 4
669        LDP     q10, q11, [x5], 32
670        UDOT    v12.2s, v7.8b, v0.8b
671        UDOT    v13.2s, v7.8b, v1.8b
672        UDOT    v14.2s, v7.8b, v2.8b
673        UDOT    v15.2s, v7.8b, v3.8b
674        UDOT    v16.4s,  v8.16b, v0.4b[0]
675        UDOT    v17.4s,  v8.16b, v1.4b[0]
676        UDOT    v18.4s,  v8.16b, v2.4b[0]
677        UDOT    v19.4s,  v8.16b, v3.4b[0]
678        UDOT    v20.4s,  v9.16b, v0.4b[0]
679        UDOT    v21.4s,  v9.16b, v1.4b[0]
680        UDOT    v22.4s,  v9.16b, v2.4b[0]
681        UDOT    v23.4s,  v9.16b, v3.4b[0]
682        UDOT    v24.4s, v10.16b, v0.4b[0]
683        UDOT    v25.4s, v10.16b, v1.4b[0]
684        UDOT    v26.4s, v10.16b, v2.4b[0]
685        UDOT    v27.4s, v10.16b, v3.4b[0]
686        UDOT    v28.4s, v11.16b, v0.4b[0]
687        UDOT    v29.4s, v11.16b, v1.4b[0]
688        UDOT    v30.4s, v11.16b, v2.4b[0]
689        UDOT    v31.4s, v11.16b, v3.4b[0]
690        B       4b
691
692        # Store odd width
        # Reached when fewer than 16 columns remain. Bits 3..0 of x1 select
        # stores of 8, 4, 2 and 1 bytes per row; after each partial store the
        # not-yet-written bytes are moved down to the bottom of v0-v3.
693        .p2align 3
6947:
695        TBZ     x1, 3, 8f
696        STR     d3, [x7], 8
697        STR     d2, [x17], 8
698        DUP     d3, v3.d[1]
699        DUP     d2, v2.d[1]
700        STR     d1, [x16], 8
701        STR     d0, [x6], 8
702        DUP     d1, v1.d[1]
703        DUP     d0, v0.d[1]
7048:
705        TBZ     x1, 2, 9f
706        STR     s3, [x7], 4
707        STR     s2, [x17], 4
708        DUP     s3, v3.s[1]
709        DUP     s2, v2.s[1]
710        STR     s1, [x16], 4
711        STR     s0, [x6], 4
712        DUP     s1, v1.s[1]
713        DUP     s0, v0.s[1]
7149:
715        TBZ     x1, 1, 10f
716        STR     h3, [x7], 2
717        STR     h2, [x17], 2
718        DUP     h3, v3.h[1]
719        DUP     h2, v2.h[1]
720        STR     h1, [x16], 2
721        STR     h0, [x6], 2
722        DUP     h1, v1.h[1]
723        DUP     h0, v0.h[1]
72410:
725        TBZ     x1, 0, 11f
726        STR     b3, [x7]
727        STR     b2, [x17]
728        STR     b1, [x16]
729        STR     b0, [x6]
73011:
731        # Restore d8-d15 from stack
732        LDP     d14, d15, [sp, 48]
733        LDP     d12, d13, [sp, 32]
734        LDP     d10, d11, [sp, 16]
735        LDP     d8, d9, [sp], 64
736        RET
737
738END_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
739
740#ifdef __ELF__
741.section ".note.GNU-stack","",%progbits
742#endif
743