xref: /aosp_15_r20/external/XNNPACK/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> (x0)
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_minmax_params params [sp + 24] -> (x11)
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0  v4
31# A1  x14  v1  v5
32# A2  x15  v2  v6
33# A3  x10  v3  v7
34# B    x5  v8  v9 v10 v11
35# C0   x6 v16 v20 v24 v28
36# C1  x16 v17 v21 v25 v29
37# C2  x17 v18 v22 v26 v30
38# C3   x7 v19 v23 v27 v31
39# unused v12 v13 v14 v15
40
41# x11 temp for Cortex-A55 loads
42
43BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
// Entry sequence: derives the four output row pointers c0..c3 (x6, x16, x17,
// x7), collapsing unused rows onto the previous row when mr < 4; loads the
// stack-passed a_offset, zero and params arguments; rounds kc up to a
// multiple of 4; and saves d8-d11, which this kernel uses for B vectors.
44
45        # Clamp C pointers
46        CMP     x0, 2                   // if mr < 2
47        LDR     x8, [sp, 8]             // Load a_offset
48        ADD     x16, x6, x7             // c1 = c0 + cm_stride
49        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
50        CSEL    x16, x6,  x16, LO       //   c1 = c0
51        ADD     x2, x2, 3               // kc = (kc + 3) & ~3 (ADD here, BIC below)
52        STP     d8,  d9, [sp, -32]!     // Save d8-d11 on stack (sp -= 32)
53
54        ADD     x17, x16, x7            // c2 = c1 + cm_stride
55        STP     d10, d11, [sp, 16]      // second half of the 32-byte save area
56                                        // if mr <= 2 (flags still from CMP x0, 2)
57        CSEL    x17, x16, x17, LS       //   c2 = c1
58        BIC     x2, x2, 3               // clear low 2 bits: kc is now a multiple of 4
59
60        CMP     x0, 4                   // if mr < 4
61        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
62        CSEL    x7,  x17, x7, LO        //   c3 = c2
63
64        .p2align 3
650:
// nc loop head (re-entered by the B.HI 0b after a full 4x16 store).
// Reads 64 bytes from w (x5) as four 128-bit bias vectors and copies each
// one into the accumulators of all four rows:
//   v16 -> v17 v18 v19,  v20 -> v21 v22 v23,
//   v24 -> v25 v26 v27,  v28 -> v29 v30 v31
66        # Load initial bias from w into accumulators
67        LDP     q16, q20, [x5], 32
68        MOV     v17.16b, v16.16b
69        MOV     v18.16b, v16.16b
70        LDP     q24, q28, [x5], 32
71        MOV     v19.16b, v16.16b
72        MOV     v21.16b, v20.16b
73        MOV     v22.16b, v20.16b
74        MOV     v23.16b, v20.16b
75        MOV     v25.16b, v24.16b
76        MOV     v26.16b, v24.16b
77        MOV     v27.16b, v24.16b
78        MOV     v29.16b, v28.16b
79        MOV     v30.16b, v28.16b
80        MOV     v31.16b, v28.16b
81        MOV     x9, x3                  // p = ks (byte counter for the ks loop)
82
83        .p2align 3
841:
// ks loop head: fetches the next four A row pointers from the indirection
// buffer (x4), applies a_offset to every pointer that is not the `zero`
// buffer, then either starts the software-pipelined 16-byte path or, when
// kc < 16, jumps straight to the remainder code at 5.
85        # Load next 4 A pointers
86        LDP     x13, x14, [x4], 16
87        LDP     x15, x10, [x4], 16
88
// Each CMP tests the raw pointer before a_offset is added; ADD (not ADDS)
// leaves the flags intact for the CSEL that follows.
89        CMP     x13, x12                // if a0 == zero
90        ADD     x13, x13, x8            // a0 += a_offset
91        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
92        CMP     x14, x12                // if a1 == zero
93        ADD     x14, x14, x8            // a1 += a_offset
94        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
95        CMP     x15, x12                // if a2 == zero
96        ADD     x15, x15, x8            // a2 += a_offset
97        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
98        CMP     x10, x12                // if a3 == zero
99        ADD     x10, x10, x8            // a3 += a_offset
100        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset
101
102        # Is there at least 16 bytes for prologue/epilogue?
103        SUBS    x0, x2, 16              // k = kc - 16
104        B.LO    5f                      // kc < 16: remainder path only
105
106        # prologue - read A and B values for block 0 and 1
107        LDR     d0, [x13], 8            // first 8 bytes of A0
108        LDR     q8,  [x5], 16           // B vector for block 0
109        LDR     d1, [x14], 8            // first 8 bytes of A1
110        LDR     d2, [x15], 8            // first 8 bytes of A2
111        LDR     d3, [x10], 8            // first 8 bytes of A3
112        SUBS    x0, x0, 16              // is there 16 for main loop?
113        LDR     d9,  [x5], 8            // low half of B vector for block 1
114        LDR     x11,  [x5], 8           // high half, merged later by INS v9.d[1], x11
115        # Is there at least 16 bytes for main loop?
116        B.LO    3f                      // no: go directly to the epilogue
117
118        # Main loop - 16 bytes of A in 4 groups.
119        # 4 row of 4 vectors wide = 16 sdot instructions for 4 channels
120        # 4 LD64 for A
121        # 4 LD128 for W. = 2 LD64 + INS.
122        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
// Structure of one iteration:
//   group 1: v0-v3 lane [0]   (also loads v4-v7, the next 8 bytes of A)
//   group 2: v0-v3 lane [1]
//   group 3: v4-v7 lane [0]   (also reloads v0-v3 for the next iteration)
//   group 4: v4-v7 lane [1]
// Within a group, BLOCK n accumulates into one column group for all 4 rows:
//   BLOCK 0 -> v16-v19 (B in v8),   BLOCK 1 -> v20-v23 (B in v9),
//   BLOCK 2 -> v24-v27 (B in v10),  BLOCK 3 -> v28-v31 (B in v11).
// Each B vector is fetched as LDR dN (low 64 bits) + LDR x11 (high 64 bits)
// and completed by INS vN.d[1], x11 before the block that consumes it.
// x11 is therefore not the params pointer while this code runs.
123
124        .p2align 3
1252:
126        # BLOCK 0
127        SDOT    v16.4s,  v8.16b, v0.4b[0]
128        LDR     d10,  [x5], 8           // low half of B for BLOCK 2
129        SDOT    v17.4s,  v8.16b, v1.4b[0]
130        INS     v9.d[1], x11            // complete v9 for BLOCK 1
131        SDOT    v18.4s,  v8.16b, v2.4b[0]
132        LDR     x11,  [x5], 8           // high half of B for BLOCK 2
133        SDOT    v19.4s,  v8.16b, v3.4b[0]
134        LDR     d4,  [x13], 8           // next 8 bytes of A0 (used in groups 3-4)
135
136        # BLOCK 1
137        SDOT    v20.4s,  v9.16b, v0.4b[0]
138        LDR     d11,  [x5], 8
139        SDOT    v21.4s,  v9.16b, v1.4b[0]
140        INS     v10.d[1], x11
141        SDOT    v22.4s,  v9.16b, v2.4b[0]
142        LDR     x11,  [x5], 8
143        SDOT    v23.4s,  v9.16b, v3.4b[0]
144        LDR     d5, [x14], 8            // next 8 bytes of A1
145
146        # BLOCK 2
147        SDOT    v24.4s, v10.16b, v0.4b[0]
148        LDR     d8,  [x5], 8
149        SDOT    v25.4s, v10.16b, v1.4b[0]
150        INS     v11.d[1], x11
151        SDOT    v26.4s, v10.16b, v2.4b[0]
152        LDR     x11,  [x5], 8
153        SDOT    v27.4s, v10.16b, v3.4b[0]
154        LDR     d6, [x15], 8            // next 8 bytes of A2
155
156        # BLOCK 3
157        SDOT    v28.4s, v11.16b, v0.4b[0]
158        LDR     d9,  [x5], 8
159        SDOT    v29.4s, v11.16b, v1.4b[0]
160        INS     v8.d[1], x11
161        SDOT    v30.4s, v11.16b, v2.4b[0]
162        LDR     x11,  [x5], 8
163        SDOT    v31.4s, v11.16b, v3.4b[0]
164        LDR     d7,  [x10], 8           // next 8 bytes of A3
165
166        # BLOCK 0
167        SDOT    v16.4s,  v8.16b, v0.4b[1]
168        LDR     d10,  [x5], 8
169        SDOT    v17.4s,  v8.16b, v1.4b[1]
170        INS     v9.d[1], x11
171        SDOT    v18.4s,  v8.16b, v2.4b[1]
172        LDR     x11,  [x5], 8
173        SDOT    v19.4s,  v8.16b, v3.4b[1]
174
175        # BLOCK 1
176        SDOT    v20.4s,  v9.16b, v0.4b[1]
177        LDR     d11,  [x5], 8
178        SDOT    v21.4s,  v9.16b, v1.4b[1]
179        INS     v10.d[1], x11
180        SDOT    v22.4s,  v9.16b, v2.4b[1]
181        LDR     x11,  [x5], 8
182        SDOT    v23.4s,  v9.16b, v3.4b[1]
183
184        # BLOCK 2
185        SDOT    v24.4s, v10.16b, v0.4b[1]
186        LDR     d8,  [x5], 8
187        SDOT    v25.4s, v10.16b, v1.4b[1]
188        INS     v11.d[1], x11
189        SDOT    v26.4s, v10.16b, v2.4b[1]
190        LDR     x11,  [x5], 8
191        SDOT    v27.4s, v10.16b, v3.4b[1]
192
193        # BLOCK 3
194        SDOT    v28.4s, v11.16b, v0.4b[1]
195        LDR     d9,  [x5], 8
196        SDOT    v29.4s, v11.16b, v1.4b[1]
197        INS     v8.d[1], x11
198        SDOT    v30.4s, v11.16b, v2.4b[1]
199        LDR     x11,  [x5], 8
200        SDOT    v31.4s, v11.16b, v3.4b[1]
201
202        # BLOCK 0
203        SDOT    v16.4s,  v8.16b, v4.4b[0]
204        LDR     d10,  [x5], 8
205        SDOT    v17.4s,  v8.16b, v5.4b[0]
206        INS     v9.d[1], x11
207        SDOT    v18.4s,  v8.16b, v6.4b[0]
208        LDR     x11,  [x5], 8
209        SDOT    v19.4s,  v8.16b, v7.4b[0]
210        LDR     d0,  [x13], 8           // preload A0 for the next iteration / epilogue
211
212        # BLOCK 1
213        SDOT    v20.4s,  v9.16b, v4.4b[0]
214        LDR     d11,  [x5], 8
215        SDOT    v21.4s,  v9.16b, v5.4b[0]
216        INS     v10.d[1], x11
217        SDOT    v22.4s,  v9.16b, v6.4b[0]
218        LDR     x11,  [x5], 8
219        SDOT    v23.4s,  v9.16b, v7.4b[0]
220        LDR     d1, [x14], 8            // preload A1
221
222        # BLOCK 2
223        SDOT    v24.4s, v10.16b, v4.4b[0]
224        LDR     d8,  [x5], 8
225        SDOT    v25.4s, v10.16b, v5.4b[0]
226        INS     v11.d[1], x11
227        SDOT    v26.4s, v10.16b, v6.4b[0]
228        LDR     x11,  [x5], 8
229        SDOT    v27.4s, v10.16b, v7.4b[0]
230        LDR     d2, [x15], 8            // preload A2
231
232        # BLOCK 3
233        SDOT    v28.4s, v11.16b, v4.4b[0]
234        LDR     d9,  [x5], 8
235        SDOT    v29.4s, v11.16b, v5.4b[0]
236        INS     v8.d[1], x11
237        SDOT    v30.4s, v11.16b, v6.4b[0]
238        LDR     x11,  [x5], 8
239        SDOT    v31.4s, v11.16b, v7.4b[0]
240        LDR     d3,  [x10], 8           // preload A3
241
242        # BLOCK 0
243        SDOT    v16.4s,  v8.16b, v4.4b[1]
244        LDR     d10,  [x5], 8
245        SDOT    v17.4s,  v8.16b, v5.4b[1]
246        INS     v9.d[1], x11
247        SDOT    v18.4s,  v8.16b, v6.4b[1]
248        LDR     x11,  [x5], 8
249        SDOT    v19.4s,  v8.16b, v7.4b[1]
250
251        # BLOCK 1
252        SDOT    v20.4s,  v9.16b, v4.4b[1]
253        LDR     d11,  [x5], 8
254        SDOT    v21.4s,  v9.16b, v5.4b[1]
255        INS     v10.d[1], x11
256        SDOT    v22.4s,  v9.16b, v6.4b[1]
257        LDR     x11,  [x5], 8
258        SDOT    v23.4s,  v9.16b, v7.4b[1]
259
260        # BLOCK 2
261        SDOT    v24.4s, v10.16b, v4.4b[1]
262        LDR     d8,  [x5], 8            // First B values for block 0 and 1
263        SDOT    v25.4s, v10.16b, v5.4b[1]
264        INS     v11.d[1], x11
265        SDOT    v26.4s, v10.16b, v6.4b[1]
266        LDR     x11,  [x5], 8
267        SDOT    v27.4s, v10.16b, v7.4b[1]
268        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
269
270        # BLOCK 3
271        SDOT    v28.4s, v11.16b, v4.4b[1]
272        LDR     d9,  [x5], 8
273        SDOT    v29.4s, v11.16b, v5.4b[1]
274        INS     v8.d[1], x11
275        SDOT    v30.4s, v11.16b, v6.4b[1]
276        LDR     x11,  [x5], 8
277        SDOT    v31.4s, v11.16b, v7.4b[1]
278        B.HS    2b                      // loop while the SUBS above did not borrow
279
280        # Epilogue.  Same as main loop but no preloads in final group
// Consumes the last full 16 bytes of A per row. Groups 1-3 match the main
// loop except that group 3 does not reload v0-v3; group 4 stops fetching B
// once v11 has been completed. The freed slots compute the kc remainder in
// x0 and restore the params pointer into x11.
2813:
282        # BLOCK 0
283        SDOT    v16.4s,  v8.16b, v0.4b[0]
284        LDR     d10,  [x5], 8
285        SDOT    v17.4s,  v8.16b, v1.4b[0]
286        INS     v9.d[1], x11
287        SDOT    v18.4s,  v8.16b, v2.4b[0]
288        LDR     x11,  [x5], 8
289        SDOT    v19.4s,  v8.16b, v3.4b[0]
290        LDR     d4,  [x13], 8           // second 8 bytes of A0
291
292        # BLOCK 1
293        SDOT    v20.4s,  v9.16b, v0.4b[0]
294        LDR     d11,  [x5], 8
295        SDOT    v21.4s,  v9.16b, v1.4b[0]
296        INS     v10.d[1], x11
297        SDOT    v22.4s,  v9.16b, v2.4b[0]
298        LDR     x11,  [x5], 8
299        SDOT    v23.4s,  v9.16b, v3.4b[0]
300        LDR     d5, [x14], 8            // second 8 bytes of A1
301
302        # BLOCK 2
303        SDOT    v24.4s, v10.16b, v0.4b[0]
304        LDR     d8,  [x5], 8
305        SDOT    v25.4s, v10.16b, v1.4b[0]
306        INS     v11.d[1], x11
307        SDOT    v26.4s, v10.16b, v2.4b[0]
308        LDR     x11,  [x5], 8
309        SDOT    v27.4s, v10.16b, v3.4b[0]
310        LDR     d6, [x15], 8            // second 8 bytes of A2
311
312        # BLOCK 3
313        SDOT    v28.4s, v11.16b, v0.4b[0]
314        LDR     d9,  [x5], 8
315        SDOT    v29.4s, v11.16b, v1.4b[0]
316        INS     v8.d[1], x11
317        SDOT    v30.4s, v11.16b, v2.4b[0]
318        LDR     x11,  [x5], 8
319        SDOT    v31.4s, v11.16b, v3.4b[0]
320        LDR     d7,  [x10], 8           // second 8 bytes of A3
321
322        # BLOCK 0
323        SDOT    v16.4s,  v8.16b, v0.4b[1]
324        LDR     d10,  [x5], 8
325        SDOT    v17.4s,  v8.16b, v1.4b[1]
326        INS     v9.d[1], x11
327        SDOT    v18.4s,  v8.16b, v2.4b[1]
328        LDR     x11,  [x5], 8
329        SDOT    v19.4s,  v8.16b, v3.4b[1]
330
331        # BLOCK 1
332        SDOT    v20.4s,  v9.16b, v0.4b[1]
333        LDR     d11,  [x5], 8
334        SDOT    v21.4s,  v9.16b, v1.4b[1]
335        INS     v10.d[1], x11
336        SDOT    v22.4s,  v9.16b, v2.4b[1]
337        LDR     x11,  [x5], 8
338        SDOT    v23.4s,  v9.16b, v3.4b[1]
339
340        # BLOCK 2
341        SDOT    v24.4s, v10.16b, v0.4b[1]
342        LDR     d8,  [x5], 8
343        SDOT    v25.4s, v10.16b, v1.4b[1]
344        INS     v11.d[1], x11
345        SDOT    v26.4s, v10.16b, v2.4b[1]
346        LDR     x11,  [x5], 8
347        SDOT    v27.4s, v10.16b, v3.4b[1]
348
349        # BLOCK 3
350        SDOT    v28.4s, v11.16b, v0.4b[1]
351        LDR     d9,  [x5], 8
352        SDOT    v29.4s, v11.16b, v1.4b[1]
353        INS     v8.d[1], x11
354        SDOT    v30.4s, v11.16b, v2.4b[1]
355        LDR     x11,  [x5], 8
356        SDOT    v31.4s, v11.16b, v3.4b[1]
357
358        # BLOCK 0
359        SDOT    v16.4s,  v8.16b, v4.4b[0]
360        LDR     d10,  [x5], 8
361        SDOT    v17.4s,  v8.16b, v5.4b[0]
362        INS     v9.d[1], x11
363        SDOT    v18.4s,  v8.16b, v6.4b[0]
364        LDR     x11,  [x5], 8
365        SDOT    v19.4s,  v8.16b, v7.4b[0]
366
367        # BLOCK 1
368        SDOT    v20.4s,  v9.16b, v4.4b[0]
369        LDR     d11,  [x5], 8
370        SDOT    v21.4s,  v9.16b, v5.4b[0]
371        INS     v10.d[1], x11
372        SDOT    v22.4s,  v9.16b, v6.4b[0]
373        LDR     x11,  [x5], 8
374        SDOT    v23.4s,  v9.16b, v7.4b[0]
375
376        # BLOCK 2
377        SDOT    v24.4s, v10.16b, v4.4b[0]
378        LDR     d8,  [x5], 8
379        SDOT    v25.4s, v10.16b, v5.4b[0]
380        INS     v11.d[1], x11
381        SDOT    v26.4s, v10.16b, v6.4b[0]
382        LDR     x11,  [x5], 8
383        SDOT    v27.4s, v10.16b, v7.4b[0]
384
385        # BLOCK 3
386        SDOT    v28.4s, v11.16b, v4.4b[0]
387        LDR     d9,  [x5], 8
388        SDOT    v29.4s, v11.16b, v5.4b[0]
389        INS     v8.d[1], x11
390        SDOT    v30.4s, v11.16b, v6.4b[0]
391        LDR     x11,  [x5], 8
392        SDOT    v31.4s, v11.16b, v7.4b[0]
393
394        # BLOCK 0
395        SDOT    v16.4s,  v8.16b, v4.4b[1]
396        LDR     d10,  [x5], 8
397        SDOT    v17.4s,  v8.16b, v5.4b[1]
398        INS     v9.d[1], x11
399        SDOT    v18.4s,  v8.16b, v6.4b[1]
400        LDR     x11,  [x5], 8
401        SDOT    v19.4s,  v8.16b, v7.4b[1]
402
403        # BLOCK 1
404        SDOT    v20.4s,  v9.16b, v4.4b[1]
405        LDR     d11,  [x5], 8           // last B loads of the epilogue (v11 low half)
406        SDOT    v21.4s,  v9.16b, v5.4b[1]
407        INS     v10.d[1], x11
408        SDOT    v22.4s,  v9.16b, v6.4b[1]
409        LDR     x11,  [x5], 8           // v11 high half
410        SDOT    v23.4s,  v9.16b, v7.4b[1]
411
412        # BLOCK 2
413        SDOT    v24.4s, v10.16b, v4.4b[1]
414        SDOT    v25.4s, v10.16b, v5.4b[1]
415        INS     v11.d[1], x11           // last use of x11 as a B temp
416        SDOT    v26.4s, v10.16b, v6.4b[1]
417        SDOT    v27.4s, v10.16b, v7.4b[1]
418        AND     x0, x2, 15              // kc remainder 0 to 12
419
420        # BLOCK 3
421        SDOT    v28.4s, v11.16b, v4.4b[1]
422        SDOT    v29.4s, v11.16b, v5.4b[1]
423        LDR     x11, [sp, 56]            // reload params pointer (entry [sp + 24], +32 for the d8-d11 save area)
424        SDOT    v30.4s, v11.16b, v6.4b[1]
425        SDOT    v31.4s, v11.16b, v7.4b[1]
426
427        # Is there a remainder?- 4 to 12 bytes of A
428        CBNZ    x0, 6f                  // yes: finish in the remainder code, which returns to 4
429
430        .p2align 3
4314:
// End of one ks step. When all ks pointers have been consumed, the 16 int32
// accumulators are requantized and stored:
//   int32 -> float (SCVTF) -> multiply by per-channel scale read from w
//   -> int32 (FCVTNS) -> saturating narrow to int16 -> saturating add of the
//   16-bit value from params -> saturating narrow to int8 -> clamp to
//   [min, max] from params -> store 16 bytes to each of the 4 rows.
432        # ks loop
433        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
434        B.HI    1b                      // more A pointers left for this tile
435
436        SCVTF   v16.4s, v16.4s
437        SCVTF   v17.4s, v17.4s
438        # Load per channel scale values from weights
439        LDR     q4, [x5], 16            // scales for v16-v19
440        SCVTF   v18.4s, v18.4s
441        SCVTF   v19.4s, v19.4s
442        LDR     q5, [x5], 16            // scales for v20-v23
443        SCVTF   v20.4s, v20.4s
444        SCVTF   v21.4s, v21.4s
445        SCVTF   v22.4s, v22.4s
446        SCVTF   v23.4s, v23.4s
447        SCVTF   v24.4s, v24.4s
448        SCVTF   v25.4s, v25.4s
449        SCVTF   v26.4s, v26.4s
450        SCVTF   v27.4s, v27.4s
451        SCVTF   v28.4s, v28.4s
452        SCVTF   v29.4s, v29.4s
453        SCVTF   v30.4s, v30.4s
454        SCVTF   v31.4s, v31.4s
455
456        LDR     q6, [x5], 16            // scales for v24-v27
457        FMUL    v16.4s, v16.4s, v4.4s
458        FMUL    v17.4s, v17.4s, v4.4s
459        FMUL    v18.4s, v18.4s, v4.4s
460        FMUL    v19.4s, v19.4s, v4.4s
461        FMUL    v20.4s, v20.4s, v5.4s
462        LDR     q4, [x5], 16            // v4 reused: scales for v28-v31
463        FMUL    v21.4s, v21.4s, v5.4s
464        FMUL    v22.4s, v22.4s, v5.4s
465        FMUL    v23.4s, v23.4s, v5.4s
466        FMUL    v24.4s, v24.4s, v6.4s
467        FMUL    v25.4s, v25.4s, v6.4s
468        FMUL    v26.4s, v26.4s, v6.4s
469        FMUL    v27.4s, v27.4s, v6.4s
470        FMUL    v28.4s, v28.4s, v4.4s
471        FMUL    v29.4s, v29.4s, v4.4s
472        FMUL    v30.4s, v30.4s, v4.4s
473        FMUL    v31.4s, v31.4s, v4.4s
474
475        FCVTNS  v16.4s, v16.4s
476        FCVTNS  v17.4s, v17.4s
477        FCVTNS  v18.4s, v18.4s
478        FCVTNS  v19.4s, v19.4s
479        FCVTNS  v20.4s, v20.4s
480        FCVTNS  v21.4s, v21.4s
481        FCVTNS  v22.4s, v22.4s
482        FCVTNS  v23.4s, v23.4s
483        FCVTNS  v24.4s, v24.4s
484        FCVTNS  v25.4s, v25.4s
485        FCVTNS  v26.4s, v26.4s
486        FCVTNS  v27.4s, v27.4s
487        FCVTNS  v28.4s, v28.4s
488        FCVTNS  v29.4s, v29.4s
489        FCVTNS  v30.4s, v30.4s
490        FCVTNS  v31.4s, v31.4s
491
// Narrow to int16: columns 0-3 / 8-11 go to the low halves here, columns
// 4-7 / 12-15 are appended by the SQXTN2 group below.
492        SQXTN   v16.4h, v16.4s
493        SQXTN   v17.4h, v17.4s
494        SQXTN   v18.4h, v18.4s
495        SQXTN   v19.4h, v19.4s
496        SQXTN   v24.4h, v24.4s
497        SQXTN   v25.4h, v25.4s
498        SQXTN   v26.4h, v26.4s
499        SQXTN   v27.4h, v27.4s
500        LD1R    {v6.8h}, [x11], 2       // add bias (16-bit value from params, applied by SQADD below)
501
502        SQXTN2  v16.8h, v20.4s
503        SQXTN2  v17.8h, v21.4s
504        SQXTN2  v18.8h, v22.4s
505        SQXTN2  v19.8h, v23.4s
506        SQXTN2  v24.8h, v28.4s
507        SQXTN2  v25.8h, v29.4s
508        SQXTN2  v26.8h, v30.4s
509        SQXTN2  v27.8h, v31.4s
510
511        SQADD   v16.8h, v16.8h, v6.8h
512        SQADD   v17.8h, v17.8h, v6.8h
513        SQADD   v18.8h, v18.8h, v6.8h
514        SQADD   v19.8h, v19.8h, v6.8h
515        SQADD   v24.8h, v24.8h, v6.8h
516        SQADD   v25.8h, v25.8h, v6.8h
517        SQADD   v26.8h, v26.8h, v6.8h
518        SQADD   v27.8h, v27.8h, v6.8h
519        LD1R    {v4.16b}, [x11], 1      // clamp min value
520
// Narrow to int8: v0-v3 end up holding the 16 output bytes of rows 0-3.
521        SQXTN   v0.8b, v16.8h
522        SQXTN   v1.8b, v17.8h
523        SQXTN   v2.8b, v18.8h
524        SQXTN   v3.8b, v19.8h
525        LD1R    {v5.16b}, [x11]         // clamp max value
526        SQXTN2  v0.16b, v24.8h
527        SQXTN2  v1.16b, v25.8h
528        SQXTN2  v2.16b, v26.8h
529        SQXTN2  v3.16b, v27.8h
530        LDR     x0, [sp, 32]            // cn_stride (entry [sp], +32 for the d8-d11 save area)
531        SMAX    v0.16b, v0.16b, v4.16b
532        SMAX    v1.16b, v1.16b, v4.16b
533        SUB     x11, x11, 3          // rewind params pointer (undo the +2 and +1 post-increments)
534        SMAX    v2.16b, v2.16b, v4.16b
535        SMAX    v3.16b, v3.16b, v4.16b
536        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.HI below
537        SMIN    v0.16b, v0.16b, v5.16b
538        SMIN    v1.16b, v1.16b, v5.16b
539        SMIN    v2.16b, v2.16b, v5.16b
540        SMIN    v3.16b, v3.16b, v5.16b
541        B.LO    7f                      // fewer than 16 columns left: partial store
542
543        # Store full 4 x 16
544        ST1     {v3.16b},  [x7], x0     // each row pointer advances by cn_stride
545        ST1     {v2.16b}, [x17], x0
546        ST1     {v1.16b}, [x16], x0
547        ST1     {v0.16b},  [x6], x0
548
549        SUB     x4, x4, x3              // a -= ks
550
551        # nc loop
552        B.HI    0b                      // flags still from SUBS x1: columns remain
553
554        # Restore d8-d11 from stack
555        LDP     d10, d11, [sp, 16]
556        LDP     d8,  d9, [sp], 32
557        RET
558
559        # Remainder- 4 to 12 bytes of A
560        # Although C4, its safe to read 16 bytes.
// Two entry points:
//   5: reached from the ks loop head when kc < 16; computes the remainder.
//   6: reached from the epilogue with x0 already holding kc & 15 (non-zero).
// Each step below consumes 4 bytes of every A row (one 32-bit lane of
// v0-v3) and 64 bytes of B, then branches back to 4 once the remainder is
// used up. The A pointers are not advanced here.
561        .p2align 3
5625:
563        AND     x0, x2, 15              // kc remainder 4 to 12
5646:
565        LDR     q0, [x13]               // 16 bytes of A0; only x0 bytes are consumed
566        LDP     q8,  q9,  [x5], 32
567        LDR     q1, [x14]
568        LDR     q2, [x15]
569        LDR     q3, [x10]
570        LDP     q10, q11, [x5], 32
571        SDOT    v16.4s,  v8.16b, v0.4b[0]
572        SDOT    v17.4s,  v8.16b, v1.4b[0]
573        SDOT    v18.4s,  v8.16b, v2.4b[0]
574        SDOT    v19.4s,  v8.16b, v3.4b[0]
575        SDOT    v20.4s,  v9.16b, v0.4b[0]
576        SDOT    v21.4s,  v9.16b, v1.4b[0]
577        SDOT    v22.4s,  v9.16b, v2.4b[0]
578        SDOT    v23.4s,  v9.16b, v3.4b[0]
579        SDOT    v24.4s, v10.16b, v0.4b[0]
580        SDOT    v25.4s, v10.16b, v1.4b[0]
581        SDOT    v26.4s, v10.16b, v2.4b[0]
582        SDOT    v27.4s, v10.16b, v3.4b[0]
583        SDOT    v28.4s, v11.16b, v0.4b[0]
584        SDOT    v29.4s, v11.16b, v1.4b[0]
585        SDOT    v30.4s, v11.16b, v2.4b[0]
586        SDOT    v31.4s, v11.16b, v3.4b[0]
587        CMP     x0, 4
588        B.LS    4b                      // remainder was 4: done
589        LDP     q8,  q9,  [x5], 32
590        LDP     q10, q11,  [x5], 32
591        SDOT    v16.4s,  v8.16b, v0.4b[1]
592        SDOT    v17.4s,  v8.16b, v1.4b[1]
593        SDOT    v18.4s,  v8.16b, v2.4b[1]
594        SDOT    v19.4s,  v8.16b, v3.4b[1]
595        SDOT    v20.4s,  v9.16b, v0.4b[1]
596        SDOT    v21.4s,  v9.16b, v1.4b[1]
597        SDOT    v22.4s,  v9.16b, v2.4b[1]
598        SDOT    v23.4s,  v9.16b, v3.4b[1]
599        SDOT    v24.4s, v10.16b, v0.4b[1]
600        SDOT    v25.4s, v10.16b, v1.4b[1]
601        SDOT    v26.4s, v10.16b, v2.4b[1]
602        SDOT    v27.4s, v10.16b, v3.4b[1]
603        SDOT    v28.4s, v11.16b, v0.4b[1]
604        SDOT    v29.4s, v11.16b, v1.4b[1]
605        SDOT    v30.4s, v11.16b, v2.4b[1]
606        SDOT    v31.4s, v11.16b, v3.4b[1]
607        CMP     x0, 8
608        B.LS    4b                      // remainder was 8: done
609        LDP     q8,  q9,  [x5], 32
610        LDP     q10, q11,  [x5], 32
611        SDOT    v16.4s,  v8.16b, v0.4b[2]
612        SDOT    v17.4s,  v8.16b, v1.4b[2]
613        SDOT    v18.4s,  v8.16b, v2.4b[2]
614        SDOT    v19.4s,  v8.16b, v3.4b[2]
615        SDOT    v20.4s,  v9.16b, v0.4b[2]
616        SDOT    v21.4s,  v9.16b, v1.4b[2]
617        SDOT    v22.4s,  v9.16b, v2.4b[2]
618        SDOT    v23.4s,  v9.16b, v3.4b[2]
619        SDOT    v24.4s, v10.16b, v0.4b[2]
620        SDOT    v25.4s, v10.16b, v1.4b[2]
621        SDOT    v26.4s, v10.16b, v2.4b[2]
622        SDOT    v27.4s, v10.16b, v3.4b[2]
623        SDOT    v28.4s, v11.16b, v0.4b[2]
624        SDOT    v29.4s, v11.16b, v1.4b[2]
625        SDOT    v30.4s, v11.16b, v2.4b[2]
626        SDOT    v31.4s, v11.16b, v3.4b[2]
627        B       4b                      // remainder was 12: done
628
629        # Store odd width
// Partial store for the last tile when fewer than 16 columns remain.
// x1 holds nc - 16 here (the SUBS that branched to 7 borrowed), so its low
// four bits equal those of the remaining column count. Bits 3, 2, 1 and 0
// select an 8-, 4-, 2- and 1-byte store per row; after each store the
// not-yet-written bytes are moved down to lane 0 with DUP.
630        .p2align 3
6317:
632        TBZ     x1, 3, 8f               // no 8-byte chunk
633        STR     d3, [x7], 8
634        STR     d2, [x17], 8
635        DUP     d3, v3.d[1]             // bring bytes 8-15 down to the low half
636        DUP     d2, v2.d[1]
637        STR     d1, [x16], 8
638        STR     d0, [x6], 8
639        DUP     d1, v1.d[1]
640        DUP     d0, v0.d[1]
6418:
642        TBZ     x1, 2, 9f               // no 4-byte chunk
643        STR     s3, [x7], 4
644        STR     s2, [x17], 4
645        DUP     s3, v3.s[1]             // bring the next 4 bytes down to lane 0
646        DUP     s2, v2.s[1]
647        STR     s1, [x16], 4
648        STR     s0, [x6], 4
649        DUP     s1, v1.s[1]
650        DUP     s0, v0.s[1]
6519:
652        TBZ     x1, 1, 10f              // no 2-byte chunk
653        STR     h3, [x7], 2
654        STR     h2, [x17], 2
655        DUP     h3, v3.h[1]             // bring the next 2 bytes down to lane 0
656        DUP     h2, v2.h[1]
657        STR     h1, [x16], 2
658        STR     h0, [x6], 2
659        DUP     h1, v1.h[1]
660        DUP     h0, v0.h[1]
66110:
662        TBZ     x1, 0, 11f              // no final single byte
663        STR     b3, [x7]
664        STR     b2, [x17]
665        STR     b1, [x16]
666        STR     b0, [x6]
66711:
668        # Restore d8-d11 from stack
669        LDP     d10, d11, [sp, 16]
670        LDP     d8,  d9, [sp], 32
671        RET
672
673END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
674
675#ifdef __ELF__
// For ELF targets only: emit an empty .note.GNU-stack section.
676.section ".note.GNU-stack","",%progbits
677#endif
678