xref: /aosp_15_r20/external/XNNPACK/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
$assert REQUANTIZATION in ["FP32", "RNDNU"]
$assert not CHANNELWISE or REQUANTIZATION == "FP32"

#include <xnnpack/assembly.h>

// Template variables:
//   DATATYPE          - kernel name prefix: "qc8" when CHANNELWISE, else "qs8".
//   PARAMS_UNION      - name of the params union shown in the signature comment.
//   REWIND_DECREMENT  - number of bytes the requantization code advances the
//                       params pointer with post-incrementing loads; it is
//                       subtracted again so the pointer is valid for the
//                       next column tile.
$DATATYPE = "qc8" if CHANNELWISE else "qs8"
$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
$REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
$if DATATYPE == "qu8": REWIND_DECREMENT += 4
# void xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union ${PARAMS_UNION} params [sp + 24] -> (x11)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Stack layout
# The stack offsets in the signature above are relative to sp on entry.
# The function pushes 32 bytes (d8-d11), so after the push the same
# arguments are found 32 bytes higher: cn_stride at [sp + 32] and the
# params pointer at [sp + 56].

# Register usage
# A0  x13  v0  v4
# A1  x14  v1  v5
# A2  x15  v2  v6
# A3  x10  v3  v7
# B    x5  v8  v9 v10 v11
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused v12 v13 v14 v15

# x11 temp for Cortex-A55 loads
# x11 also holds the params pointer.  The prologue/main loop/epilogue
# overwrite it with 64-bit halves of B, so the epilogue reloads the params
# pointer from the stack before requantization.

BEGIN_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp C pointers
        # Rows beyond mr alias the previous row's pointer, so the kernel
        # always computes 4 rows but stores only to valid memory.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        CSEL    x16, x6,  x16, LO       //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        STP     d8,  d9, [sp, -32]!     // Save d8-d11 on stack

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        STP     d10, d11, [sp, 16]
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // (completes the round-up of kc to a multiple of 4)

        CMP     x0, 4                   // if mr < 4
        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
        CSEL    x7,  x17, x7, LO        //   c3 = c2

        .p2align 3
0:
        # nc loop entry: one iteration per tile of 16 output columns.
        # Load initial bias from w into accumulators
        # The 16 int32 bias values are loaded once and copied to all 4 rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # ks loop entry: one iteration per group of 4 A row pointers.
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        # A pointer equal to `zero` is used as-is; any other pointer is
        # advanced by a_offset.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 += a_offset

        # Is there at least 16 bytes for prologue/epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # prologue - read A and B values for block 0 and 1
        # From here until the reload in the epilogue, x11 holds B data
        # rather than the params pointer.
        LDR     d0, [x13], 8
        LDR     q8,  [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9,  [x5], 8
        LDR     x11,  [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    3f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 rows of 4 vectors wide = 16 sdot instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
        #
        # Each 128-bit B vector is assembled from a 64-bit load into the low
        # half of the vector (LDR d), a 64-bit load into x11 (LDR x11) and an
        # INS of x11 into the high half.
        # NOTE(review): presumably this split is what lets the loads issue
        # alongside SDOT on Cortex-A55 - confirm against the core's
        # optimization guide before rescheduling.
        # v0-v3 and v4-v7 alternate as the A operands: while one set is
        # consumed, the other is reloaded.

        .p2align 3
2:
        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        LDR     d4,  [x13], 8

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7,  [x10], 8

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[0]
        LDR     d0,  [x13], 8

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[0]
        LDR     d1, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]
        LDR     d2, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]
        LDR     d3,  [x10], 8

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8,  [x5], 8            // First B values for block 0 and 1
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[1]
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[1]
        B.HS    2b

        # Epilogue.  Same as main loop but no preloads in final group
3:
        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        LDR     d4,  [x13], 8

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7,  [x10], 8

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[0]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[0]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x11,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[1]

        # BLOCK 2
        # No further B loads from here: the last 64-bit half is already in x11.
        SDOT    v24.4s, v10.16b, v4.4b[1]
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[1]
        SDOT    v27.4s, v10.16b, v7.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        SDOT    v29.4s, v11.16b, v5.4b[1]
        LDR     x11, [sp, 56]            // reload params pointer
        SDOT    v30.4s, v11.16b, v6.4b[1]
        SDOT    v31.4s, v11.16b, v7.4b[1]

        # Is there a remainder? - 4 to 12 bytes of A
        CBNZ    x0, 6f

        .p2align 3
4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Requantization: convert the int32 accumulators to int8 outputs.
        # x11 is the params pointer here and is advanced by the
        # post-incrementing loads below, then rewound before the store.
        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          LD1R    {v4.4s}, [x11], 4
          SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
          SQSHL   v17.4s, v17.4s, v4.4s
          SQSHL   v18.4s, v18.4s, v4.4s
          SQSHL   v19.4s, v19.4s, v4.4s
          SQSHL   v20.4s, v20.4s, v4.4s
          SQSHL   v21.4s, v21.4s, v4.4s
          SQSHL   v22.4s, v22.4s, v4.4s
          SQSHL   v23.4s, v23.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4
          SQSHL   v24.4s, v24.4s, v4.4s
          SQSHL   v25.4s, v25.4s, v4.4s
          SQSHL   v26.4s, v26.4s, v4.4s
          SQSHL   v27.4s, v27.4s, v4.4s
          SQSHL   v28.4s, v28.4s, v4.4s
          SQSHL   v29.4s, v29.4s, v4.4s
          SQSHL   v30.4s, v30.4s, v4.4s
          SQSHL   v31.4s, v31.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SQDMULH v24.4s, v24.4s, v5.4s
          SQDMULH v25.4s, v25.4s, v5.4s
          SQDMULH v26.4s, v26.4s, v5.4s
          SQDMULH v27.4s, v27.4s, v5.4s
          SQDMULH v28.4s, v28.4s, v5.4s
          SQDMULH v29.4s, v29.4s, v5.4s
          SQDMULH v30.4s, v30.4s, v5.4s
          SQDMULH v31.4s, v31.4s, v5.4s
          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
          SRSHL   v24.4s, v24.4s, v6.4s
          SRSHL   v25.4s, v25.4s, v6.4s
          SRSHL   v26.4s, v26.4s, v6.4s
          SRSHL   v27.4s, v27.4s, v6.4s
          SRSHL   v28.4s, v28.4s, v6.4s
          SRSHL   v29.4s, v29.4s, v6.4s
          SRSHL   v30.4s, v30.4s, v6.4s
          SRSHL   v31.4s, v31.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          SCVTF   v16.4s, v16.4s
          SCVTF   v17.4s, v17.4s
          $if not CHANNELWISE:
            # Apply params - scale, bias and clamp
            LD1R    {v4.4s}, [x11], 4
            SCVTF   v18.4s, v18.4s
            SCVTF   v19.4s, v19.4s
          $else:
            # Load per channel scale values from weights
            LDR     q4, [x5], 16
            SCVTF   v18.4s, v18.4s
            SCVTF   v19.4s, v19.4s
            LDR     q5, [x5], 16
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s
          SCVTF   v24.4s, v24.4s
          SCVTF   v25.4s, v25.4s
          SCVTF   v26.4s, v26.4s
          SCVTF   v27.4s, v27.4s
          SCVTF   v28.4s, v28.4s
          SCVTF   v29.4s, v29.4s
          SCVTF   v30.4s, v30.4s
          SCVTF   v31.4s, v31.4s

          $if CHANNELWISE:
            LDR     q6, [x5], 16
            FMUL    v16.4s, v16.4s, v4.4s
            FMUL    v17.4s, v17.4s, v4.4s
            FMUL    v18.4s, v18.4s, v4.4s
            FMUL    v19.4s, v19.4s, v4.4s
            FMUL    v20.4s, v20.4s, v5.4s
            LDR     q4, [x5], 16
            FMUL    v21.4s, v21.4s, v5.4s
            FMUL    v22.4s, v22.4s, v5.4s
            FMUL    v23.4s, v23.4s, v5.4s
            FMUL    v24.4s, v24.4s, v6.4s
            FMUL    v25.4s, v25.4s, v6.4s
            FMUL    v26.4s, v26.4s, v6.4s
            FMUL    v27.4s, v27.4s, v6.4s
            FMUL    v28.4s, v28.4s, v4.4s
            FMUL    v29.4s, v29.4s, v4.4s
            FMUL    v30.4s, v30.4s, v4.4s
            FMUL    v31.4s, v31.4s, v4.4s
          $else:
            FMUL    v16.4s, v16.4s, v4.4s
            FMUL    v17.4s, v17.4s, v4.4s
            FMUL    v18.4s, v18.4s, v4.4s
            FMUL    v19.4s, v19.4s, v4.4s
            FMUL    v20.4s, v20.4s, v4.4s
            FMUL    v21.4s, v21.4s, v4.4s
            FMUL    v22.4s, v22.4s, v4.4s
            FMUL    v23.4s, v23.4s, v4.4s
            FMUL    v24.4s, v24.4s, v4.4s
            FMUL    v25.4s, v25.4s, v4.4s
            FMUL    v26.4s, v26.4s, v4.4s
            FMUL    v27.4s, v27.4s, v4.4s
            FMUL    v28.4s, v28.4s, v4.4s
            FMUL    v29.4s, v29.4s, v4.4s
            FMUL    v30.4s, v30.4s, v4.4s
            FMUL    v31.4s, v31.4s, v4.4s

          FCVTNS  v16.4s, v16.4s
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s
          FCVTNS  v24.4s, v24.4s
          FCVTNS  v25.4s, v25.4s
          FCVTNS  v26.4s, v26.4s
          FCVTNS  v27.4s, v27.4s
          FCVTNS  v28.4s, v28.4s
          FCVTNS  v29.4s, v29.4s
          FCVTNS  v30.4s, v30.4s
          FCVTNS  v31.4s, v31.4s

        # Narrow int32 -> int16 with saturation.  Columns 0-7 of each row end
        # up in v16-v19 and columns 8-15 in v24-v27.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Narrow int16 -> int8 with saturation: one 16-byte row per v0-v3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        LDR     x0, [sp, 32]            // cn_stride
        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, ${REWIND_DECREMENT}          // rewind params pointer
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.HI below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    7f

        # Store full 4 x 16
        ST1     {v3.16b},  [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b},  [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # The condition flags are still those of SUBS x1 above: the SMIN,
        # ST1 and SUB instructions in between do not modify them.
        B.HI    0b

        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 32
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, it's safe to read 16 bytes.
        # Entry at 5: kc < 16, reached before the prologue.
        # Entry at 6: reached from the epilogue with x0 already set to kc & 15.
        .p2align 3
5:
        AND     x0, x2, 15              // kc remainder 4 to 12
6:
        LDR     q0, [x13]
        LDP     q8,  q9,  [x5], 32
        LDR     q1, [x14]
        LDR     q2, [x15]
        LDR     q3, [x10]
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        SDOT    v24.4s, v10.16b, v0.4b[0]
        SDOT    v25.4s, v10.16b, v1.4b[0]
        SDOT    v26.4s, v10.16b, v2.4b[0]
        SDOT    v27.4s, v10.16b, v3.4b[0]
        SDOT    v28.4s, v11.16b, v0.4b[0]
        SDOT    v29.4s, v11.16b, v1.4b[0]
        SDOT    v30.4s, v11.16b, v2.4b[0]
        SDOT    v31.4s, v11.16b, v3.4b[0]
        CMP     x0, 4                   // remainder <= 4: done
        B.LS    4b
        LDP     q8,  q9,  [x5], 32
        LDP     q10, q11,  [x5], 32
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        SDOT    v19.4s,  v8.16b, v3.4b[1]
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        SDOT    v23.4s,  v9.16b, v3.4b[1]
        SDOT    v24.4s, v10.16b, v0.4b[1]
        SDOT    v25.4s, v10.16b, v1.4b[1]
        SDOT    v26.4s, v10.16b, v2.4b[1]
        SDOT    v27.4s, v10.16b, v3.4b[1]
        SDOT    v28.4s, v11.16b, v0.4b[1]
        SDOT    v29.4s, v11.16b, v1.4b[1]
        SDOT    v30.4s, v11.16b, v2.4b[1]
        SDOT    v31.4s, v11.16b, v3.4b[1]
        CMP     x0, 8                   // remainder <= 8: done
        B.LS    4b
        LDP     q8,  q9,  [x5], 32
        LDP     q10, q11,  [x5], 32
        SDOT    v16.4s,  v8.16b, v0.4b[2]
        SDOT    v17.4s,  v8.16b, v1.4b[2]
        SDOT    v18.4s,  v8.16b, v2.4b[2]
        SDOT    v19.4s,  v8.16b, v3.4b[2]
        SDOT    v20.4s,  v9.16b, v0.4b[2]
        SDOT    v21.4s,  v9.16b, v1.4b[2]
        SDOT    v22.4s,  v9.16b, v2.4b[2]
        SDOT    v23.4s,  v9.16b, v3.4b[2]
        SDOT    v24.4s, v10.16b, v0.4b[2]
        SDOT    v25.4s, v10.16b, v1.4b[2]
        SDOT    v26.4s, v10.16b, v2.4b[2]
        SDOT    v27.4s, v10.16b, v3.4b[2]
        SDOT    v28.4s, v11.16b, v0.4b[2]
        SDOT    v29.4s, v11.16b, v1.4b[2]
        SDOT    v30.4s, v11.16b, v2.4b[2]
        SDOT    v31.4s, v11.16b, v3.4b[2]
        B       4b

        # Store odd width
        # x1 holds nc - 16 here; subtracting 16 leaves the low four bits
        # unchanged, so bits 3..0 of x1 select the 8-, 4-, 2- and 1-byte
        # partial stores.  After each partial store the remaining bytes are
        # moved down to the bottom of v0-v3.
        .p2align 3
7:
        TBZ     x1, 3, 8f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
8:
        TBZ     x1, 2, 9f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
9:
        TBZ     x1, 1, 10f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
10:
        TBZ     x1, 0, 11f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
11:
        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 32
        RET

END_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
754
755#ifdef __ELF__
756.section ".note.GNU-stack","",%progbits
757#endif
758