// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// QU8 GEMM micro-kernel producing tiles of up to 4 rows x 16 columns of uint8
// output. The kernel loops over nc in steps of 16 columns (label 0).
//
// Work done per tile, as implemented below:
//   1. The 16 int32 accumulators of row 0 are loaded from the first 64 bytes
//      behind w (x5) and copied to rows 1-3.
//   2. For every k, one byte of A per row is zero-extended to 16 bits (UXTL)
//      and 16 bytes of B are widened to 16 bits while the kernel zero point
//      (v7) is subtracted (USUBL). SMLAL/SMLAL2 by element accumulate the
//      products into the 32-bit accumulators.
//   3. Requantization: SQSHL by right_pre_shift, SQDMULH by multiplier,
//      SRSHL by right_post_shift, saturating narrow to 16 bits, saturating
//      add of output_zero_point, saturating narrow to unsigned 8 bits, then
//      clamp to [output_min, output_max].
//   4. Either a full 16-byte row store per row, or a partial store of the
//      remaining 1-15 columns (label 5).
//
// Contract notes that follow from the code:
//   - kc == 0 is not handled: the remainder path (label 4) always accumulates
//     at least one k step.
//   - The remainder path loads 8 bytes of every A row even when fewer than 8
//     remain, so up to 7 bytes past the end of a row may be read.
//   - Only x0-x13, x15, x17, v0-v7 and v16-v31 are written and nothing is
//     stored to the stack, so no callee-saved state has to be preserved.
//   - Branches use numeric local labels (0: .. 9:) referenced as Nb / Nf.

# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11
//
// NOTE(review): the generated header used to say xnn_qs8_conv_minmax_params
// (inherited from the qs8 template). The qu8 name is presumed from the kernel
// name and the uint8 fields below - confirm against the C prototype.

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point[4];
#    int32_t right_pre_shift;
#    int32_t multiplier;
#    int32_t right_post_shift;
#    int16_t output_zero_point;
#    uint8_t output_min;
#    uint8_t output_max;
#  } rndnu_neon;
#
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# zero_point  v7
# unused v8 v9 v10 v11 v12 v13 v14 v15

# x10 x17 a53 temp registers
//
// Accumulator layout per row (row 0 shown): v16 = columns 0-3, v20 = columns
// 4-7, v24 = columns 8-11, v28 = columns 12-15.
// x0 is reused as the k counter once mr has been consumed. x11 walks the
// params structure and x12 holds cn_stride.
// In the main loop and epilogue half of the 64-bit loads go through x10/x17
// and are moved into the vector registers with INS.
// NOTE(review): presumably this split is what lets loads pair with the NEON
// multiplies on Cortex-A53 - the scheduling rationale is not stated here.

BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so they recompute and
        // rewrite the same output instead of touching memory outside the tile.
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // Load cn_stride, params
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        // Flags are still those of CMP x0, 2 (ADD and CSEL do not set flags).
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2
        // Broadcast the 4 zero-point bytes to every 32-bit lane. x11 is left
        // pointing at right_pre_shift for the requantization code at label 3.
        LD1R    {v7.4s}, [x11], 4        // kernel_zero_point

        .p2align 3
0:
        // Start of one 16-column tile.
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 8 bytes for epilogue?
        // kc < 8: everything is handled by the remainder code.
        B.LO    4f

        # Prologue
        // Preload 8 bytes of every A row, both halves of B for k = 0 and the
        // low half of B for k = 1 (kept in x17 until the loop inserts it).
        LDR     d0, [x3], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        UXTL    v0.8h, v0.8b
        LDR     x17, [x5, 16]
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        // Fewer than 16 bytes were left: only the preloaded group remains.
        B.LO    2f

        # Main loop - 8 bytes of A
        // Each iteration consumes the preloaded A (v0-v3) and 128 bytes of B
        // (8 k steps x 16 columns), and preloads the data of the next group.
        // v4, v5 and v6 rotate as B buffers: while one is multiplied the next
        // one is loaded and widened.
        .p2align 3
1:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]            // B k=1, columns 8-15
        INS     v5.d[0], x17            // B k=1, columns 0-7
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]           // B k=2, columns 0-7
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]            // B k=2, columns 8-15
        INS     v6.d[0], x17            // B k=2, columns 0-7
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]           // B k=3, columns 0-7
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]            // B k=3, columns 8-15
        INS     v4.d[0], x17            // B k=3, columns 0-7
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]           // B k=4, columns 0-7
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]            // B k=4, columns 8-15
        INS     v5.d[0], x17            // B k=4, columns 0-7
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]           // B k=5, columns 0-7
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]            // B k=5, columns 8-15
        INS     v6.d[0], x17            // B k=5, columns 0-7
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]           // B k=6, columns 0-7
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]           // B k=6, columns 8-15
        INS     v4.d[0], x17            // B k=6, columns 0-7
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        LDR     x17, [x5, 112]          // B k=7, columns 0-7
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]           // B k=7, columns 8-15
        INS     v4.d[0], x17            // B k=7, columns 0-7 (v4 reused)
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        USUBL   v4.8h, v4.8b, v7.8b
        ADD     x5, x5, 128             // w += 8 k steps x 16 columns

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x17, [x5]               // next group: B k=0, columns 0-7
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        USUBL   v5.8h, v5.8b, v7.8b
        LDR     x10, [x3], 8            // next 8 bytes of a0

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]             // next group: B k=0, columns 8-15
        INS     v4.d[0], x17            // next group: B k=0, columns 0-7
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x17, [x13], 8           // next 8 bytes of a2
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        // Each A register is overwritten only after its last use above.
        LDR     d1, [x15], 8            // next 8 bytes of a1
        INS     v0.d[0], x10
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x4], 8             // next 8 bytes of a3
        INS     v2.d[0], x17

        UXTL    v0.8h, v0.8b
        UXTL    v1.8h, v1.8b
        LDR     x17, [x5, 16]           // next group: B k=1, columns 0-7
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b
        B.HS    1b

        # Epilogue.  Same as main loop but no preloads in final group
        // Consumes the group preloaded by the prologue or by the last loop
        // iteration. The B load pattern for k=1..7 matches the main loop.

        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x17, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 4f

3:
        # Apply params - preshift, scale, postshift, bias and clamp
        // x11 points at right_pre_shift on entry and walks forward through
        // the params structure. It is rewound further down so that the next
        // tile starts from the same place.
        LD1R    {v4.4s}, [x11], 4
        // NOTE(review): the inherited comment below says "upper bits", but
        // the operand is the field named right_pre_shift; SQSHL shifts right
        // when the per-lane shift value is negative - confirm the sign the
        // params initialiser stores.
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Saturating narrow 32 -> 16 bits. Afterwards v16-v19 hold columns
        // 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        // v6 holds output_zero_point in every 16-bit lane.
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow signed 16 -> unsigned 8 bits. v0-v3 end up with
        // the 16 output bytes of rows 0-3.
        SQXTUN   v0.8b, v16.8h
        SQXTUN   v1.8b, v17.8h
        SQXTUN   v2.8b, v18.8h
        SQXTUN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2  v0.16b, v24.8h
        SQXTUN2  v1.16b, v25.8h
        SQXTUN2  v2.16b, v26.8h
        SQXTUN2  v3.16b, v27.8h
        // The loads above advanced x11 by 4 + 4 + 4 + 2 + 1 = 15 bytes.
        SUB     x11, x11, 15             // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        // The flags set here feed both B.LO 5f and B.NE 0b below. Nothing in
        // between writes the flags.
        SUBS    x1, x1, 16
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f

        # Store full 4 x 16
        // Each C pointer advances by cn_stride, and each A pointer is moved
        // back by kc so the next tile rereads the same rows. x5 is not
        // rewound: it keeps walking forward through w.
        ST1     {v0.16b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        B.NE    0b                      // more columns left (nc != 0)
        RET

        # Remainder- 1 to 7 bytes of A
        // Reached from label 0 when kc < 8, or from the epilogue when kc is
        // not a multiple of 8. Every step consumes 16 bytes of B. Each CMP
        // below serves two branches (B.LO, then B.EQ one step later); the
        // NEON and load instructions in between do not write the flags.
        .p2align 3
4:
        AND     x0, x2, 7               // kc remainder 1 to 7

        // 8 bytes are loaded per row but the pointers advance only by x0.
        LD1     {v0.8b},  [x3], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x15], x0
        LD1     {v2.8b}, [x13], x0
        LD1     {v3.8b},  [x4], x0
        UXTL    v0.8h, v0.8b
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    3b                      // remainder was 1

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    3b                      // remainder was 2

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    3b                      // remainder was 3

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    3b                      // remainder was 4

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    3b                      // remainder was 5

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    3b                      // remainder was 6

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       3b                      // remainder was 7

        # Store odd width
        // Reached with x1 = nc - 16 for a final tile of 1-15 columns. The low
        // four bits of x1 equal those of the remaining column count, so bits
        // 3, 2, 1 and 0 select stores of 8, 4, 2 and 1 bytes. After each
        // partial store the not yet written bytes are moved to lane 0.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        RET

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section. GNU toolchains read
// this marker as "this object does not require an executable stack".
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif