1// Auto-generated file. Do not edit!
2//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11
24
25# params structure is 20 bytes
26#  struct {
27#    uint8_t kernel_zero_point;
28#    uint8_t padding[3];
29#    int32_t right_pre_shift;
30#    int32_t multiplier;
31#    int32_t right_post_shift;
32#    int16_t output_zero_point;
33#    uint8_t output_min;
34#    uint8_t output_max;
35#  } rndnu_neon;
36#
37# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
38
39# Register usage
40# A0  x3 v0
41# A1 x15 v1
42# A2 x13 v2
43# A3  x4 v3
44# B   x5 v4  v5  v6
45# C0  x6 v16 v20 v24 v28
46# C1  x8 v17 v21 v25 v29
47# C2  x9 v18 v22 v26 v30
48# C3  x7 v19 v23 v27 v31
49# zero_point  v7
50# unused v8 v9 v10 v11 v12 v13 v14 v15
51
// QU8 GEMM microkernel, up to 4 rows x 16 columns of C per tile.
//
// Accumulates uint8 A (widened with UXTL) times (B - kernel_zero_point) into
// int32 accumulators seeded with 16 bias values from W, then requantizes
// (pre-shift, multiply, post-shift, add output zero point), narrows to uint8
// and clamps to [output_min, output_max].
//
// Numeric labels:
//   0: start of a 16-column tile - load bias into the 4 accumulator rows
//   1: main loop   - 8 bytes of A per row, B for the next step preloaded
//   2: epilogue    - last full 8 bytes of A, nothing preloaded past the block
//   4: remainder   - 1 to 7 trailing bytes of A
//   3: requantize, clamp and store
//   5: partial store when fewer than 16 columns remain
//
// Accumulator layout per row (row 0 shown): v16 = columns 0-3, v20 = 4-7,
// v24 = 8-11, v28 = 12-15.
//
// Only x0-x15, v0-v7 and v16-v31 are written and the stack is only read, so
// no registers are saved or restored.
BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // Load cn_stride, params
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2

        // kernel_zero_point is the single byte at params[0]. Replicate that
        // byte into every lane of v7 so the USUBL instructions below never
        // depend on the contents of the 3 bytes that follow it.
        LD1R    {v7.16b}, [x11]         // v7 = kernel_zero_point in every byte
        ADD     x11, x11, 4             // params += 4, now at right_pre_shift

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32      // row 0, columns 0-7
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32      // row 0, columns 8-15
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 8 bytes for epilogue?
        B.LO    4f

        # Prologue
        // Load 8 bytes of each A row and the k=0 row of B, widened to 16 bit.
        // x5 is not advanced here: the loop bodies address B at fixed offsets.
        LDR     d0, [x3], 8
        LDP     d4, d6, [x5]            // B k=0: columns 0-7, columns 8-15
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        UXTL    v0.8h, v0.8b
        USUBL   v4.8h, v4.8b, v7.8b     // B - kernel_zero_point
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    2f

        # Main loop - 8 bytes of A
        // v4, v5 and v6 rotate as B registers: each LDR refills a register
        // whose last use has just been issued, and the matching USUBL follows
        // a few instructions later.
        .p2align 3
1:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d5, [x5, 16]            // B k=1, columns 0-7
        SMLAL   v24.4s, v6.4h, v0.h[0]
        LDR     d4, [x5, 24]            // B k=1, columns 8-15
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d6, [x5, 32]            // B k=2, columns 0-7
        SMLAL   v24.4s, v4.4h, v0.h[1]
        LDR     d5, [x5, 40]            // B k=2, columns 8-15
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d4, [x5, 48]            // B k=3, columns 0-7
        SMLAL   v24.4s, v5.4h, v0.h[2]
        LDR     d6, [x5, 56]            // B k=3, columns 8-15
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d5, [x5, 64]            // B k=4, columns 0-7
        SMLAL   v24.4s, v6.4h, v0.h[3]
        LDR     d4, [x5, 72]            // B k=4, columns 8-15
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d6, [x5, 80]            // B k=5, columns 0-7
        SMLAL   v24.4s, v4.4h, v0.h[4]
        LDR     d5, [x5, 88]            // B k=5, columns 8-15
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d4, [x5, 96]            // B k=6, columns 0-7
        SMLAL   v24.4s, v5.4h, v0.h[5]
        LDR     d6, [x5, 104]           // B k=6, columns 8-15
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d4, [x5, 112]           // B k=7, columns 0-7
        SMLAL   v24.4s, v6.4h, v0.h[6]
        LDR     d5, [x5, 120]           // B k=7, columns 8-15
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        USUBL   v4.8h, v4.8b, v7.8b
        ADD     x5, x5, 128             // w += 8 k-steps * 16 columns

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        USUBL   v5.8h, v5.8b, v7.8b

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d4, [x5]                // next block: B k=0, columns 0-7
        SMLAL   v24.4s, v5.4h, v0.h[7]
        LDR     d6, [x5, 8]             // next block: B k=0, columns 8-15
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d0, [x3], 8             // next 8 bytes of A, row 0 (v0 is dead)
        SMLAL   v26.4s, v5.4h, v2.h[7]
        LDR     d1, [x15], 8            // next 8 bytes of A, row 1
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d2, [x13], 8            // next 8 bytes of A, row 2

        UXTL    v0.8h, v0.8b
        LDR     d3, [x4], 8             // next 8 bytes of A, row 3
        UXTL    v1.8h, v1.8b
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8               // k -= 8
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b
        B.HS    1b                      // another full 8 bytes after this one

        # Epilogue.  Same as main loop but no preloads in final group

        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d5, [x5, 16]            // B k=1, columns 0-7
        SMLAL   v24.4s, v6.4h, v0.h[0]
        LDR     d4, [x5, 24]            // B k=1, columns 8-15
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d6, [x5, 32]            // B k=2, columns 0-7
        SMLAL   v24.4s, v4.4h, v0.h[1]
        LDR     d5, [x5, 40]            // B k=2, columns 8-15
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d4, [x5, 48]            // B k=3, columns 0-7
        SMLAL   v24.4s, v5.4h, v0.h[2]
        LDR     d6, [x5, 56]            // B k=3, columns 8-15
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d5, [x5, 64]            // B k=4, columns 0-7
        SMLAL   v24.4s, v6.4h, v0.h[3]
        LDR     d4, [x5, 72]            // B k=4, columns 8-15
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d6, [x5, 80]            // B k=5, columns 0-7
        SMLAL   v24.4s, v4.4h, v0.h[4]
        LDR     d5, [x5, 88]            // B k=5, columns 8-15
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d4, [x5, 96]            // B k=6, columns 0-7
        SMLAL   v24.4s, v5.4h, v0.h[5]
        LDR     d6, [x5, 104]           // B k=6, columns 8-15
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d4, [x5, 112]           // B k=7, columns 0-7
        USUBL   v4.8h, v4.8b, v7.8b
        LDR     d5, [x5, 120]           // B k=7, columns 8-15
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128             // w += 8 k-steps * 16 columns
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 4f

3:
        # Apply params - preshift, scale, postshift, bias and clamp
        // x11 enters at params + 4 and walks forward through the fields; it
        // is rewound by 15 below so the next tile starts from the same place.
        LD1R    {v4.4s}, [x11], 4       // right_pre_shift
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4       // multiplier
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4       // right_post_shift
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Narrow int32 -> int16 with saturation: columns 0-3 and 8-11 first.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        // Columns 4-7 and 12-15 go into the upper halves.
        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h   // + output_zero_point, saturating
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Narrow int16 -> uint8 with unsigned saturation; v0-v3 = rows 0-3.
        SQXTUN   v0.8b, v16.8h
        SQXTUN   v1.8b, v17.8h
        SQXTUN   v2.8b, v18.8h
        SQXTUN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2  v0.16b, v24.8h
        SQXTUN2  v1.16b, v25.8h
        SQXTUN2  v2.16b, v26.8h
        SQXTUN2  v3.16b, v27.8h
        SUB     x11, x11, 15             // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns remained

        # Store full 4 x 16
        // ST1 and SUB leave the flags from SUBS x1 intact for the branch below.
        ST1     {v0.16b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        B.NE    0b                      // more columns left: next tile
        RET

        # Remainder- 1 to 7 bytes of A
        .p2align 3
4:
        AND     x0, x2, 7               // kc remainder 1 to 7

        // NOTE(review): each LD1 reads a full 8 bytes but only advances the
        // pointer by the remainder; presumably the caller guarantees readable
        // slack past the end of each A row - confirm.
        LD1     {v0.8b},  [x3], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x15], x0
        LD1     {v2.8b}, [x13], x0
        LD1     {v3.8b},  [x4], x0
        UXTL    v0.8h, v0.8b
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        // k = 0 (always executed)
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    3b                      // remainder == 1

        // k = 1. The flags from CMP x0, 2 survive the vector ops and are
        // reused by the B.EQ after this group.
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    3b                      // remainder == 2

        // k = 2
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    3b                      // remainder == 3

        // k = 3
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    3b                      // remainder == 4

        // k = 4
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    3b                      // remainder == 5

        // k = 5
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    3b                      // remainder == 6

        // k = 6 (remainder == 7)
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       3b

        # Store odd width
        // x1 holds nc - 16 here; subtracting 16 leaves bits 0-3 unchanged, so
        // they still encode the 1 to 15 remaining columns. Store 8, 4, 2, 1
        // bytes per row as selected by those bits, shifting the vector down
        // after each partial store.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        RET

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75
779
// On ELF targets, emit an empty .note.GNU-stack section: the conventional
// marker that this object does not require an executable stack.
#if defined(__ELF__)
.section .note.GNU-stack, "", %progbits
#endif
783