// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
//
// NOTE(review): the params type above is spelled qs8 although this is a qu8
// kernel and the layout below carries kernel_zero_point and uint8_t limits.
// Presumably inherited from the shared qs8-gemm template named at the top of
// this file; confirm against the C prototype.

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point[4];
#    int32_t right_pre_shift;
#    int32_t multiplier;
#    int32_t right_post_shift;
#    int16_t output_zero_point;
#    uint8_t output_min;
#    uint8_t output_max;
#  } rndnu_neon;
#
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4 v5 v6
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# zero_point v7
# unused v8 v9 v10 v11 v12 v13 v14 v15

# x10 x17 a53 temp registers

// Overview of the code below:
//   - Row pointers a1..a3 and c1..c3 are derived from a0 and c0 and collapse
//     onto the previous row when mr is too small, so all four rows are always
//     computed and stored.
//   - Label 0 starts one pass over 16 output columns: the int32 accumulators
//     v16-v31 are seeded from 64 bytes read at w, then k is consumed 8 bytes
//     of A per row at a time (labels 1 and 2) with a 1 to 7 byte tail (label 4).
//   - A bytes are zero-extended (UXTL); B bytes have kernel_zero_point
//     subtracted while being widened (USUBL with v7).
//   - Label 3 requantizes, clamps and stores; label 5 handles nc < 16.

BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // Load cn_stride, params
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2
        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point; x11 now at right_pre_shift

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // The same 16 int32 values seed all four rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 8 bytes for epilogue?
        B.LO    4f                      // kc < 8: only the remainder path runs

        # Prologue
        // Preload the first 8 bytes of each A row and the first 16 bytes of B;
        // x17 already holds the next 8 bytes of B.
        LDR     d0, [x3], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        UXTL    v0.8h, v0.8b
        LDR     x17, [x5, 16]
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    2f

        # Main loop - 8 bytes of A
        // Each pass consumes 8 bytes from every A row and 128 bytes of B.
        // Data for upcoming steps is loaded between the multiplies, either
        // directly with LDR d or through x17/x10 followed by INS.
        .p2align 3
1:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        LDR     x17, [x5, 112]
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        USUBL   v4.8h, v4.8b, v7.8b
        ADD     x5, x5, 128             // w advances past the 8 x 16 bytes used by this pass

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x17, [x5]               // first B bytes for the next pass
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        USUBL   v5.8h, v5.8b, v7.8b
        LDR     x10, [x3], 8            // next 8 bytes of A0, inserted into v0 below

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x17, [x13], 8           // next 8 bytes of A2, inserted into v2 below
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d1, [x15], 8
        INS     v0.d[0], x10
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x4], 8
        INS     v2.d[0], x17

        UXTL    v0.8h, v0.8b
        UXTL    v1.8h, v1.8b
        LDR     x17, [x5, 16]
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b
        B.HS    1b

        # Epilogue. Same as main loop but no preloads in final group

        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x17, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x17, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x17, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x17
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x17, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x17
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x17, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x17
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x17, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x17
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 4f

3:
        # Apply params - preshift, scale, postshift, bias and clamp
        // x11 walks the params fields in the order listed in the header comment.
        LD1R    {v4.4s}, [x11], 4       // right_pre_shift
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4       // multiplier
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4       // right_post_shift
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Narrow int32 to int16 with saturation: columns 0-3 first, 4-7 and
        // 12-15 are merged in by the SQXTN2 group below.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // output_zero_point, the bias added below

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Narrow int16 to uint8 with unsigned saturation; v0-v3 end up holding
        // one 16-byte output row each.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h
        // x11 advanced 4 + 4 + 4 + 2 + 1 = 15 bytes since label 3; step back so
        // the next pass starts again at right_pre_shift.
        SUB     x11, x11, 15            // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags consumed by B.LO and B.NE below
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns were left

        # Store full 4 x 16
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // flags still from SUBS x1: columns remain
        RET

        # Remainder- 1 to 7 bytes of A
        // Entered with kc < 8 straight from label 0, or after the epilogue when
        // kc is not a multiple of 8. One 16-byte B row is consumed per step.
        .p2align 3
4:
        AND     x0, x2, 7               // kc remainder 1 to 7

        // Each LD1 reads 8 bytes but advances its pointer by the remainder only.
        LD1     {v0.8b}, [x3], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x15], x0
        LD1     {v2.8b}, [x13], x0
        LD1     {v3.8b}, [x4], x0
        UXTL    v0.8h, v0.8b
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    3b                      // remainder == 1

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    3b                      // flags still from CMP x0, 2: remainder == 2

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    3b                      // remainder == 3

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    3b                      // flags still from CMP x0, 4: remainder == 4

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    3b                      // remainder == 5

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    3b                      // flags still from CMP x0, 6: remainder == 6

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       3b                      // remainder == 7

        # Store odd width
        // x1 holds nc - 16 here. Subtracting 16 leaves bits 0-3 equal to those
        // of nc, so each TBZ tests one bit of the remaining column count and
        // stores 8, 4, 2 and 1 bytes per row accordingly. After each partial
        // store the DUPs move the next unstored bytes down to lane 0.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        RET

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif