// Auto-generated file. Do not edit!
//   Template: src/qu8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# Quantized (unsigned 8-bit) GEMM micro-kernel producing a tile of up to
# 4 rows x 16 columns per pass, using the UDOT dot-product instruction.
# Each pass:
#   1. loads 16 int32 accumulators from w and replicates them to all 4 rows,
#   2. accumulates A x W with UDOT, consuming kc bytes of every A row,
#   3. accumulates (kernel_zero_point . A) per row in v12-v15 and subtracts it,
#   4. requantizes (pre-shift, multiply, rounding post-shift, zero point),
#   5. clamps to [output_min, output_max] and stores 16 (or nc & 15) bytes.
# The pass repeats until nc is exhausted.

# void xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11
#
# NOTE: a, c and the clamp bounds are handled as unsigned bytes by this kernel
# (UDOT, SQXTUN, UMAX/UMIN). x0 holds mr only during pointer setup; afterwards
# it is reused as the k loop counter.

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point[4];
#    int32_t right_pre_shift;
#    int32_t multiplier;
#    int32_t right_post_shift;
#    int16_t output_zero_point;
#    uint8_t output_min;
#    uint8_t output_max;
#  } rndnu_neon;

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v4
# A1 x15  v1  v5
# A2 x13  v2  v6
# A3  x4  v3  (v0)
# B   x5  v8  v9 v10 v11
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# zero point v7 v12 v13 v14 v15

# x14 temp for Cortex-A55 loads
# Inside the main loop and epilogue every 128-bit W vector is assembled from
# two 64-bit loads: LDR dN fills the low half, LDR x14 + INS vN.d[1] fills the
# high half. NOTE(review): presumably chosen to suit the Cortex-A55 load
# pipeline - confirm against the Cortex-A55 optimization guide.

BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers
        # Rows beyond mr alias the previous row, so the kernel always computes
        # 4 rows but only touches memory belonging to valid rows.
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // cn_stride, params
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride

        # Save d8-d15 to stack
        STP     d8, d9, [sp, -64]!
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        BIC     x2, x2, 3               // completes the round-up of kc to a multiple of 4
        STP     d10, d11, [sp, 16]

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
        STP     d12, d13, [sp, 32]
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1
        STP     d14, d15, [sp, 48]

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride

        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point, loaded once; x11 now at right_pre_shift

        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2


        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32

        # v12-v15 accumulate the per-row dot product of A with kernel_zero_point
        MOVI    v12.4s, 0
        MOVI    v13.4s, 0
        MOVI    v14.4s, 0
        MOVI    v15.4s, 0

        LDP     q24, q28, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        # Is there at least 16 bytes for prologue/epilogue?
        B.LO    4f

        # prologue - read A and B values for block 0 and 1
        LDR     d0, [x3], 8
        LDR     q8, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9, [x5], 8
        LDR     x14, [x5], 8            // high half of v9, inserted in BLOCK 0
        # Is there at least 16 bytes for main loop?
        B.LO    2f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.

        .p2align 3
1:
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        // zero point term for the first 8 bytes of each A row
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[1]
        LDR     d4, [x3], 8             // next 8 bytes of A0

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[1]
        LDR     d5, [x15], 8            // next 8 bytes of A1

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x13], 8            // next 8 bytes of A2

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x4], 8             // next 8 bytes of A3 go into v0 (A0 data is fully consumed)

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[1]
        LDR     d1, [x15], 8            // preload A1 for the next iteration / epilogue

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[1]
        LDR     d2, [x13], 8            // preload A2

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8, [x5], 8             // First B values for block 0 and 1
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[1]
        LDR     d3, [x4], 8             // preload A3

        # BLOCK 3 special
        # Row 3 (whose A data lives in v0) is issued first so that v0 can be
        # reloaded with A0 before the remaining rows of this block complete.
        UDOT    v31.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v15.2s, v7.8b, v0.8b    // free up v0 early
        INS     v8.d[1], x14
        UDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     x14, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[1]
        LDR     d0, [x3], 8             // preload A0
        UDOT    v30.4s, v11.16b, v6.4b[1]
        SUBS    x0, x0, 16

        // zero point term for the second 8 bytes of rows 0-2 (row 3 done above)
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        B.HS    1b

        # Epilogue. Same as main loop but no preloads in final group
2:
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        // zero point term for the first 8 bytes of each A row
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[1]
        LDR     d4, [x3], 8             // last 8 bytes of A0 for this 16-byte step

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[1]
        LDR     d5, [x15], 8            // A1

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x13], 8            // A2

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x4], 8             // A3 into v0

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[1]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[1]

        # BLOCK 2
        // no further W or A loads from here on: nothing is preloaded past this step
        UDOT    v24.4s, v10.16b, v4.4b[1]
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[1]
        UDOT    v27.4s, v10.16b, v0.4b[1]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[1]
        UDOT    v29.4s, v11.16b, v5.4b[1]
        UDOT    v30.4s, v11.16b, v6.4b[1]
        UDOT    v31.4s, v11.16b, v0.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        // zero point term for the second 8 bytes of each A row
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        UDOT    v15.2s, v7.8b, v0.8b

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 4f

3:
        # Reduce the two partial sums of every row and broadcast one value
        # per row: v12 = row 0, v13 = row 1, v14 = row 2, v15 = row 3.
        ADDP    v0.2s, v12.2s, v13.2s
        ADDP    v1.2s, v14.2s, v15.2s
        DUP     v12.4s, v0.s[0]
        DUP     v13.4s, v0.s[1]
        DUP     v14.4s, v1.s[0]
        DUP     v15.4s, v1.s[1]

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v12.4s
        SUB     v17.4s, v17.4s, v13.4s
        SUB     v18.4s, v18.4s, v14.4s
        SUB     v19.4s, v19.4s, v15.4s
        SUB     v20.4s, v20.4s, v12.4s
        SUB     v21.4s, v21.4s, v13.4s
        SUB     v22.4s, v22.4s, v14.4s
        SUB     v23.4s, v23.4s, v15.4s
        SUB     v24.4s, v24.4s, v12.4s
        SUB     v25.4s, v25.4s, v13.4s
        SUB     v26.4s, v26.4s, v14.4s
        SUB     v27.4s, v27.4s, v15.4s
        SUB     v28.4s, v28.4s, v12.4s
        SUB     v29.4s, v29.4s, v13.4s
        SUB     v30.4s, v30.4s, v14.4s
        SUB     v31.4s, v31.4s, v15.4s

        # Apply params - preshift, scale, postshift, output zero point and clamp
        LD1R    {v4.4s}, [x11], 4       // right_pre_shift
        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
        SSHL    v17.4s, v17.4s, v4.4s
        SSHL    v18.4s, v18.4s, v4.4s
        SSHL    v19.4s, v19.4s, v4.4s
        SSHL    v20.4s, v20.4s, v4.4s
        SSHL    v21.4s, v21.4s, v4.4s
        SSHL    v22.4s, v22.4s, v4.4s
        SSHL    v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4       // multiplier
        SSHL    v24.4s, v24.4s, v4.4s
        SSHL    v25.4s, v25.4s, v4.4s
        SSHL    v26.4s, v26.4s, v4.4s
        SSHL    v27.4s, v27.4s, v4.4s
        SSHL    v28.4s, v28.4s, v4.4s
        SSHL    v29.4s, v29.4s, v4.4s
        SSHL    v30.4s, v30.4s, v4.4s
        SSHL    v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4       // right_post_shift
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift by right_post_shift
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // narrow int32 -> int16 with saturation; columns 0-3 / 8-11 first
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // output_zero_point (added below)

        // columns 4-7 / 12-15 go into the upper halves
        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // narrow int16 -> uint8 with unsigned saturation; v0-v3 = rows 0-3
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h

        // x11 advanced 4+4+4+2+1 = 15 bytes in this pass; go back to right_pre_shift
        SUB     x11, x11, 15            // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO and B.NE below
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        ST1     {v0.16b}, [x6], x12     // c0 += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // more columns left (flags from SUBS x1)

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Remainder- 4 to 12 bytes of A
        # Bits 3 and 2 of x0 select an 8 byte step and a 4 byte step. On the
        # kc < 16 entry x0 holds kc - 16, whose low four bits equal those of kc.
        .p2align 3
4:
        TBZ     x0, 3, 5f

        // 8 bytes of A per row, 128 bytes of W
        LDR     d0, [x3], 8
        LDP     q8, q9, [x5], 32
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s, v8.16b, v0.4b[0]
        UDOT    v17.4s, v8.16b, v1.4b[0]
        UDOT    v18.4s, v8.16b, v2.4b[0]
        UDOT    v19.4s, v8.16b, v3.4b[0]
        UDOT    v20.4s, v9.16b, v0.4b[0]
        UDOT    v21.4s, v9.16b, v1.4b[0]
        UDOT    v22.4s, v9.16b, v2.4b[0]
        UDOT    v23.4s, v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        UDOT    v16.4s, v8.16b, v0.4b[1]
        UDOT    v17.4s, v8.16b, v1.4b[1]
        UDOT    v18.4s, v8.16b, v2.4b[1]
        UDOT    v19.4s, v8.16b, v3.4b[1]
        UDOT    v20.4s, v9.16b, v0.4b[1]
        UDOT    v21.4s, v9.16b, v1.4b[1]
        UDOT    v22.4s, v9.16b, v2.4b[1]
        UDOT    v23.4s, v9.16b, v3.4b[1]
        UDOT    v24.4s, v10.16b, v0.4b[1]
        UDOT    v25.4s, v10.16b, v1.4b[1]
        UDOT    v26.4s, v10.16b, v2.4b[1]
        UDOT    v27.4s, v10.16b, v3.4b[1]
        UDOT    v28.4s, v11.16b, v0.4b[1]
        UDOT    v29.4s, v11.16b, v1.4b[1]
        UDOT    v30.4s, v11.16b, v2.4b[1]
        UDOT    v31.4s, v11.16b, v3.4b[1]
        TBZ     x0, 2, 3b
5:
        // 4 bytes of A per row, 64 bytes of W; LDR sN zeroes the rest of vN,
        // so the .8b zero point UDOTs below only see these 4 bytes
        LDR     s0, [x3], 4
        LDP     q8, q9, [x5], 32
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s, v8.16b, v0.4b[0]
        UDOT    v17.4s, v8.16b, v1.4b[0]
        UDOT    v18.4s, v8.16b, v2.4b[0]
        UDOT    v19.4s, v8.16b, v3.4b[0]
        UDOT    v20.4s, v9.16b, v0.4b[0]
        UDOT    v21.4s, v9.16b, v1.4b[0]
        UDOT    v22.4s, v9.16b, v2.4b[0]
        UDOT    v23.4s, v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        B       3b

        # Store odd width
        # x1 holds nc - 16 here; its low four bits equal nc & 15, so bits 3..0
        # select stores of 8, 4, 2 and 1 bytes. After each partial store the
        # remaining bytes are moved down to the bottom of v0-v3.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
10:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif