// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]

#include <xnnpack/assembly.h>

$REWIND_DECREMENT = {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
# void xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                  x0
#     size_t nc,                  x1
#     size_t kc,                  x2 / x0
#     const uint8_t* restrict a,  x3
#     size_t a_stride,            x4
#     const void* restrict w,     x5
#     uint8_t* restrict c,        x6
#     size_t cm_stride,           x7
#     size_t cn_stride,           [sp] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11

$if REQUANTIZATION == "RNDNU":
  # params structure is 20 bytes
  #  struct {
  #    uint8_t kernel_zero_point[4];
  #    int32_t right_pre_shift;
  #    int32_t multiplier;
  #    int32_t right_post_shift;
  #    int16_t output_zero_point;
  #    uint8_t output_min;
  #    uint8_t output_max;
  #  } rndnu_neon;
$elif REQUANTIZATION == "FP32":
  # params structure is 12 bytes
  #  struct {
  #    uint8_t kernel_zero_point[4];
  #    float scale;
  #    int16_t output_zero_point;
  #    uint8_t output_min;
  #    uint8_t output_max;
  #  } fp32_neonv8;

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# 4x16 GEMM microkernel using UDOT (4 bytes of K per 32-bit lane).
# Each pass of the outer loop (label 0) produces one 4-row x 16-column tile:
#   - accumulators are seeded from the start of the packed weights (w),
#   - A * B is accumulated with UDOT, while kernel_zero_point * A is
#     accumulated separately per row (v12-v15) and subtracted afterwards,
#   - the int32 result is requantized, the output zero point is added, and the
#     bytes are clamped to [output_min, output_max] and stored.
# kc is rounded up to a multiple of 4, so A is read in whole 4-byte groups.

# Register usage
# A0  x3 v0 v4
# A1 x15 v1 v5
# A2 x13 v2 v6
# A3  x4 v3 (v0)
# B   x5 v8 v9 v10 v11
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# zero point v7 (kernel_zero_point) v12 v13 v14 v15 (per-row accumulators)

# x14 temp for Cortex-A55 loads (upper 64 bits of a B vector, inserted with INS)

BEGIN_FUNCTION xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row. The flags set by each
        # CMP stay live across the interleaved ADD/STP/BIC/LD1R instructions,
        # none of which write the condition flags.
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp]          // cn_stride, params
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride

        # Save d8-d15 to stack
        STP     d8, d9, [sp, -64]!
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        BIC     x2, x2, 3               // completes kc = (kc + 3) & ~3
        STP     d10, d11, [sp, 16]

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
        STP     d12, d13, [sp, 32]
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1
        STP     d14, d15, [sp, 48]

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride

        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point (x11 now points past it)

        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2


        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32

        # Clear the per-row zero-point accumulators
        MOVI    v12.4s, 0
        MOVI    v13.4s, 0
        MOVI    v14.4s, 0
        MOVI    v15.4s, 0

        LDP     q24, q28, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        # Is there at least 16 bytes for prologue/epilogue?
        B.LO    4f

        # prologue - read A and B values for block 0 and 1
        LDR     d0, [x3], 8
        LDR     q8, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9, [x5], 8
        LDR     x14, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    2f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 rows of 4 vectors wide = 16 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.

        .p2align 3
1:
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        # Accumulate kernel_zero_point * A for the first 8 bytes of each row
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[1]
        LDR     d4, [x3], 8

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[1]
        LDR     d5, [x15], 8

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x13], 8

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x4], 8             // second 8 bytes of A3 reuse v0

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[1]
        LDR     d1, [x15], 8            // preload A1 for the next iteration

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[1]
        LDR     d2, [x13], 8            // preload A2 for the next iteration

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8, [x5], 8             // First B values for block 0 and 1
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[1]
        LDR     d3, [x4], 8             // preload A3 for the next iteration

        # BLOCK 3 special
        # The last uses of v0 (A3) are issued first so that v0 can be reloaded
        # with the next A0 before the block finishes.
        UDOT    v31.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v15.2s, v7.8b, v0.8b    // free up v0 early
        INS     v8.d[1], x14
        UDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     x14, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[1]
        LDR     d0, [x3], 8             // preload A0 for the next iteration
        UDOT    v30.4s, v11.16b, v6.4b[1]
        SUBS    x0, x0, 16

        # Accumulate kernel_zero_point * A for the second 8 bytes of rows 0-2
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        B.HS    1b

        # Epilogue. Same as main loop but no preloads in final group
2:
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        # Accumulate kernel_zero_point * A for the first 8 bytes of each row
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[1]
        LDR     d4, [x3], 8

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[1]
        LDR     d5, [x15], 8

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x13], 8

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x4], 8             // second 8 bytes of A3 reuse v0

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14, [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        UDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[1]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        UDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x14, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[1]

        # BLOCK 2 (no further B loads: only the pending upper half of v11 is inserted)
        UDOT    v24.4s, v10.16b, v4.4b[1]
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        UDOT    v26.4s, v10.16b, v6.4b[1]
        UDOT    v27.4s, v10.16b, v0.4b[1]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[1]
        UDOT    v29.4s, v11.16b, v5.4b[1]
        UDOT    v30.4s, v11.16b, v6.4b[1]
        UDOT    v31.4s, v11.16b, v0.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        # Accumulate kernel_zero_point * A for the second 8 bytes of each row
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        UDOT    v15.2s, v7.8b, v0.8b

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 4f

3:
        # Reduce each row's two partial zero-point sums to one value and
        # broadcast it across a vector (v12..v15 = rows 0..3)
        ADDP    v0.2s, v12.2s, v13.2s
        ADDP    v1.2s, v14.2s, v15.2s
        DUP     v12.4s, v0.s[0]
        DUP     v13.4s, v0.s[1]
        DUP     v14.4s, v1.s[0]
        DUP     v15.4s, v1.s[1]

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v12.4s
        SUB     v17.4s, v17.4s, v13.4s
        SUB     v18.4s, v18.4s, v14.4s
        SUB     v19.4s, v19.4s, v15.4s
        SUB     v20.4s, v20.4s, v12.4s
        SUB     v21.4s, v21.4s, v13.4s
        SUB     v22.4s, v22.4s, v14.4s
        SUB     v23.4s, v23.4s, v15.4s
        SUB     v24.4s, v24.4s, v12.4s
        SUB     v25.4s, v25.4s, v13.4s
        SUB     v26.4s, v26.4s, v14.4s
        SUB     v27.4s, v27.4s, v15.4s
        SUB     v28.4s, v28.4s, v12.4s
        SUB     v29.4s, v29.4s, v13.4s
        SUB     v30.4s, v30.4s, v14.4s
        SUB     v31.4s, v31.4s, v15.4s

        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          LD1R    {v4.4s}, [x11], 4       // right_pre_shift (per the params layout)
          SSHL    v16.4s, v16.4s, v4.4s   // apply pre-shift
          SSHL    v17.4s, v17.4s, v4.4s
          SSHL    v18.4s, v18.4s, v4.4s
          SSHL    v19.4s, v19.4s, v4.4s
          SSHL    v20.4s, v20.4s, v4.4s
          SSHL    v21.4s, v21.4s, v4.4s
          SSHL    v22.4s, v22.4s, v4.4s
          SSHL    v23.4s, v23.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4       // multiplier
          SSHL    v24.4s, v24.4s, v4.4s
          SSHL    v25.4s, v25.4s, v4.4s
          SSHL    v26.4s, v26.4s, v4.4s
          SSHL    v27.4s, v27.4s, v4.4s
          SSHL    v28.4s, v28.4s, v4.4s
          SSHL    v29.4s, v29.4s, v4.4s
          SSHL    v30.4s, v30.4s, v4.4s
          SSHL    v31.4s, v31.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4       // right_post_shift
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SQDMULH v24.4s, v24.4s, v5.4s
          SQDMULH v25.4s, v25.4s, v5.4s
          SQDMULH v26.4s, v26.4s, v5.4s
          SQDMULH v27.4s, v27.4s, v5.4s
          SQDMULH v28.4s, v28.4s, v5.4s
          SQDMULH v29.4s, v29.4s, v5.4s
          SQDMULH v30.4s, v30.4s, v5.4s
          SQDMULH v31.4s, v31.4s, v5.4s
          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
          SRSHL   v24.4s, v24.4s, v6.4s
          SRSHL   v25.4s, v25.4s, v6.4s
          SRSHL   v26.4s, v26.4s, v6.4s
          SRSHL   v27.4s, v27.4s, v6.4s
          SRSHL   v28.4s, v28.4s, v6.4s
          SRSHL   v29.4s, v29.4s, v6.4s
          SRSHL   v30.4s, v30.4s, v6.4s
          SRSHL   v31.4s, v31.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          SCVTF   v16.4s, v16.4s
          SCVTF   v17.4s, v17.4s
          # Apply params - scale, bias and clamp
          LD1R    {v4.4s}, [x11], 4       // scale
          SCVTF   v18.4s, v18.4s
          SCVTF   v19.4s, v19.4s
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s
          SCVTF   v24.4s, v24.4s
          SCVTF   v25.4s, v25.4s
          SCVTF   v26.4s, v26.4s
          SCVTF   v27.4s, v27.4s
          SCVTF   v28.4s, v28.4s
          SCVTF   v29.4s, v29.4s
          SCVTF   v30.4s, v30.4s
          SCVTF   v31.4s, v31.4s

          FMUL    v16.4s, v16.4s, v4.4s
          FMUL    v17.4s, v17.4s, v4.4s
          FMUL    v18.4s, v18.4s, v4.4s
          FMUL    v19.4s, v19.4s, v4.4s
          FMUL    v20.4s, v20.4s, v4.4s
          FMUL    v21.4s, v21.4s, v4.4s
          FMUL    v22.4s, v22.4s, v4.4s
          FMUL    v23.4s, v23.4s, v4.4s
          FMUL    v24.4s, v24.4s, v4.4s
          FMUL    v25.4s, v25.4s, v4.4s
          FMUL    v26.4s, v26.4s, v4.4s
          FMUL    v27.4s, v27.4s, v4.4s
          FMUL    v28.4s, v28.4s, v4.4s
          FMUL    v29.4s, v29.4s, v4.4s
          FMUL    v30.4s, v30.4s, v4.4s
          FMUL    v31.4s, v31.4s, v4.4s

          # Convert back to int32, rounding to nearest (ties to even)
          FCVTNS  v16.4s, v16.4s
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s
          FCVTNS  v24.4s, v24.4s
          FCVTNS  v25.4s, v25.4s
          FCVTNS  v26.4s, v26.4s
          FCVTNS  v27.4s, v27.4s
          FCVTNS  v28.4s, v28.4s
          FCVTNS  v29.4s, v29.4s
          FCVTNS  v30.4s, v30.4s
          FCVTNS  v31.4s, v31.4s

        # Narrow int32 -> int16 with saturation; columns 0-7 of each row end up
        # in v16-v19 and columns 8-15 in v24-v27
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // output_zero_point

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        # Add the output zero point (saturating)
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Narrow int16 -> uint8 with unsigned saturation; v0-v3 = rows 0-3
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h

        # The decrement equals the sum of the post-increments applied to x11
        # since label 3, so x11 again points just past kernel_zero_point.
        SUB     x11, x11, ${REWIND_DECREMENT}  // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO and B.NE below
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // more columns remain (nc != 0)

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Remainder- 4 to 12 bytes of A
        # Only bits 3 and 2 of x0 are tested. On the kc < 16 path x0 holds
        # kc - 16, whose low 4 bits equal those of kc.
        .p2align 3
4:
        TBZ     x0, 3, 5f               // no 8-byte group: go to the 4-byte group

        LDR     d0, [x3], 8
        LDP     q8, q9, [x5], 32
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s, v8.16b, v0.4b[0]
        UDOT    v17.4s, v8.16b, v1.4b[0]
        UDOT    v18.4s, v8.16b, v2.4b[0]
        UDOT    v19.4s, v8.16b, v3.4b[0]
        UDOT    v20.4s, v9.16b, v0.4b[0]
        UDOT    v21.4s, v9.16b, v1.4b[0]
        UDOT    v22.4s, v9.16b, v2.4b[0]
        UDOT    v23.4s, v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        UDOT    v16.4s, v8.16b, v0.4b[1]
        UDOT    v17.4s, v8.16b, v1.4b[1]
        UDOT    v18.4s, v8.16b, v2.4b[1]
        UDOT    v19.4s, v8.16b, v3.4b[1]
        UDOT    v20.4s, v9.16b, v0.4b[1]
        UDOT    v21.4s, v9.16b, v1.4b[1]
        UDOT    v22.4s, v9.16b, v2.4b[1]
        UDOT    v23.4s, v9.16b, v3.4b[1]
        UDOT    v24.4s, v10.16b, v0.4b[1]
        UDOT    v25.4s, v10.16b, v1.4b[1]
        UDOT    v26.4s, v10.16b, v2.4b[1]
        UDOT    v27.4s, v10.16b, v3.4b[1]
        UDOT    v28.4s, v11.16b, v0.4b[1]
        UDOT    v29.4s, v11.16b, v1.4b[1]
        UDOT    v30.4s, v11.16b, v2.4b[1]
        UDOT    v31.4s, v11.16b, v3.4b[1]
        TBZ     x0, 2, 3b               // no 4-byte group left: requantize
5:
        # Final 4 bytes of A. LDR s<n> zeroes the rest of the register, so the
        # 8-byte zero-point UDOTs below add nothing from the upper 4 bytes.
        LDR     s0, [x3], 4
        LDP     q8, q9, [x5], 32
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s, v8.16b, v0.4b[0]
        UDOT    v17.4s, v8.16b, v1.4b[0]
        UDOT    v18.4s, v8.16b, v2.4b[0]
        UDOT    v19.4s, v8.16b, v3.4b[0]
        UDOT    v20.4s, v9.16b, v0.4b[0]
        UDOT    v21.4s, v9.16b, v1.4b[0]
        UDOT    v22.4s, v9.16b, v2.4b[0]
        UDOT    v23.4s, v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        B       3b

        # Store odd width
        # Bits 3..0 of x1 select 8-, 4-, 2- and 1-byte stores. After each
        # partial store the next unwritten bytes are moved down to lane 0.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
10:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif