// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]

#include <xnnpack/assembly.h>

# void xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qu8_conv_minmax_params params) [sp + 24] -> (x11)

# NOTE(review): the int8_t types above look carried over from a signed kernel.
# This kernel multiplies with UDOT and narrows/clamps the output with
# SQXTUN/UMAX/UMIN, i.e. it treats A and C as unsigned bytes - confirm against
# the C prototype.

# Overview: computes a tile of up to 4 rows x 16 columns of 8-bit outputs.
# A is reached through an indirection buffer of row pointers (4 pointers are
# consumed per ks step). w supplies 16 x 32-bit initial accumulator values per
# tile followed by the packed B bytes. Each UDOT consumes 4 bytes of K.
# The stack arguments are read at [sp + 8..24] on entry; after the 64 byte
# callee-save area is pushed they are found at [sp + 64] (cn_stride) and
# [sp + 88] (params).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0  v4
# A1  x14  v1  v5
# A2  x15  v2  v6
# A3  x10  v3  (v0)
# B    x5  v8  v9 v10 v11
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# zero point v7 v12 v13 v14 v15
#   v7      = 32-bit word from the start of params, replicated (kernel_zero_point)
#   v12-v15 = per-row running UDOT of v7 with the A bytes (rows 0-3), later
#             subtracted from the accumulators

# x11 temp for Cortex-A55 loads
#   Upper 64 bits of each B vector are loaded into x11 and moved in with INS;
#   x11 also holds the params pointer outside the K loops.

BEGIN_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp C pointers
        # Rows at or beyond mr alias the previous row so their stores are harmless.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        # Save d8-d15 to stack
        # Pre-decrements sp by 64; incoming stack arguments move up by 64 bytes.
        STP     d8, d9, [sp, -64]!

        CSEL    x17, x16, x17, LS       //   c2 = c1 (flags still from CMP x0, 2)
        BIC     x2, x2, 3               // completes the round-up of kc to a multiple of 4
        STP     d10, d11, [sp, 16]
        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        STP     d12, d13, [sp, 32]
        CSEL    x7, x17, x7, LO         //   c3 = c2
        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point
        STP     d14, d15, [sp, 48]

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Start of one 16-column output tile (nc loop target).
        LDP     q16, q20, [x5], 32

        // Clear the per-row zero point sums
        MOVI    v12.4s, 0
        MOVI    v13.4s, 0
        MOVI    v14.4s, 0
        MOVI    v15.4s, 0

        // Replicate the row 0 initial values into rows 1-3
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        // A pointer equal to the zero pointer is used as is; any other pointer
        // gets a_offset added.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset

        # Is there at least 16 bytes for prologue/epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # prologue - read A and B values for block 0 and 1
        # v9 is loaded as two 64-bit halves: d9 now, upper half via x11 + INS later.
        LDR     q8, [x5], 16
        LDR     d0, [x13], 8
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9, [x5], 8
        LDR     x11, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    3f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.
        # Each BLOCK n updates one group of 4 output columns (accumulators
        # v16+4n .. v19+4n) for all 4 rows, while loading the B vector that a
        # later block consumes.

        .p2align 3
2:
        // Group 1: first 4 bytes of A (lane 0 of v0-v3)
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        // Zero point sums for the 8 bytes of each row held in v0-v3
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        // Group 2: second 4 bytes of A (lane 1 of v0-v3); also loads the next
        // 8 bytes of A into v4, v5, v6 and (for row 3) v0
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[1]
        LDR     d4, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[1]
        LDR     d5, [x14], 8

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x15], 8

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x10], 8            // v0 now holds row 3 (A3) data

        // Group 3: rows 0-2 from v4-v6, row 3 from v0, lane 0
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        // Group 4: lane 1 of v4-v6 / v0; reloads v1, v2, v3, v0 for the next
        // iteration (or for the epilogue)
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[1]
        LDR     d1, [x14], 8

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[1]
        LDR     d2, [x15], 8

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8, [x5], 8             // First B values for block 0 and 1
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[1]
        LDR     d3, [x10], 8

        # BLOCK 3 special
        # Row 3 (v0) is consumed first so that v0 can be reloaded with row 0 data.
        UDOT    v31.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v15.2s, v7.8b, v0.8b    // free up v0 early
        INS     v8.d[1], x11
        UDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     x11, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[1]
        LDR     d0, [x13], 8
        UDOT    v30.4s, v11.16b, v6.4b[1]
        SUBS    x0, x0, 16

        // Zero point sums for rows 0-2 of the second 8 bytes (row 3 done above)
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        B.HS    2b                      // flags from SUBS x0 above

        # Epilogue.  Same as main loop but no preloads in final group
3:
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[0]

        // Zero point sums for the 8 bytes of each row held in v0-v3
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v3.4b[1]
        LDR     d4, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v3.4b[1]
        LDR     d5, [x14], 8

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v3.4b[1]
        LDR     d6, [x15], 8

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v3.4b[1]
        LDR     d0, [x10], 8            // v0 now holds row 3 (A3) data

        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[0]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        UDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v27.4s, v10.16b, v0.4b[0]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        UDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        UDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v31.4s, v11.16b, v0.4b[0]

        // Final group: only the B loads still needed by blocks 1-3 remain;
        // no A data and no further B data is preloaded.
        # BLOCK 0
        UDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        UDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        UDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v8.16b, v0.4b[1]

        # BLOCK 1
        UDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        UDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        UDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v9.16b, v0.4b[1]

        # BLOCK 2
        UDOT    v24.4s, v10.16b, v4.4b[1]
        UDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        UDOT    v26.4s, v10.16b, v6.4b[1]
        UDOT    v27.4s, v10.16b, v0.4b[1]

        # BLOCK 3
        UDOT    v28.4s, v11.16b, v4.4b[1]
        UDOT    v29.4s, v11.16b, v5.4b[1]
        UDOT    v30.4s, v11.16b, v6.4b[1]
        UDOT    v31.4s, v11.16b, v0.4b[1]

        // Zero point sums for the second 8 bytes (row 3 is in v0)
        UDOT    v12.2s, v7.8b, v4.8b
        UDOT    v13.2s, v7.8b, v5.8b
        UDOT    v14.2s, v7.8b, v6.8b
        UDOT    v15.2s, v7.8b, v0.8b

        # Is there a remainder? - 4 to 12 bytes of A
        # The low 4 bits of x0 equal those of kc because only multiples of 16
        # have been subtracted from it.
        TST     x0, 15
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 32              // p -= MR * sizeof(pointer) = 4 * 8 bytes
        B.HI    1b

        // Reduce each row's two partial zero point sums to one value and
        // broadcast it across all 4 lanes.
        ADDP    v0.2s, v12.2s, v13.2s   // v0 = {row 0 sum, row 1 sum}
        ADDP    v1.2s, v14.2s, v15.2s   // v1 = {row 2 sum, row 3 sum}
        LDR     x11, [sp, 88]           // Reload params (entry [sp + 24] plus the 64 byte save area)
        DUP     v12.4s, v0.s[0]
        DUP     v13.4s, v0.s[1]
        DUP     v14.4s, v1.s[0]
        DUP     v15.4s, v1.s[1]
        ADD     x11, x11, 4             // skip the 4 bytes already loaded into v7

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v12.4s
        SUB     v17.4s, v17.4s, v13.4s
        SUB     v18.4s, v18.4s, v14.4s
        SUB     v19.4s, v19.4s, v15.4s
        SUB     v20.4s, v20.4s, v12.4s
        SUB     v21.4s, v21.4s, v13.4s
        SUB     v22.4s, v22.4s, v14.4s
        SUB     v23.4s, v23.4s, v15.4s
        SUB     v24.4s, v24.4s, v12.4s
        SUB     v25.4s, v25.4s, v13.4s
        SUB     v26.4s, v26.4s, v14.4s
        SUB     v27.4s, v27.4s, v15.4s
        SUB     v28.4s, v28.4s, v12.4s
        SUB     v29.4s, v29.4s, v13.4s
        SUB     v30.4s, v30.4s, v14.4s
        SUB     v31.4s, v31.4s, v15.4s

        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          # Three consecutive 32-bit params are loaded: v4 (SSHL shift),
          # v5 (SQDMULH multiplier) and v6 (SRSHL shift).
          LD1R    {v4.4s}, [x11], 4
          SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
          SSHL    v17.4s, v17.4s, v4.4s
          SSHL    v18.4s, v18.4s, v4.4s
          SSHL    v19.4s, v19.4s, v4.4s
          SSHL    v20.4s, v20.4s, v4.4s
          SSHL    v21.4s, v21.4s, v4.4s
          SSHL    v22.4s, v22.4s, v4.4s
          SSHL    v23.4s, v23.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4
          SSHL    v24.4s, v24.4s, v4.4s
          SSHL    v25.4s, v25.4s, v4.4s
          SSHL    v26.4s, v26.4s, v4.4s
          SSHL    v27.4s, v27.4s, v4.4s
          SSHL    v28.4s, v28.4s, v4.4s
          SSHL    v29.4s, v29.4s, v4.4s
          SSHL    v30.4s, v30.4s, v4.4s
          SSHL    v31.4s, v31.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SQDMULH v24.4s, v24.4s, v5.4s
          SQDMULH v25.4s, v25.4s, v5.4s
          SQDMULH v26.4s, v26.4s, v5.4s
          SQDMULH v27.4s, v27.4s, v5.4s
          SQDMULH v28.4s, v28.4s, v5.4s
          SQDMULH v29.4s, v29.4s, v5.4s
          SQDMULH v30.4s, v30.4s, v5.4s
          SQDMULH v31.4s, v31.4s, v5.4s
          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
          SRSHL   v24.4s, v24.4s, v6.4s
          SRSHL   v25.4s, v25.4s, v6.4s
          SRSHL   v26.4s, v26.4s, v6.4s
          SRSHL   v27.4s, v27.4s, v6.4s
          SRSHL   v28.4s, v28.4s, v6.4s
          SRSHL   v29.4s, v29.4s, v6.4s
          SRSHL   v30.4s, v30.4s, v6.4s
          SRSHL   v31.4s, v31.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          // Convert to float, multiply by the scale in v4, then convert back
          // to int32 with round-to-nearest-even (FCVTNS).
          SCVTF   v16.4s, v16.4s
          SCVTF   v17.4s, v17.4s
          # Apply params - scale, bias and clamp
          LD1R    {v4.4s}, [x11], 4
          SCVTF   v18.4s, v18.4s
          SCVTF   v19.4s, v19.4s
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s
          SCVTF   v24.4s, v24.4s
          SCVTF   v25.4s, v25.4s
          SCVTF   v26.4s, v26.4s
          SCVTF   v27.4s, v27.4s
          SCVTF   v28.4s, v28.4s
          SCVTF   v29.4s, v29.4s
          SCVTF   v30.4s, v30.4s
          SCVTF   v31.4s, v31.4s

          FMUL    v16.4s, v16.4s, v4.4s
          FMUL    v17.4s, v17.4s, v4.4s
          FMUL    v18.4s, v18.4s, v4.4s
          FMUL    v19.4s, v19.4s, v4.4s
          FMUL    v20.4s, v20.4s, v4.4s
          FMUL    v21.4s, v21.4s, v4.4s
          FMUL    v22.4s, v22.4s, v4.4s
          FMUL    v23.4s, v23.4s, v4.4s
          FMUL    v24.4s, v24.4s, v4.4s
          FMUL    v25.4s, v25.4s, v4.4s
          FMUL    v26.4s, v26.4s, v4.4s
          FMUL    v27.4s, v27.4s, v4.4s
          FMUL    v28.4s, v28.4s, v4.4s
          FMUL    v29.4s, v29.4s, v4.4s
          FMUL    v30.4s, v30.4s, v4.4s
          FMUL    v31.4s, v31.4s, v4.4s

          FCVTNS  v16.4s, v16.4s
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s
          FCVTNS  v24.4s, v24.4s
          FCVTNS  v25.4s, v25.4s
          FCVTNS  v26.4s, v26.4s
          FCVTNS  v27.4s, v27.4s
          FCVTNS  v28.4s, v28.4s
          FCVTNS  v29.4s, v29.4s
          FCVTNS  v30.4s, v30.4s
          FCVTNS  v31.4s, v31.4s

        // Narrow int32 -> int16 with signed saturation. Each row ends up in two
        // registers: columns 0-7 in v16-v19, columns 8-15 in v24-v27.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        // Saturating add of the 16-bit bias loaded into v6
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Narrow int16 -> uint8 with unsigned saturation; v0-v3 = rows 0-3
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h
        LDR     x0, [sp, 64]            // Load cn_stride (entry [sp] plus the 64 byte save area)

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    7f                      // fewer than 16 columns left

        # Store full 4 x 16
        # Each C pointer is post-incremented by cn_stride.
        ST1     {v3.16b},  [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b},  [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Flags are still those of SUBS x1 above; the stores and SUB leave them intact.
        B.HI    0b

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Remainder- 4 to 12 bytes of A
        # Bit 3 of x0 selects an 8 byte step, bit 2 a 4 byte step.
        .p2align 3
5:
        TBZ     x0, 3, 6f

        // 8 bytes of A per row: two 4-byte K steps, 4 B vectors each
        LDR     d0, [x13], 8
        LDP     q8, q9, [x5], 32
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s, v8.16b, v0.4b[0]
        UDOT    v17.4s, v8.16b, v1.4b[0]
        UDOT    v18.4s, v8.16b, v2.4b[0]
        UDOT    v19.4s, v8.16b, v3.4b[0]
        UDOT    v20.4s, v9.16b, v0.4b[0]
        UDOT    v21.4s, v9.16b, v1.4b[0]
        UDOT    v22.4s, v9.16b, v2.4b[0]
        UDOT    v23.4s, v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        UDOT    v16.4s, v8.16b, v0.4b[1]
        UDOT    v17.4s, v8.16b, v1.4b[1]
        UDOT    v18.4s, v8.16b, v2.4b[1]
        UDOT    v19.4s, v8.16b, v3.4b[1]
        UDOT    v20.4s, v9.16b, v0.4b[1]
        UDOT    v21.4s, v9.16b, v1.4b[1]
        UDOT    v22.4s, v9.16b, v2.4b[1]
        UDOT    v23.4s, v9.16b, v3.4b[1]
        UDOT    v24.4s, v10.16b, v0.4b[1]
        UDOT    v25.4s, v10.16b, v1.4b[1]
        UDOT    v26.4s, v10.16b, v2.4b[1]
        UDOT    v27.4s, v10.16b, v3.4b[1]
        UDOT    v28.4s, v11.16b, v0.4b[1]
        UDOT    v29.4s, v11.16b, v1.4b[1]
        UDOT    v30.4s, v11.16b, v2.4b[1]
        UDOT    v31.4s, v11.16b, v3.4b[1]
        TBZ     x0, 2, 4b
6:
        // 4 bytes of A per row. LDR s zeroes the rest of the register, so the
        // upper lane of the zero point UDOTs below contributes 0.
        LDR     s0, [x13], 4
        LDP     q8, q9, [x5], 32
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDP     q10, q11, [x5], 32
        UDOT    v12.2s, v7.8b, v0.8b
        UDOT    v13.2s, v7.8b, v1.8b
        UDOT    v14.2s, v7.8b, v2.8b
        UDOT    v15.2s, v7.8b, v3.8b
        UDOT    v16.4s, v8.16b, v0.4b[0]
        UDOT    v17.4s, v8.16b, v1.4b[0]
        UDOT    v18.4s, v8.16b, v2.4b[0]
        UDOT    v19.4s, v8.16b, v3.4b[0]
        UDOT    v20.4s, v9.16b, v0.4b[0]
        UDOT    v21.4s, v9.16b, v1.4b[0]
        UDOT    v22.4s, v9.16b, v2.4b[0]
        UDOT    v23.4s, v9.16b, v3.4b[0]
        UDOT    v24.4s, v10.16b, v0.4b[0]
        UDOT    v25.4s, v10.16b, v1.4b[0]
        UDOT    v26.4s, v10.16b, v2.4b[0]
        UDOT    v27.4s, v10.16b, v3.4b[0]
        UDOT    v28.4s, v11.16b, v0.4b[0]
        UDOT    v29.4s, v11.16b, v1.4b[0]
        UDOT    v30.4s, v11.16b, v2.4b[0]
        UDOT    v31.4s, v11.16b, v3.4b[0]
        B       4b

        # Store odd width
        # x1 holds nc - 16 here; its low 4 bits equal those of the remaining nc.
        # Bits 3..0 select stores of 8, 4, 2 and 1 bytes; after each partial
        # store the not yet stored bytes are moved down to lane 0.
        .p2align 3
7:
        TBZ     x1, 3, 8f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
8:
        TBZ     x1, 2, 9f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
9:
        TBZ     x1, 1, 10f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
10:
        TBZ     x1, 0, 11f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
11:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif