// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]

#include <xnnpack/assembly.h>

$REWIND_DECREMENT = {"RNDNU": 19, "FP32": 11}[REQUANTIZATION]
# void xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t** restrict a, x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x10)
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qu8_conv_minmax_params,  [sp + 24] -> (x11)

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.

# Register usage
# A0  x13  v0  v4
# A1  x14  v1  v5
# A2  x15  v2  v6
# A3  x10  v3  v7
# B    x5  v28 v29 v30 v31
# C0   x6  v16 v20
# C1  x16  v17 v21
# C2  x17  v18 v22
# C3   x7  v19 v23
# zero_point  v8  v24 v25 v26 v27
# unused  v9 v10 v11 v12 v13 v14 v15

# x11 temp for Cortex-A55 loads

BEGIN_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55

        # Clamp C pointers
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        STR     d8, [sp, -16]!          // Save d8 on stack
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3
        CMP     x0, 4                   // if mr < 4
        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOVI    v24.16b, 0
        MOVI    v25.16b, 0
        MOVI    v26.16b, 0
        MOVI    v27.16b, 0
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else += a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else += a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else += a3 + a_offset

        # Is there at least 16 bytes for prologue/epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Prologue - read A and B values for block 0 and 1
        LDR     d0, [x13], 8
        LDR     q28, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d29, [x5], 8
        LDR     x11, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    3f

        # Main loop - 16 bytes of A in 4 groups of 2 blocks.
        # 4 rows of 2 vectors wide = 8 UDOT instructions for 4 channels.
        # 4 LD64 for A.
        # 4 LD128 for W = 2 LD64 + INS.
        # For each 4 UDOT: 1 LD64 for A, 2 LD64 for W + INS.
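        # Informal note: expressing each 128-bit load of W as LDR into a D
        # register plus an LDR into the x11 temp and an INS of the high half
        # (rather than a single LDR q) is presumably what lets the 64-bit
        # loads dual-issue with the UDOTs on the in-order Cortex-A55; x11 is
        # the temp register reserved above for exactly this purpose.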

        .p2align 3
2:
        # BLOCK 0
        UDOT    v16.4s, v28.16b, v0.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v1.4b[0]
        INS     v29.d[1], x11
        UDOT    v18.4s, v28.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v28.16b, v3.4b[0]
        LDR     d4, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v0.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v1.4b[0]
        INS     v30.d[1], x11
        UDOT    v22.4s, v29.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v29.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v0.4b[1]
        LDR     d28, [x5], 8
        UDOT    v17.4s, v30.16b, v1.4b[1]
        INS     v31.d[1], x11
        UDOT    v18.4s, v30.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v30.16b, v3.4b[1]
        LDR     d6, [x15], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v0.4b[1]
        LDR     d29, [x5], 8
        UDOT    v21.4s, v31.16b, v1.4b[1]
        INS     v28.d[1], x11
        UDOT    v22.4s, v31.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v31.16b, v3.4b[1]
        LDR     d7, [x10], 8

        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v28.16b, v4.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v5.4b[0]
        INS     v29.d[1], x11
        UDOT    v18.4s, v28.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v28.16b, v7.4b[0]
        LDR     d0, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v4.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v5.4b[0]
        INS     v30.d[1], x11
        UDOT    v22.4s, v29.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v29.16b, v7.4b[0]
        LDR     d1, [x14], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v4.4b[1]
        LDR     d28, [x5], 8
        UDOT    v17.4s, v30.16b, v5.4b[1]
        INS     v31.d[1], x11
        UDOT    v18.4s, v30.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v30.16b, v7.4b[1]
        LDR     d2, [x15], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v4.4b[1]
        LDR     d29, [x5], 8
        UDOT    v21.4s, v31.16b, v5.4b[1]
        INS     v28.d[1], x11
        UDOT    v22.4s, v31.16b, v6.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v31.16b, v7.4b[1]
        LDR     d3, [x10], 8

        UDOT    v24.2s, v8.8b, v4.8b
        UDOT    v25.2s, v8.8b, v5.8b
        SUBS    x0, x0, 16
        UDOT    v26.2s, v8.8b, v6.8b
        UDOT    v27.2s, v8.8b, v7.8b

        B.HS    2b

        # Epilogue.  Same as main loop but no preloads in final group
3:
        # BLOCK 0
        UDOT    v16.4s, v28.16b, v0.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v1.4b[0]
        INS     v29.d[1], x11
        UDOT    v18.4s, v28.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v28.16b, v3.4b[0]
        LDR     d4, [x13], 8

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v0.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v1.4b[0]
        INS     v30.d[1], x11
        UDOT    v22.4s, v29.16b, v2.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v29.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v0.4b[1]
        LDR     d28, [x5], 8
        UDOT    v17.4s, v30.16b, v1.4b[1]
        INS     v31.d[1], x11
        UDOT    v18.4s, v30.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v30.16b, v3.4b[1]
        LDR     d6, [x15], 8

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v0.4b[1]
        LDR     d29, [x5], 8
        UDOT    v21.4s, v31.16b, v1.4b[1]
        INS     v28.d[1], x11
        UDOT    v22.4s, v31.16b, v2.4b[1]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v31.16b, v3.4b[1]
        LDR     d7, [x10], 8

        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT    v16.4s, v28.16b, v4.4b[0]
        LDR     d30, [x5], 8
        UDOT    v17.4s, v28.16b, v5.4b[0]
        INS     v29.d[1], x11
        UDOT    v18.4s, v28.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v19.4s, v28.16b, v7.4b[0]

        # BLOCK 1
        UDOT    v20.4s, v29.16b, v4.4b[0]
        LDR     d31, [x5], 8
        UDOT    v21.4s, v29.16b, v5.4b[0]
        INS     v30.d[1], x11
        UDOT    v22.4s, v29.16b, v6.4b[0]
        LDR     x11, [x5], 8
        UDOT    v23.4s, v29.16b, v7.4b[0]

        # BLOCK 0
        UDOT    v16.4s, v30.16b, v4.4b[1]
        UDOT    v17.4s, v30.16b, v5.4b[1]
        INS     v31.d[1], x11
        UDOT    v18.4s, v30.16b, v6.4b[1]
        UDOT    v19.4s, v30.16b, v7.4b[1]

        # BLOCK 1
        UDOT    v20.4s, v31.16b, v4.4b[1]
        UDOT    v21.4s, v31.16b, v5.4b[1]
        UDOT    v22.4s, v31.16b, v6.4b[1]
        UDOT    v23.4s, v31.16b, v7.4b[1]

        AND     x0, x2, 15              // kc remainder 0 to 12

        UDOT    v24.2s, v8.8b, v4.8b
        UDOT    v25.2s, v8.8b, v5.8b
        UDOT    v26.2s, v8.8b, v6.8b
        UDOT    v27.2s, v8.8b, v7.8b

        # Is there a remainder? - 4 to 12 bytes of A
        CBNZ    x0, 5f

        .p2align 3
4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        ADDP    v0.2s, v24.2s, v25.2s
        ADDP    v1.2s, v26.2s, v27.2s
        LDR     x11, [sp, 40]           // reload params pointer
        DUP     v24.4s, v0.s[0]
        DUP     v25.4s, v0.s[1]
        DUP     v26.4s, v1.s[0]
        DUP     v27.4s, v1.s[1]
        ADD     x11, x11, 4

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v24.4s
        SUB     v17.4s, v17.4s, v25.4s
        SUB     v18.4s, v18.4s, v26.4s
        SUB     v19.4s, v19.4s, v27.4s
        SUB     v20.4s, v20.4s, v24.4s
        SUB     v21.4s, v21.4s, v25.4s
        SUB     v22.4s, v22.4s, v26.4s
        SUB     v23.4s, v23.4s, v27.4s

        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          LD1R    {v4.4s}, [x11], 4
          SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
          SSHL    v17.4s, v17.4s, v4.4s
          SSHL    v18.4s, v18.4s, v4.4s
          SSHL    v19.4s, v19.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4
          SSHL    v20.4s, v20.4s, v4.4s
          SSHL    v21.4s, v21.4s, v4.4s
          SSHL    v22.4s, v22.4s, v4.4s
          SSHL    v23.4s, v23.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
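          # Informal note: SQDMULH keeps only the high 32 bits of the doubled
          # product, so no rounding has been applied yet; the SRSHL below uses
          # a post-shift value that is expected to be negative, i.e. it acts as
          # a signed rounding shift right that completes the rndnu rounding.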
          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          # Apply params - scale, bias and clamp
          SCVTF   v16.4s, v16.4s
          SCVTF   v17.4s, v17.4s
          LD1R    {v4.4s}, [x11], 4
          SCVTF   v18.4s, v18.4s
          SCVTF   v19.4s, v19.4s
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s

          FMUL    v16.4s, v16.4s, v4.4s
          FMUL    v17.4s, v17.4s, v4.4s
          FMUL    v18.4s, v18.4s, v4.4s
          FMUL    v19.4s, v19.4s, v4.4s
          FMUL    v20.4s, v20.4s, v4.4s
          FMUL    v21.4s, v21.4s, v4.4s
          FMUL    v22.4s, v22.4s, v4.4s
          FMUL    v23.4s, v23.4s, v4.4s

          FCVTNS  v16.4s, v16.4s
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s

        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        LDR     x10, [sp, 16]           // Load cn_stride
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v18.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v17.8h
        SQXTUN2 v1.16b, v19.8h

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        SUBS    x1, x1, 8
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        B.LO    7f

        # Store full 4 x 8
        ST1     {v1.d}[1], [x7], x10
        ST1     {v1.8b}, [x17], x10
        ST1     {v0.d}[1], [x16], x10
        ST1     {v0.8b}, [x6], x10
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore d8 from stack
        LDR     d8, [sp], 16
        RET

        # Remainder - 4 to 12 bytes of A
        .p2align 3
5:
        TBZ     x0, 3, 6f

        LDR     d0, [x13], 8
        LDR     q4, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDR     q5, [x5], 16
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        LDR     q6, [x5], 16
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        LDR     q4, [x5], 16
        UDOT    v16.4s, v6.16b, v0.4b[1]
        UDOT    v17.4s, v6.16b, v1.4b[1]
        UDOT    v18.4s, v6.16b, v2.4b[1]
        UDOT    v19.4s, v6.16b, v3.4b[1]
        UDOT    v20.4s, v4.16b, v0.4b[1]
        UDOT    v21.4s, v4.16b, v1.4b[1]
        UDOT    v22.4s, v4.16b, v2.4b[1]
        UDOT    v23.4s, v4.16b, v3.4b[1]
        TBZ     x0, 2, 4b
6:
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16
        UDOT    v24.2s, v8.8b, v0.8b
        UDOT    v25.2s, v8.8b, v1.8b
        UDOT    v26.2s, v8.8b, v2.8b
        UDOT    v27.2s, v8.8b, v3.8b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        B       4b

        # Store odd width
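        # Informal note: bit 2 of the remaining nc selects a 4-byte store per
        # row, bit 1 a 2-byte store and bit 0 a single byte; EXT rotates the
        # remaining results down between steps.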
        .p2align 3
7:
        TBZ     x1, 2, 8f
        ST1     {v1.s}[2], [x7], 4
        STR     s1, [x17], 4
        ST1     {v0.s}[2], [x16], 4
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4
        EXT     v1.16b, v1.16b, v1.16b, 4
8:
        TBZ     x1, 1, 9f
        ST1     {v1.h}[4], [x7], 2
        STR     h1, [x17], 2
        ST1     {v0.h}[4], [x16], 2
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
        EXT     v1.16b, v1.16b, v1.16b, 2
9:
        TBZ     x1, 0, 10f
        ST1     {v1.b}[8], [x7]
        STR     b1, [x17]
        ST1     {v0.b}[8], [x16]
        STR     b0, [x6]
10:
        # Restore d8 from stack
        LDR     d8, [sp], 16
        RET

END_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif