// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]

#include <xnnpack/assembly.h>

$REWIND_DECREMENT = {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
# QU8 (unsigned 8-bit) GEMM microkernel, 4 rows x 8 columns, 4-byte (c4)
# channel groups, using UDOT.  Instruction scheduling is hand-tuned for
# Cortex-A55: each 128-bit weight load is split into LD64 + LD64-to-GPR + INS
# so it can dual-issue with the UDOT stream (x14 is the GPR staging register).
#
# void xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp]     -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.

# Register usage
# A0 x3  v0 v4
# A1 x15 v1 v5
# A2 x13 v2 v6
# A3 x4  v3 v7
# B  x5  v28 v29 v30 v31
# C0 x6  v16 v20
# C1 x8  v17 v21
# C2 x9  v18 v22
# C3 x7  v19 v23
# zero_point v24 v25 v26 v27 v8
# unused v9 v10 v11 v12 v13 v14 v15

# x14 temp for Cortex-A55 loads

BEGIN_FUNCTION xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers: rows beyond mr alias the previous row.
        CMP x0, 2                // if mr < 2
        ADD x2, x2, 3            // kc = (kc + 3) & ~3 (round kc up to a multiple of 4)
        ADD x15, x3, x4          // a1 = a0 + a_stride
        ADD x8, x6, x7           // c1 = c0 + cm_stride
        CSEL x15, x3, x15, LO    //   a1 = a0
        CSEL x8, x6, x8, LO      //   c1 = c0
        BIC x2, x2, 3

        LDP x12, x11, [sp]       // cn_stride, params

        ADD x13, x15, x4         // a2 = a1 + a_stride
        ADD x9, x8, x7           // c2 = c1 + cm_stride
        STR d8, [sp, -16]!       // Save d8 on stack (callee-saved; v8 holds kernel_zero_point)
        // if mr <= 2 (flags from CMP x0, 2 above still valid - no intervening flag-setter)
        CSEL x13, x15, x13, LS   // a2 = a1
        CSEL x9, x8, x9, LS      // c2 = c1

        LD1R {v8.4s}, [x11], 4   // kernel_zero_point

        CMP x0, 4                // if mr < 4
        ADD x4, x13, x4          // a3 = a2 + a_stride
        ADD x7, x9, x7           // c3 = c2 + cm_stride
        CSEL x4, x13, x4, LO     //   a3 = a2
        CSEL x7, x9, x7, LO      //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP q16, q20, [x5], 32
        MOV v17.16b, v16.16b
        MOV v18.16b, v16.16b
        MOV v19.16b, v16.16b
        MOV v21.16b, v20.16b
        MOV v22.16b, v20.16b
        MOV v23.16b, v20.16b
        SUBS x0, x2, 16          // k = kc - 16
        # v24-v27 accumulate kernel_zero_point * sum(A row) for the
        # zero-point correction subtracted at label 3.
        MOVI v24.16b, 0
        MOVI v25.16b, 0
        MOVI v26.16b, 0
        MOVI v27.16b, 0

        # Is there at least 16 bytes for prologue/epilogue?
        B.LO 4f

        # prologue - read A and B values for block 0 and 1
        LDR d0, [x3], 8
        LDR q28, [x5], 16
        LDR d1, [x15], 8
        LDR d2, [x13], 8
        LDR d3, [x4], 8
        SUBS x0, x0, 16          // is there 16 for main loop?
        LDR d29, [x5], 8
        LDR x14, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO 2f

        # Main loop - 16 bytes of A in 4 groups of 2 blocks
        # 4 row of 2 vectors wide = 8 UDOT instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 UDOT, 1 LD64 for A, 2 LD64 for W + INS.

        .p2align 3
1:
        # BLOCK 0
        UDOT v16.4s, v28.16b, v0.4b[0]
        LDR d30, [x5], 8
        UDOT v17.4s, v28.16b, v1.4b[0]
        INS v29.d[1], x14
        UDOT v18.4s, v28.16b, v2.4b[0]
        LDR x14, [x5], 8
        UDOT v19.4s, v28.16b, v3.4b[0]
        LDR d4, [x3], 8

        # BLOCK 1
        UDOT v20.4s, v29.16b, v0.4b[0]
        LDR d31, [x5], 8
        UDOT v21.4s, v29.16b, v1.4b[0]
        INS v30.d[1], x14
        UDOT v22.4s, v29.16b, v2.4b[0]
        LDR x14, [x5], 8
        UDOT v23.4s, v29.16b, v3.4b[0]
        LDR d5, [x15], 8

        # BLOCK 0
        UDOT v16.4s, v30.16b, v0.4b[1]
        LDR d28, [x5], 8
        UDOT v17.4s, v30.16b, v1.4b[1]
        INS v31.d[1], x14
        UDOT v18.4s, v30.16b, v2.4b[1]
        LDR x14, [x5], 8
        UDOT v19.4s, v30.16b, v3.4b[1]
        LDR d6, [x13], 8

        # BLOCK 1
        UDOT v20.4s, v31.16b, v0.4b[1]
        LDR d29, [x5], 8
        UDOT v21.4s, v31.16b, v1.4b[1]
        INS v28.d[1], x14
        UDOT v22.4s, v31.16b, v2.4b[1]
        LDR x14, [x5], 8
        UDOT v23.4s, v31.16b, v3.4b[1]
        LDR d7, [x4], 8

        # Zero-point correction: sum first 8 A bytes of each row via UDOT with v8.
        UDOT v24.2s, v8.8b, v0.8b
        UDOT v25.2s, v8.8b, v1.8b
        UDOT v26.2s, v8.8b, v2.8b
        UDOT v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT v16.4s, v28.16b, v4.4b[0]
        LDR d30, [x5], 8
        UDOT v17.4s, v28.16b, v5.4b[0]
        INS v29.d[1], x14
        UDOT v18.4s, v28.16b, v6.4b[0]
        LDR x14, [x5], 8
        UDOT v19.4s, v28.16b, v7.4b[0]
        LDR d0, [x3], 8

        # BLOCK 1
        UDOT v20.4s, v29.16b, v4.4b[0]
        LDR d31, [x5], 8
        UDOT v21.4s, v29.16b, v5.4b[0]
        INS v30.d[1], x14
        UDOT v22.4s, v29.16b, v6.4b[0]
        LDR x14, [x5], 8
        UDOT v23.4s, v29.16b, v7.4b[0]
        LDR d1, [x15], 8

        # BLOCK 0
        UDOT v16.4s, v30.16b, v4.4b[1]
        LDR d28, [x5], 8
        UDOT v17.4s, v30.16b, v5.4b[1]
        INS v31.d[1], x14
        UDOT v18.4s, v30.16b, v6.4b[1]
        LDR x14, [x5], 8
        UDOT v19.4s, v30.16b, v7.4b[1]
        LDR d2, [x13], 8

        # BLOCK 1
        UDOT v20.4s, v31.16b, v4.4b[1]
        LDR d29, [x5], 8
        UDOT v21.4s, v31.16b, v5.4b[1]
        INS v28.d[1], x14
        UDOT v22.4s, v31.16b, v6.4b[1]
        LDR x14, [x5], 8
        UDOT v23.4s, v31.16b, v7.4b[1]
        LDR d3, [x4], 8

        # Zero-point correction for the second 8 A bytes of each row.
        UDOT v24.2s, v8.8b, v4.8b
        UDOT v25.2s, v8.8b, v5.8b
        SUBS x0, x0, 16
        UDOT v26.2s, v8.8b, v6.8b
        UDOT v27.2s, v8.8b, v7.8b

        B.HS 1b

        # Epilogue.  Same as main loop but no preloads in final group
2:
        # BLOCK 0
        UDOT v16.4s, v28.16b, v0.4b[0]
        LDR d30, [x5], 8
        UDOT v17.4s, v28.16b, v1.4b[0]
        INS v29.d[1], x14
        UDOT v18.4s, v28.16b, v2.4b[0]
        LDR x14, [x5], 8
        UDOT v19.4s, v28.16b, v3.4b[0]
        LDR d4, [x3], 8

        # BLOCK 1
        UDOT v20.4s, v29.16b, v0.4b[0]
        LDR d31, [x5], 8
        UDOT v21.4s, v29.16b, v1.4b[0]
        INS v30.d[1], x14
        UDOT v22.4s, v29.16b, v2.4b[0]
        LDR x14, [x5], 8
        UDOT v23.4s, v29.16b, v3.4b[0]
        LDR d5, [x15], 8

        # BLOCK 0
        UDOT v16.4s, v30.16b, v0.4b[1]
        LDR d28, [x5], 8
        UDOT v17.4s, v30.16b, v1.4b[1]
        INS v31.d[1], x14
        UDOT v18.4s, v30.16b, v2.4b[1]
        LDR x14, [x5], 8
        UDOT v19.4s, v30.16b, v3.4b[1]
        LDR d6, [x13], 8

        # BLOCK 1
        UDOT v20.4s, v31.16b, v0.4b[1]
        LDR d29, [x5], 8
        UDOT v21.4s, v31.16b, v1.4b[1]
        INS v28.d[1], x14
        UDOT v22.4s, v31.16b, v2.4b[1]
        LDR x14, [x5], 8
        UDOT v23.4s, v31.16b, v3.4b[1]
        LDR d7, [x4], 8

        # Zero-point correction: sum first 8 A bytes of each row.
        UDOT v24.2s, v8.8b, v0.8b
        UDOT v25.2s, v8.8b, v1.8b
        UDOT v26.2s, v8.8b, v2.8b
        UDOT v27.2s, v8.8b, v3.8b

        # BLOCK 0
        UDOT v16.4s, v28.16b, v4.4b[0]
        LDR d30, [x5], 8
        UDOT v17.4s, v28.16b, v5.4b[0]
        INS v29.d[1], x14
        UDOT v18.4s, v28.16b, v6.4b[0]
        LDR x14, [x5], 8
        UDOT v19.4s, v28.16b, v7.4b[0]

        # BLOCK 1
        UDOT v20.4s, v29.16b, v4.4b[0]
        LDR d31, [x5], 8
        UDOT v21.4s, v29.16b, v5.4b[0]
        INS v30.d[1], x14
        UDOT v22.4s, v29.16b, v6.4b[0]
        LDR x14, [x5], 8
        UDOT v23.4s, v29.16b, v7.4b[0]

        # BLOCK 0
        UDOT v16.4s, v30.16b, v4.4b[1]
        UDOT v17.4s, v30.16b, v5.4b[1]
        INS v31.d[1], x14
        UDOT v18.4s, v30.16b, v6.4b[1]
        UDOT v19.4s, v30.16b, v7.4b[1]

        # BLOCK 1
        UDOT v20.4s, v31.16b, v4.4b[1]
        UDOT v21.4s, v31.16b, v5.4b[1]
        UDOT v22.4s, v31.16b, v6.4b[1]
        UDOT v23.4s, v31.16b, v7.4b[1]

        AND x0, x2, 15           // kc remainder 0 to 12

        # Zero-point correction for the second 8 A bytes of each row.
        UDOT v24.2s, v8.8b, v4.8b
        UDOT v25.2s, v8.8b, v5.8b
        UDOT v26.2s, v8.8b, v6.8b
        UDOT v27.2s, v8.8b, v7.8b

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ x0, 4f

        .p2align 3
3:
        # Reduce each row's zero-point pair sums to a scalar and broadcast
        # (v24-v27 each end up holding one row's correction in all 4 lanes).
        ADDP v0.2s, v24.2s, v25.2s
        ADDP v1.2s, v26.2s, v27.2s
        DUP v24.4s, v0.s[0]
        DUP v25.4s, v0.s[1]
        DUP v26.4s, v1.s[0]
        DUP v27.4s, v1.s[1]

        # Subtract zero point from accumulators
        SUB v16.4s, v16.4s, v24.4s
        SUB v17.4s, v17.4s, v25.4s
        SUB v18.4s, v18.4s, v26.4s
        SUB v19.4s, v19.4s, v27.4s
        SUB v20.4s, v20.4s, v24.4s
        SUB v21.4s, v21.4s, v25.4s
        SUB v22.4s, v22.4s, v26.4s
        SUB v23.4s, v23.4s, v27.4s

        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          LD1R {v4.4s}, [x11], 4
          SSHL v16.4s, v16.4s, v4.4s   // shift to upper bits
          SSHL v17.4s, v17.4s, v4.4s
          SSHL v18.4s, v18.4s, v4.4s
          SSHL v19.4s, v19.4s, v4.4s
          LD1R {v5.4s}, [x11], 4
          SSHL v20.4s, v20.4s, v4.4s
          SSHL v21.4s, v21.4s, v4.4s
          SSHL v22.4s, v22.4s, v4.4s
          SSHL v23.4s, v23.4s, v4.4s
          LD1R {v6.4s}, [x11], 4
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SRSHL v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL v17.4s, v17.4s, v6.4s
          SRSHL v18.4s, v18.4s, v6.4s
          SRSHL v19.4s, v19.4s, v6.4s
          SRSHL v20.4s, v20.4s, v6.4s
          SRSHL v21.4s, v21.4s, v6.4s
          SRSHL v22.4s, v22.4s, v6.4s
          SRSHL v23.4s, v23.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          # Apply params - scale, bias and clamp
          SCVTF v16.4s, v16.4s
          SCVTF v17.4s, v17.4s
          LD1R {v4.4s}, [x11], 4
          SCVTF v18.4s, v18.4s
          SCVTF v19.4s, v19.4s
          SCVTF v20.4s, v20.4s
          SCVTF v21.4s, v21.4s
          SCVTF v22.4s, v22.4s
          SCVTF v23.4s, v23.4s

          FMUL v16.4s, v16.4s, v4.4s
          FMUL v17.4s, v17.4s, v4.4s
          FMUL v18.4s, v18.4s, v4.4s
          FMUL v19.4s, v19.4s, v4.4s
          FMUL v20.4s, v20.4s, v4.4s
          FMUL v21.4s, v21.4s, v4.4s
          FMUL v22.4s, v22.4s, v4.4s
          FMUL v23.4s, v23.4s, v4.4s

          # Round-to-nearest back to int32.
          FCVTNS v16.4s, v16.4s
          FCVTNS v17.4s, v17.4s
          FCVTNS v18.4s, v18.4s
          FCVTNS v19.4s, v19.4s
          FCVTNS v20.4s, v20.4s
          FCVTNS v21.4s, v21.4s
          FCVTNS v22.4s, v22.4s
          FCVTNS v23.4s, v23.4s

        # Narrow int32 -> int16, add output zero point, narrow to uint8, clamp.
        SQXTN v16.4h, v16.4s
        SQXTN v17.4h, v17.4s
        SQXTN v18.4h, v18.4s
        SQXTN v19.4h, v19.4s
        LD1R {v6.8h}, [x11], 2   // add bias

        SQXTN2 v16.8h, v20.4s
        SQXTN2 v17.8h, v21.4s
        SQXTN2 v18.8h, v22.4s
        SQXTN2 v19.8h, v23.4s

        SQADD v16.8h, v16.8h, v6.8h
        SQADD v17.8h, v17.8h, v6.8h
        SQADD v18.8h, v18.8h, v6.8h
        SQADD v19.8h, v19.8h, v6.8h
        LD1R {v4.16b}, [x11], 1  // clamp min value

        SQXTUN v0.8b, v16.8h
        SQXTUN v1.8b, v18.8h
        LD1R {v5.16b}, [x11]     // clamp max value
        SQXTUN2 v0.16b, v17.8h
        SQXTUN2 v1.16b, v19.8h
        SUB x11, x11, ${REWIND_DECREMENT}   // rewind params pointer

        UMAX v0.16b, v0.16b, v4.16b
        UMAX v1.16b, v1.16b, v4.16b
        SUBS x1, x1, 8
        UMIN v0.16b, v0.16b, v5.16b
        UMIN v1.16b, v1.16b, v5.16b
        B.LO 6f

        # Store full 4 x 8
        ST1 {v0.8b}, [x6], x12
        SUB x3, x3, x2           // a0 -= kc
        ST1 {v0.d}[1], [x8], x12
        SUB x15, x15, x2         // a1 -= kc
        ST1 {v1.8b}, [x9], x12
        SUB x13, x13, x2         // a2 -= kc
        ST1 {v1.d}[1], [x7], x12
        SUB x4, x4, x2           // a3 -= kc
        B.NE 0b

        # Restore d8 from stack
        LDR d8, [sp], 16
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, its safe to read 16 bytes.
        .p2align 3
4:
        TBZ x0, 3, 5f            // bit 3 of remainder set -> process 8 bytes first

        LDR d0, [x3], 8
        LDR q4, [x5], 16
        LDR d1, [x15], 8
        LDR d2, [x13], 8
        LDR d3, [x4], 8
        LDR q5, [x5], 16
        UDOT v24.2s, v8.8b, v0.8b
        UDOT v25.2s, v8.8b, v1.8b
        UDOT v26.2s, v8.8b, v2.8b
        UDOT v27.2s, v8.8b, v3.8b
        UDOT v16.4s, v4.16b, v0.4b[0]
        UDOT v17.4s, v4.16b, v1.4b[0]
        UDOT v18.4s, v4.16b, v2.4b[0]
        UDOT v19.4s, v4.16b, v3.4b[0]
        LDR q6, [x5], 16
        UDOT v20.4s, v5.16b, v0.4b[0]
        UDOT v21.4s, v5.16b, v1.4b[0]
        UDOT v22.4s, v5.16b, v2.4b[0]
        UDOT v23.4s, v5.16b, v3.4b[0]
        LDR q4, [x5], 16
        UDOT v16.4s, v6.16b, v0.4b[1]
        UDOT v17.4s, v6.16b, v1.4b[1]
        UDOT v18.4s, v6.16b, v2.4b[1]
        UDOT v19.4s, v6.16b, v3.4b[1]
        UDOT v20.4s, v4.16b, v0.4b[1]
        UDOT v21.4s, v4.16b, v1.4b[1]
        UDOT v22.4s, v4.16b, v2.4b[1]
        UDOT v23.4s, v4.16b, v3.4b[1]
        TBZ x0, 2, 3b            // bit 2 clear -> no 4-byte tail, go requantize
5:
        # Final 4 bytes of A per row (zero-padded into the low lanes).
        LDR s0, [x3], 4
        LDR q4, [x5], 16
        LDR s1, [x15], 4
        LDR s2, [x13], 4
        LDR s3, [x4], 4
        LDR q5, [x5], 16
        UDOT v24.2s, v8.8b, v0.8b
        UDOT v25.2s, v8.8b, v1.8b
        UDOT v26.2s, v8.8b, v2.8b
        UDOT v27.2s, v8.8b, v3.8b
        UDOT v16.4s, v4.16b, v0.4b[0]
        UDOT v17.4s, v4.16b, v1.4b[0]
        UDOT v18.4s, v4.16b, v2.4b[0]
        UDOT v19.4s, v4.16b, v3.4b[0]
        UDOT v20.4s, v5.16b, v0.4b[0]
        UDOT v21.4s, v5.16b, v1.4b[0]
        UDOT v22.4s, v5.16b, v2.4b[0]
        UDOT v23.4s, v5.16b, v3.4b[0]
        B 3b

        # Store odd width (nc remainder: tested bit-by-bit via x1)
        .p2align 3
6:
        TBZ x1, 2, 7f
        STR s0, [x6], 4
        ST1 {v0.s}[2], [x8], 4
        STR s1, [x9], 4
        ST1 {v1.s}[2], [x7], 4
        EXT v0.16b, v0.16b, v0.16b, 4
        EXT v1.16b, v1.16b, v1.16b, 4
7:
        TBZ x1, 1, 8f
        STR h0, [x6], 2
        ST1 {v0.h}[4], [x8], 2
        STR h1, [x9], 2
        ST1 {v1.h}[4], [x7], 2
        EXT v0.16b, v0.16b, v0.16b, 2
        EXT v1.16b, v1.16b, v1.16b, 2
8:
        TBZ x1, 0, 9f
        STR b0, [x6]
        ST1 {v0.b}[8], [x8]
        STR b1, [x9]
        ST1 {v1.b}[8], [x7]
9:
        # Restore d8 from stack
        LDR d8, [sp], 16
        RET

END_FUNCTION xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif