// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]
$assert not CHANNELWISE or REQUANTIZATION == "FP32"

#include <xnnpack/assembly.h>

$DATATYPE = "qc8" if CHANNELWISE else "qs8"
$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
$REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
# Indirect GEMM microkernel: accumulates a 4 row x 16 column tile of 32-bit
# sums with SDOT (4 bytes of K per dot product), requantizes the sums and
# stores them as int8.
#
# void xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x0)
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union ${PARAMS_UNION} params  [sp + 24] -> (x11)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Only d8-d11 are used here. They are pushed with a 32 byte stack adjustment,
# so after the push the stack arguments are found 32 bytes higher:
# cn_stride at [sp, 32] and params at [sp, 56].

# Register usage
# A0  x13  v0  v4
# A1  x14  v1  v5
# A2  x15  v2  v6
# A3  x10  v3  v7
# B    x5  v8  v9 v10 v11
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused v12 v13 v14 v15

# x11 temp for Cortex-A55 loads
# x11 holds the params pointer on entry. The prologue, main loop and epilogue
# reuse it as the 64-bit temporary that carries the upper half of each B
# vector (LDR x11 followed by INS), so the epilogue reloads params from the
# stack. The remainder-only path (label 5) never overwrites x11.

# Local labels
#  0  nc loop: initialise accumulators from the bias stored in w
#  1  ks loop: fetch the next 4 A pointers
#  2  main loop, 16 bytes of K per iteration
#  3  epilogue, final 16 bytes of K without preloads for a next iteration
#  4  ks loop tail, then requantize and store
#  5  remainder entry when kc (rounded up to 4) is below 16
#  6  remainder entry after the epilogue
#  7-11 partial (fewer than 16 columns) store

BEGIN_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp C pointers
        # Rows at or beyond mr alias the previous row pointer, so their stores
        # land on a valid row. Argument loads, the kc round-up and the d8-d11
        # spill are interleaved; none of them modify the flags set by CMP.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        STP     d8, d9, [sp, -32]!      // Push 32 bytes, save d8-d9

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        STP     d10, d11, [sp, 16]      // Save d10-d11
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // second half of the kc round-up

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # 16 int32 values are read; each group of 4 columns is copied to all
        # 4 rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        # A pointer equal to zero is used as is; any other pointer has
        # a_offset added.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 16 bytes for prologue/epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # prologue - read A and B values for block 0 and 1
        # v8 is loaded whole. For v9 the lower half goes to d9 and the upper
        # half to x11; the INS in BLOCK 0 joins them.
        LDR     d0, [x13], 8
        LDR     q8, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9, [x5], 8
        LDR     x11, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    3f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 rows of 4 vectors wide = 16 sdot instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
        # NOTE(review): the LD64 + INS split is presumably chosen to suit
        # Cortex-A55 load issue rules - confirm against the optimization guide.

        .p2align 3
2:
        # Group 1 of 4: K bytes 0-3 from v0-v3, preload next A into v4-v7
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[0]
        LDR     d4, [x13], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7, [x10], 8

        # Group 2 of 4: K bytes 4-7 from v0-v3
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # Group 3 of 4: K bytes 8-11 from v4-v7, preload next A into v0-v3
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[0]
        LDR     d0, [x13], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[0]
        LDR     d1, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]
        LDR     d2, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]
        LDR     d3, [x10], 8

        # Group 4 of 4: K bytes 12-15 from v4-v7
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8, [x5], 8             // First B values for block 0 and 1
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[1]
        SUBS    x0, x0, 16              // k -= 16, flags consumed by B.HS

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[1]
        B.HS    2b

        # Epilogue. Same as main loop but no preloads in final group
3:
        # Group 1 of 4: K bytes 0-3 from v0-v3, load remaining A into v4-v7
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[0]
        LDR     d4, [x13], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7, [x10], 8

        # Group 2 of 4: K bytes 4-7 from v0-v3
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # Group 3 of 4: K bytes 8-11 from v4-v7, no further A loads
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[0]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[0]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]

        # Group 4 of 4: K bytes 12-15 from v4-v7. Only v10 and v11 are still
        # loaded; nothing is preloaded for a following group.
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[1]
        SDOT    v27.4s, v10.16b, v7.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        SDOT    v29.4s, v11.16b, v5.4b[1]
        LDR     x11, [sp, 56]           // reload params pointer
        SDOT    v30.4s, v11.16b, v6.4b[1]
        SDOT    v31.4s, v11.16b, v7.4b[1]

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 6f

        .p2align 3
4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Requantize the 16 int32 accumulators. x11 is post-incremented as
        # each parameter is read and rewound by ${REWIND_DECREMENT} bytes
        # further down.
        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          # x11 advances 4 + 4 + 4 here, then 2 + 1 below = 15 bytes.
          LD1R    {v4.4s}, [x11], 4
          SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
          SQSHL   v17.4s, v17.4s, v4.4s
          SQSHL   v18.4s, v18.4s, v4.4s
          SQSHL   v19.4s, v19.4s, v4.4s
          SQSHL   v20.4s, v20.4s, v4.4s
          SQSHL   v21.4s, v21.4s, v4.4s
          SQSHL   v22.4s, v22.4s, v4.4s
          SQSHL   v23.4s, v23.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4
          SQSHL   v24.4s, v24.4s, v4.4s
          SQSHL   v25.4s, v25.4s, v4.4s
          SQSHL   v26.4s, v26.4s, v4.4s
          SQSHL   v27.4s, v27.4s, v4.4s
          SQSHL   v28.4s, v28.4s, v4.4s
          SQSHL   v29.4s, v29.4s, v4.4s
          SQSHL   v30.4s, v30.4s, v4.4s
          SQSHL   v31.4s, v31.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SQDMULH v24.4s, v24.4s, v5.4s
          SQDMULH v25.4s, v25.4s, v5.4s
          SQDMULH v26.4s, v26.4s, v5.4s
          SQDMULH v27.4s, v27.4s, v5.4s
          SQDMULH v28.4s, v28.4s, v5.4s
          SQDMULH v29.4s, v29.4s, v5.4s
          SQDMULH v30.4s, v30.4s, v5.4s
          SQDMULH v31.4s, v31.4s, v5.4s
          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
          SRSHL   v24.4s, v24.4s, v6.4s
          SRSHL   v25.4s, v25.4s, v6.4s
          SRSHL   v26.4s, v26.4s, v6.4s
          SRSHL   v27.4s, v27.4s, v6.4s
          SRSHL   v28.4s, v28.4s, v6.4s
          SRSHL   v29.4s, v29.4s, v6.4s
          SRSHL   v30.4s, v30.4s, v6.4s
          SRSHL   v31.4s, v31.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          SCVTF   v16.4s, v16.4s
          SCVTF   v17.4s, v17.4s
          $if not CHANNELWISE:
            # Apply params - scale, bias and clamp
            # x11 advances 4 here, then 2 + 1 below = 7 bytes.
            LD1R    {v4.4s}, [x11], 4
            SCVTF   v18.4s, v18.4s
            SCVTF   v19.4s, v19.4s
          $else:
            # Load per channel scale values from weights
            # x11 is not advanced here; only 2 + 1 below = 3 bytes.
            LDR     q4, [x5], 16
            SCVTF   v18.4s, v18.4s
            SCVTF   v19.4s, v19.4s
            LDR     q5, [x5], 16
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s
          SCVTF   v24.4s, v24.4s
          SCVTF   v25.4s, v25.4s
          SCVTF   v26.4s, v26.4s
          SCVTF   v27.4s, v27.4s
          SCVTF   v28.4s, v28.4s
          SCVTF   v29.4s, v29.4s
          SCVTF   v30.4s, v30.4s
          SCVTF   v31.4s, v31.4s

          $if CHANNELWISE:
            # v4, v5, v6 scale columns 0-3, 4-7, 8-11. v4 is reloaded for
            # columns 12-15 once its last use for columns 0-3 has issued.
            LDR     q6, [x5], 16
            FMUL    v16.4s, v16.4s, v4.4s
            FMUL    v17.4s, v17.4s, v4.4s
            FMUL    v18.4s, v18.4s, v4.4s
            FMUL    v19.4s, v19.4s, v4.4s
            FMUL    v20.4s, v20.4s, v5.4s
            LDR     q4, [x5], 16
            FMUL    v21.4s, v21.4s, v5.4s
            FMUL    v22.4s, v22.4s, v5.4s
            FMUL    v23.4s, v23.4s, v5.4s
            FMUL    v24.4s, v24.4s, v6.4s
            FMUL    v25.4s, v25.4s, v6.4s
            FMUL    v26.4s, v26.4s, v6.4s
            FMUL    v27.4s, v27.4s, v6.4s
            FMUL    v28.4s, v28.4s, v4.4s
            FMUL    v29.4s, v29.4s, v4.4s
            FMUL    v30.4s, v30.4s, v4.4s
            FMUL    v31.4s, v31.4s, v4.4s
          $else:
            FMUL    v16.4s, v16.4s, v4.4s
            FMUL    v17.4s, v17.4s, v4.4s
            FMUL    v18.4s, v18.4s, v4.4s
            FMUL    v19.4s, v19.4s, v4.4s
            FMUL    v20.4s, v20.4s, v4.4s
            FMUL    v21.4s, v21.4s, v4.4s
            FMUL    v22.4s, v22.4s, v4.4s
            FMUL    v23.4s, v23.4s, v4.4s
            FMUL    v24.4s, v24.4s, v4.4s
            FMUL    v25.4s, v25.4s, v4.4s
            FMUL    v26.4s, v26.4s, v4.4s
            FMUL    v27.4s, v27.4s, v4.4s
            FMUL    v28.4s, v28.4s, v4.4s
            FMUL    v29.4s, v29.4s, v4.4s
            FMUL    v30.4s, v30.4s, v4.4s
            FMUL    v31.4s, v31.4s, v4.4s

          FCVTNS  v16.4s, v16.4s
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s
          FCVTNS  v24.4s, v24.4s
          FCVTNS  v25.4s, v25.4s
          FCVTNS  v26.4s, v26.4s
          FCVTNS  v27.4s, v27.4s
          FCVTNS  v28.4s, v28.4s
          FCVTNS  v29.4s, v29.4s
          FCVTNS  v30.4s, v30.4s
          FCVTNS  v31.4s, v31.4s

        # Narrow 32 -> 16 bits with saturation: v16-v19 receive columns 0-7
        # and v24-v27 columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Narrow 16 -> 8 bits with saturation: v0-v3 hold rows 0-3, 16 columns
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        LDR     x0, [sp, 32]            // cn_stride
        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16, flags used by B.LO and B.HI
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    7f

        # Store full 4 x 16
        ST1     {v3.16b}, [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b}, [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Flags are still those of SUBS x1 above; ST1 and SUB leave them alone.
        B.HI    0b

        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 32
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, it is safe to read 16 bytes.
        # Lanes 0, 1 and 2 of each A vector are consumed as x0 is 4, 8 or 12.
        .p2align 3
5:
        AND     x0, x2, 15              // kc remainder 4 to 12
6:
        LDR     q0, [x13]
        LDP     q8, q9, [x5], 32
        LDR     q1, [x14]
        LDR     q2, [x15]
        LDR     q3, [x10]
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[0]
        SDOT    v17.4s, v8.16b, v1.4b[0]
        SDOT    v18.4s, v8.16b, v2.4b[0]
        SDOT    v19.4s, v8.16b, v3.4b[0]
        SDOT    v20.4s, v9.16b, v0.4b[0]
        SDOT    v21.4s, v9.16b, v1.4b[0]
        SDOT    v22.4s, v9.16b, v2.4b[0]
        SDOT    v23.4s, v9.16b, v3.4b[0]
        SDOT    v24.4s, v10.16b, v0.4b[0]
        SDOT    v25.4s, v10.16b, v1.4b[0]
        SDOT    v26.4s, v10.16b, v2.4b[0]
        SDOT    v27.4s, v10.16b, v3.4b[0]
        SDOT    v28.4s, v11.16b, v0.4b[0]
        SDOT    v29.4s, v11.16b, v1.4b[0]
        SDOT    v30.4s, v11.16b, v2.4b[0]
        SDOT    v31.4s, v11.16b, v3.4b[0]
        CMP     x0, 4
        B.LS    4b
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[1]
        SDOT    v17.4s, v8.16b, v1.4b[1]
        SDOT    v18.4s, v8.16b, v2.4b[1]
        SDOT    v19.4s, v8.16b, v3.4b[1]
        SDOT    v20.4s, v9.16b, v0.4b[1]
        SDOT    v21.4s, v9.16b, v1.4b[1]
        SDOT    v22.4s, v9.16b, v2.4b[1]
        SDOT    v23.4s, v9.16b, v3.4b[1]
        SDOT    v24.4s, v10.16b, v0.4b[1]
        SDOT    v25.4s, v10.16b, v1.4b[1]
        SDOT    v26.4s, v10.16b, v2.4b[1]
        SDOT    v27.4s, v10.16b, v3.4b[1]
        SDOT    v28.4s, v11.16b, v0.4b[1]
        SDOT    v29.4s, v11.16b, v1.4b[1]
        SDOT    v30.4s, v11.16b, v2.4b[1]
        SDOT    v31.4s, v11.16b, v3.4b[1]
        CMP     x0, 8
        B.LS    4b
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[2]
        SDOT    v17.4s, v8.16b, v1.4b[2]
        SDOT    v18.4s, v8.16b, v2.4b[2]
        SDOT    v19.4s, v8.16b, v3.4b[2]
        SDOT    v20.4s, v9.16b, v0.4b[2]
        SDOT    v21.4s, v9.16b, v1.4b[2]
        SDOT    v22.4s, v9.16b, v2.4b[2]
        SDOT    v23.4s, v9.16b, v3.4b[2]
        SDOT    v24.4s, v10.16b, v0.4b[2]
        SDOT    v25.4s, v10.16b, v1.4b[2]
        SDOT    v26.4s, v10.16b, v2.4b[2]
        SDOT    v27.4s, v10.16b, v3.4b[2]
        SDOT    v28.4s, v11.16b, v0.4b[2]
        SDOT    v29.4s, v11.16b, v1.4b[2]
        SDOT    v30.4s, v11.16b, v2.4b[2]
        SDOT    v31.4s, v11.16b, v3.4b[2]
        B       4b

        # Store odd width
        # Bits 3, 2, 1, 0 of x1 select stores of 8, 4, 2, 1 columns. After
        # each store the remaining columns are moved down to lane 0.
        .p2align 3
7:
        TBZ     x1, 3, 8f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
8:
        TBZ     x1, 2, 9f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
9:
        TBZ     x1, 1, 10f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
10:
        TBZ     x1, 0, 11f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
11:
        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 32
        RET

END_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif