// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]

#include <xnnpack/assembly.h>

// REWIND_DECREMENT is the number of bytes by which the requantization epilogue
// post-increments the params pointer (x11) on every pass of the nc loop:
//   RNDNU: 4 + 4 + 4 + 2 + 1 = 15      FP32: 4 + 2 + 1 = 7
// The final LD1R (clamp max) does not post-increment, and kernel_zero_point is
// loaded once before the nc loop, so neither is part of the rewind.
$REWIND_DECREMENT = {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
# void xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 24]  -> x11

// NOTE(review): the prototype above spells the element type as int8_t, but the
// code treats A, W and C as unsigned bytes (UDOT, SQXTUN, UMAX/UMIN).
// Presumably the C declaration uses uint8_t - confirm against the header.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x10  v3
# B    x5  v4  v5  v6  v7
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# zero_point v8 v12 v13 v14 v15
# unused v9 v10 v11

// v8 holds kernel_zero_point broadcast from params; v12-v15 accumulate
// kernel_zero_point * A (one register per row) and are subtracted from the
// accumulators after the ks loop.
//
// Stack frame after the prologue (48 bytes pushed):
//   [sp +  0] d8        [sp + 16] d12, d13        [sp + 32] d14, d15
//   [sp + 48] cn_stride (this was [sp] on entry)
//
// Local labels:
//   0   nc loop: one 4 x 16 output tile per pass
//   1   ks loop: fetch the next 4 A row pointers
//   2   main K loop: 16 bytes of each A row per pass
//   3   end of one ks step; falls through to requantization when ks is done
//   4   K remainder entry (4 to 12 bytes); handles the 8 byte part
//   5   K remainder of 4 bytes
//   6 to 10   partial-width store (nc < 16) and return

BEGIN_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128

        # Clamp C pointers
        // Rows at index >= mr alias the previous row pointer, so their stores
        // land on a valid row. The stack arguments are read at their entry
        // offsets because sp has not been adjusted yet.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6,  x16, LO       //   c1 = c0
        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        # Save d8,d12-d15 on stack
        STR     d8, [sp, -48]!          // pre-decrement: allocates the 48 byte frame
        CSEL    x17, x16, x17, LS       //   c2 = c1 (flags still from CMP x0, 2)
        BIC     x2, x2, 3               // completes the round-up of kc to a multiple of 4
        STP     d12, d13, [sp, 16]
        CMP     x0, 4                   // if mr < 4
        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
        STP     d14, d15, [sp, 32]
        CSEL    x7,  x17, x7, LO        //   c3 = c2
        LD1R    {v8.4s}, [x11], 4       // kernel_zero_point

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // The first 64 bytes of w for this tile are 16 x 32-bit values, one per
        // output column; every row starts from the same values.
        LDP     q16, q20, [x5], 32

        // Clear the per-row zero point accumulators
        MOVI    v12.4s, 0
        MOVI    v13.4s, 0
        MOVI    v14.4s, 0
        MOVI    v15.4s, 0

        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b

        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        // A row pointer equal to the zero pointer is used as is; any other
        // pointer is advanced by a_offset.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero if equal, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero if equal, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero if equal, else a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero if equal, else a3 + a_offset

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        # Main loop - 16 bytes of A
        // Each pass reads 16 bytes from every A row and 256 bytes of w
        // (16 q registers). The w loads are interleaved with the UDOTs so that
        // v4/v5 and v6/v7 are refilled only after their last use.
        .p2align 3
2:
        LDR     q0, [x13], 16
        LDR     q4,  [x5], 16
        LDR     q1, [x14], 16
        LDR     q2, [x15], 16
        LDR     q3, [x10], 16
        LDR     q5,  [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b    // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        // K bytes 0-3 of each row (element 0 of v0-v3)
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]

        // K bytes 4-7 (element 1)
        UDOT    v16.4s, v4.16b, v0.4b[1]
        UDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[1]
        UDOT    v19.4s, v4.16b, v3.4b[1]
        UDOT    v20.4s, v5.16b, v0.4b[1]
        UDOT    v21.4s, v5.16b, v1.4b[1]
        UDOT    v22.4s, v5.16b, v2.4b[1]
        UDOT    v23.4s, v5.16b, v3.4b[1]
        UDOT    v24.4s, v6.16b, v0.4b[1]
        UDOT    v25.4s, v6.16b, v1.4b[1]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[1]
        UDOT    v27.4s, v6.16b, v3.4b[1]
        UDOT    v28.4s, v7.16b, v0.4b[1]
        UDOT    v29.4s, v7.16b, v1.4b[1]
        UDOT    v30.4s, v7.16b, v2.4b[1]
        UDOT    v31.4s, v7.16b, v3.4b[1]

        // K bytes 8-11 (element 2)
        UDOT    v16.4s, v4.16b, v0.4b[2]
        UDOT    v17.4s, v4.16b, v1.4b[2]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[2]
        UDOT    v19.4s, v4.16b, v3.4b[2]
        UDOT    v20.4s, v5.16b, v0.4b[2]
        UDOT    v21.4s, v5.16b, v1.4b[2]
        UDOT    v22.4s, v5.16b, v2.4b[2]
        UDOT    v23.4s, v5.16b, v3.4b[2]
        UDOT    v24.4s, v6.16b, v0.4b[2]
        UDOT    v25.4s, v6.16b, v1.4b[2]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[2]
        UDOT    v27.4s, v6.16b, v3.4b[2]
        UDOT    v28.4s, v7.16b, v0.4b[2]
        UDOT    v29.4s, v7.16b, v1.4b[2]
        UDOT    v30.4s, v7.16b, v2.4b[2]
        UDOT    v31.4s, v7.16b, v3.4b[2]

        // K bytes 12-15 (element 3); v4/v5 are not refilled in this group
        // because the next pass or the remainder code reloads them itself.
        UDOT    v16.4s, v4.16b, v0.4b[3]
        UDOT    v17.4s, v4.16b, v1.4b[3]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[3]
        UDOT    v19.4s, v4.16b, v3.4b[3]
        UDOT    v20.4s, v5.16b, v0.4b[3]
        UDOT    v21.4s, v5.16b, v1.4b[3]
        UDOT    v22.4s, v5.16b, v2.4b[3]
        UDOT    v23.4s, v5.16b, v3.4b[3]
        UDOT    v24.4s, v6.16b, v0.4b[3]
        UDOT    v25.4s, v6.16b, v1.4b[3]
        UDOT    v26.4s, v6.16b, v2.4b[3]
        UDOT    v27.4s, v6.16b, v3.4b[3]
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
        UDOT    v28.4s, v7.16b, v0.4b[3]
        UDOT    v29.4s, v7.16b, v1.4b[3]
        UDOT    v30.4s, v7.16b, v2.4b[3]
        UDOT    v31.4s, v7.16b, v3.4b[3]
        B.HS    2b

        # Is there a remainder?- 4 to 12 bytes of A
        // x0 has gone negative here; its low 4 bits still equal the number of
        // K bytes left (0, 4, 8 or 12, since kc was rounded to a multiple of 4).
        TST     x0, 15
        B.NE    4f

3:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Two pairwise-add passes leave the sum of all four lanes in every
        // lane of v12-v15, i.e. one zero point correction per row.
        ADDP    v0.4s, v12.4s, v12.4s
        ADDP    v1.4s, v13.4s, v13.4s
        ADDP    v2.4s, v14.4s, v14.4s
        ADDP    v3.4s, v15.4s, v15.4s
        ADDP    v12.4s, v0.4s, v0.4s
        ADDP    v13.4s, v1.4s, v1.4s
        ADDP    v14.4s, v2.4s, v2.4s
        ADDP    v15.4s, v3.4s, v3.4s

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v12.4s
        SUB     v17.4s, v17.4s, v13.4s
        SUB     v18.4s, v18.4s, v14.4s
        SUB     v19.4s, v19.4s, v15.4s
        SUB     v20.4s, v20.4s, v12.4s
        SUB     v21.4s, v21.4s, v13.4s
        SUB     v22.4s, v22.4s, v14.4s
        SUB     v23.4s, v23.4s, v15.4s
        SUB     v24.4s, v24.4s, v12.4s
        SUB     v25.4s, v25.4s, v13.4s
        SUB     v26.4s, v26.4s, v14.4s
        SUB     v27.4s, v27.4s, v15.4s
        SUB     v28.4s, v28.4s, v12.4s
        SUB     v29.4s, v29.4s, v13.4s
        SUB     v30.4s, v30.4s, v14.4s
        SUB     v31.4s, v31.4s, v15.4s

        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          // Three 32-bit params are read in order: v4 = pre-shift,
          // v5 = multiplier, v6 = post-shift.
          LD1R    {v4.4s}, [x11], 4
          SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
          SSHL    v17.4s, v17.4s, v4.4s
          SSHL    v18.4s, v18.4s, v4.4s
          SSHL    v19.4s, v19.4s, v4.4s
          SSHL    v20.4s, v20.4s, v4.4s
          SSHL    v21.4s, v21.4s, v4.4s
          SSHL    v22.4s, v22.4s, v4.4s
          SSHL    v23.4s, v23.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4
          SSHL    v24.4s, v24.4s, v4.4s
          SSHL    v25.4s, v25.4s, v4.4s
          SSHL    v26.4s, v26.4s, v4.4s
          SSHL    v27.4s, v27.4s, v4.4s
          SSHL    v28.4s, v28.4s, v4.4s
          SSHL    v29.4s, v29.4s, v4.4s
          SSHL    v30.4s, v30.4s, v4.4s
          SSHL    v31.4s, v31.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SQDMULH v24.4s, v24.4s, v5.4s
          SQDMULH v25.4s, v25.4s, v5.4s
          SQDMULH v26.4s, v26.4s, v5.4s
          SQDMULH v27.4s, v27.4s, v5.4s
          SQDMULH v28.4s, v28.4s, v5.4s
          SQDMULH v29.4s, v29.4s, v5.4s
          SQDMULH v30.4s, v30.4s, v5.4s
          SQDMULH v31.4s, v31.4s, v5.4s
          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
          SRSHL   v24.4s, v24.4s, v6.4s
          SRSHL   v25.4s, v25.4s, v6.4s
          SRSHL   v26.4s, v26.4s, v6.4s
          SRSHL   v27.4s, v27.4s, v6.4s
          SRSHL   v28.4s, v28.4s, v6.4s
          SRSHL   v29.4s, v29.4s, v6.4s
          SRSHL   v30.4s, v30.4s, v6.4s
          SRSHL   v31.4s, v31.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          // Convert to float, multiply by the single scale in v4, then convert
          // back to int32 rounding to nearest with ties to even (FCVTNS).
          SCVTF   v16.4s, v16.4s
          SCVTF   v17.4s, v17.4s
          # Apply params - scale, bias and clamp
          LD1R    {v4.4s}, [x11], 4
          SCVTF   v18.4s, v18.4s
          SCVTF   v19.4s, v19.4s
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s
          SCVTF   v24.4s, v24.4s
          SCVTF   v25.4s, v25.4s
          SCVTF   v26.4s, v26.4s
          SCVTF   v27.4s, v27.4s
          SCVTF   v28.4s, v28.4s
          SCVTF   v29.4s, v29.4s
          SCVTF   v30.4s, v30.4s
          SCVTF   v31.4s, v31.4s

          FMUL    v16.4s, v16.4s, v4.4s
          FMUL    v17.4s, v17.4s, v4.4s
          FMUL    v18.4s, v18.4s, v4.4s
          FMUL    v19.4s, v19.4s, v4.4s
          FMUL    v20.4s, v20.4s, v4.4s
          FMUL    v21.4s, v21.4s, v4.4s
          FMUL    v22.4s, v22.4s, v4.4s
          FMUL    v23.4s, v23.4s, v4.4s
          FMUL    v24.4s, v24.4s, v4.4s
          FMUL    v25.4s, v25.4s, v4.4s
          FMUL    v26.4s, v26.4s, v4.4s
          FMUL    v27.4s, v27.4s, v4.4s
          FMUL    v28.4s, v28.4s, v4.4s
          FMUL    v29.4s, v29.4s, v4.4s
          FMUL    v30.4s, v30.4s, v4.4s
          FMUL    v31.4s, v31.4s, v4.4s

          FCVTNS  v16.4s, v16.4s
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s
          FCVTNS  v24.4s, v24.4s
          FCVTNS  v25.4s, v25.4s
          FCVTNS  v26.4s, v26.4s
          FCVTNS  v27.4s, v27.4s
          FCVTNS  v28.4s, v28.4s
          FCVTNS  v29.4s, v29.4s
          FCVTNS  v30.4s, v30.4s
          FCVTNS  v31.4s, v31.4s

        // Common tail: saturating narrow 32 -> 16 bits, add the 16-bit bias,
        // saturating narrow to unsigned 8 bits, then clamp to [min, max].
        // Columns 0-7 of row r end up in v16+r and columns 8-15 in v24+r.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h
        LDR     x0, [sp, 48]            // Load cn_stride

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, ${REWIND_DECREMENT}            // rewind params pointer
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO and B.HI below
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f                      // fewer than 16 columns left: partial store

        # Store full 4 x 16
        // Each C pointer is post-incremented by cn_stride (x0).
        ST1     {v3.16b},  [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b},  [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // ST1 and SUB leave the flags alone, so this still tests SUBS x1, x1, 16.
        B.HI    0b

        # Restore d8,d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDR     d8, [sp], 48
        RET

        # Remainder- 4 to 12 bytes of A
        // Reached with the leftover K byte count in the low 4 bits of x0:
        // bit 3 selects the 8 byte step below, bit 2 the 4 byte step at label 5.
        .p2align 3
4:
        # Is there a remainder?- 8 bytes of A
        TBZ     x0, 3, 5f

        // The d-register loads clear the upper 64 bits of v0-v3, so the
        // full-width zero point UDOTs below add nothing for the missing bytes.
        LDR     d0, [x13], 8
        LDR     q4,  [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDR     q5,  [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b    // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]
        UDOT    v16.4s, v4.16b, v0.4b[1]
        UDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        UDOT    v18.4s, v4.16b, v2.4b[1]
        UDOT    v19.4s, v4.16b, v3.4b[1]
        UDOT    v20.4s, v5.16b, v0.4b[1]
        UDOT    v21.4s, v5.16b, v1.4b[1]
        UDOT    v22.4s, v5.16b, v2.4b[1]
        UDOT    v23.4s, v5.16b, v3.4b[1]
        UDOT    v24.4s, v6.16b, v0.4b[1]
        UDOT    v25.4s, v6.16b, v1.4b[1]
        UDOT    v26.4s, v6.16b, v2.4b[1]
        UDOT    v27.4s, v6.16b, v3.4b[1]
        UDOT    v28.4s, v7.16b, v0.4b[1]
        UDOT    v29.4s, v7.16b, v1.4b[1]
        UDOT    v30.4s, v7.16b, v2.4b[1]
        UDOT    v31.4s, v7.16b, v3.4b[1]
        # Is there a remainder?- 4 bytes of A
        TBZ     x0, 2, 3b

        # Remainder- 4 bytes of A
        // The s-register loads clear the upper 96 bits of v0-v3.
5:
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16

        UDOT    v12.4s, v8.16b, v0.16b    // update zero point
        UDOT    v13.4s, v8.16b, v1.16b
        UDOT    v14.4s, v8.16b, v2.16b
        UDOT    v15.4s, v8.16b, v3.16b

        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        LDP     q6, q7, [x5], 32
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v24.4s, v6.16b, v0.4b[0]
        UDOT    v25.4s, v6.16b, v1.4b[0]
        UDOT    v26.4s, v6.16b, v2.4b[0]
        UDOT    v27.4s, v6.16b, v3.4b[0]
        UDOT    v28.4s, v7.16b, v0.4b[0]
        UDOT    v29.4s, v7.16b, v1.4b[0]
        UDOT    v30.4s, v7.16b, v2.4b[0]
        UDOT    v31.4s, v7.16b, v3.4b[0]
        B       3b

        # Store odd width
        // x1 is negative here (nc - 16); its low 4 bits equal the number of
        // columns still to store. Bits 3, 2, 1 and 0 select 8, 4, 2 and 1 byte
        // stores; after each store the remaining bytes are moved down to lane 0.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        # Restore d8,d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDR     d8, [sp], 48
        RET

END_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif