// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]
$assert not CHANNELWISE or REQUANTIZATION == "FP32"

#include <xnnpack/assembly.h>

$DATATYPE = "qc8" if CHANNELWISE else "qs8"
$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
$REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
# Params rewind for this variant: the kernel walks the params pointer with
# post-incrementing loads and subtracts the total again on every pass over nc.
#   channelwise:  2 + 1             =  3 bytes
#   FP32:         4 + 2 + 1         =  7 bytes
#   RNDNU:        4 + 4 + 4 + 2 + 1 = 15 bytes
#
# void xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128(
#     size_t mr,                          x0
#     size_t nc,                          x1
#     size_t kc,                          x2 / x0
#     size_t ks,                          x3 / x9
#     const int8_t** restrict a,          x4
#     const int8_t* restrict w,           x5
#     int8_t* restrict c,                 x6
#     size_t cm_stride,                   x7
#     size_t cn_stride,                   [sp] -> (x0)
#     size_t a_offset,                    [sp + 8] -> x8
#     const int8_t* zero,                 [sp + 16] -> x12
#     const union ${PARAMS_UNION} params  [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Register allocation
#   role     pointer   vector registers
#   A0       x13       v0
#   A1       x14       v1
#   A2       x15       v2
#   A3       x10       v3
#   B        x5        v4  v5  v6  v7
#   C0       x6        v16 v20 v24 v28
#   C1       x16       v17 v21 v25 v29
#   C2       x17       v18 v22 v26 v30
#   C3       x7        v19 v23 v27 v31
#   unused             v8  v9  v10 v11 v12 v13 v14 v15
#
# Scalar scratch: x0 is mr on entry, then the k counter, then cn_stride;
# x9 is the ks countdown; x8 = a_offset, x12 = zero, x11 = params.

BEGIN_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128

        # Derive the four output row pointers. A row index that is not below mr
        # reuses the pointer of the row before it. The stack arguments are
        # fetched and kc is rounded up to a multiple of 4 along the way.
        CMP     x0, 2                   // compare mr with 2
        LDR     x8, [sp, 8]             // x8 = a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        // mr < 2:  c1 = c0
        ADD     x2, x2, 3               // kc + 3 ...

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // x12 = zero, x11 = params
        CSEL    x17, x16, x17, LS       // mr <= 2: c2 = c1
        BIC     x2, x2, 3               // ... & ~3: kc is now a multiple of 4

        CMP     x0, 4                   // compare mr with 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         // mr < 4:  c3 = c2

        .p2align 3
0:
        # nc loop entry: read 16 bias words from w and copy them into the
        # accumulators of all four rows
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # ks loop entry: fetch the next four A row pointers from the
        # indirection buffer
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        # A pointer equal to `zero` is used as is; any other pointer gets
        # a_offset added
        CMP     x13, x12                // a0 == zero ?
        ADD     x13, x13, x8            // a0 + a_offset
        CSEL    x13, x12, x13, EQ       // a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x14, x12                // a1 == zero ?
        ADD     x14, x14, x8            // a1 + a_offset
        CSEL    x14, x12, x14, EQ       // a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x15, x12                // a2 == zero ?
        ADD     x15, x15, x8            // a2 + a_offset
        CSEL    x15, x12, x15, EQ       // a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x10, x12                // a3 == zero ?
        ADD     x10, x10, x8            // a3 + a_offset
        CSEL    x10, x12, x10, EQ       // a3 = (a3 == zero) ? zero : a3 + a_offset

        # Skip the main loop when fewer than 16 bytes of A are available
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        # Main loop: each pass consumes 16 bytes of every A row and 256 bytes of B
        .p2align 3
2:
        LDR     q0, [x13], 16
        LDR     q4, [x5], 16
        LDR     q1, [x14], 16
        LDR     q2, [x15], 16
        LDR     q3, [x10], 16
        LDR     q5, [x5], 16
        # A bytes 0-3 (lane 0)
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]

        # A bytes 4-7 (lane 1)
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SDOT    v31.4s, v7.16b, v3.4b[1]

        # A bytes 8-11 (lane 2)
        SDOT    v16.4s, v4.16b, v0.4b[2]
        SDOT    v17.4s, v4.16b, v1.4b[2]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[2]
        SDOT    v19.4s, v4.16b, v3.4b[2]
        SDOT    v20.4s, v5.16b, v0.4b[2]
        SDOT    v21.4s, v5.16b, v1.4b[2]
        SDOT    v22.4s, v5.16b, v2.4b[2]
        SDOT    v23.4s, v5.16b, v3.4b[2]
        SDOT    v24.4s, v6.16b, v0.4b[2]
        SDOT    v25.4s, v6.16b, v1.4b[2]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[2]
        SDOT    v27.4s, v6.16b, v3.4b[2]
        SDOT    v28.4s, v7.16b, v0.4b[2]
        SDOT    v29.4s, v7.16b, v1.4b[2]
        SDOT    v30.4s, v7.16b, v2.4b[2]
        SDOT    v31.4s, v7.16b, v3.4b[2]

        # A bytes 12-15 (lane 3); the k counter is updated in the middle
        SDOT    v16.4s, v4.16b, v0.4b[3]
        SDOT    v17.4s, v4.16b, v1.4b[3]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[3]
        SDOT    v19.4s, v4.16b, v3.4b[3]
        SDOT    v20.4s, v5.16b, v0.4b[3]
        SDOT    v21.4s, v5.16b, v1.4b[3]
        SDOT    v22.4s, v5.16b, v2.4b[3]
        SDOT    v23.4s, v5.16b, v3.4b[3]
        SDOT    v24.4s, v6.16b, v0.4b[3]
        SDOT    v25.4s, v6.16b, v1.4b[3]
        SDOT    v26.4s, v6.16b, v2.4b[3]
        SDOT    v27.4s, v6.16b, v3.4b[3]
        SUBS    x0, x0, 16              // k -= 16
        SDOT    v28.4s, v7.16b, v0.4b[3]
        SDOT    v29.4s, v7.16b, v1.4b[3]
        SDOT    v30.4s, v7.16b, v2.4b[3]
        SDOT    v31.4s, v7.16b, v3.4b[3]
        B.HS    2b                      // another full 16 bytes available

        # Low 4 bits of k non-zero: 4, 8 or 12 bytes of A are left over
        TST     x0, 15
        B.NE    4f

3:
        # ks loop: one group of 4 A pointers has been consumed
        SUBS    x9, x9, 32              // p -= MR * sizeof(int8_t*)
        B.HI    1b

        $if REQUANTIZATION == "RNDNU":
          # Requantize from params: pre-shift (v4), multiplier (v5),
          # post-shift (v6); bias and clamp follow below
          LD1R    {v4.4s}, [x11], 4
          SQSHL   v16.4s, v16.4s, v4.4s   // saturating shift into the upper bits
          SQSHL   v17.4s, v17.4s, v4.4s
          SQSHL   v18.4s, v18.4s, v4.4s
          SQSHL   v19.4s, v19.4s, v4.4s
          SQSHL   v20.4s, v20.4s, v4.4s
          SQSHL   v21.4s, v21.4s, v4.4s
          SQSHL   v22.4s, v22.4s, v4.4s
          SQSHL   v23.4s, v23.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4
          SQSHL   v24.4s, v24.4s, v4.4s
          SQSHL   v25.4s, v25.4s, v4.4s
          SQSHL   v26.4s, v26.4s, v4.4s
          SQSHL   v27.4s, v27.4s, v4.4s
          SQSHL   v28.4s, v28.4s, v4.4s
          SQSHL   v29.4s, v29.4s, v4.4s
          SQSHL   v30.4s, v30.4s, v4.4s
          SQSHL   v31.4s, v31.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4
          SQDMULH v16.4s, v16.4s, v5.4s   // doubling multiply-high, no rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SQDMULH v24.4s, v24.4s, v5.4s
          SQDMULH v25.4s, v25.4s, v5.4s
          SQDMULH v26.4s, v26.4s, v5.4s
          SQDMULH v27.4s, v27.4s, v5.4s
          SQDMULH v28.4s, v28.4s, v5.4s
          SQDMULH v29.4s, v29.4s, v5.4s
          SQDMULH v30.4s, v30.4s, v5.4s
          SQDMULH v31.4s, v31.4s, v5.4s
          SRSHL   v16.4s, v16.4s, v6.4s   // rounding shift by the signed amount in v6
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
          SRSHL   v24.4s, v24.4s, v6.4s
          SRSHL   v25.4s, v25.4s, v6.4s
          SRSHL   v26.4s, v26.4s, v6.4s
          SRSHL   v27.4s, v27.4s, v6.4s
          SRSHL   v28.4s, v28.4s, v6.4s
          SRSHL   v29.4s, v29.4s, v6.4s
          SRSHL   v30.4s, v30.4s, v6.4s
          SRSHL   v31.4s, v31.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          SCVTF   v16.4s, v16.4s
          SCVTF   v17.4s, v17.4s
          $if not CHANNELWISE:
            # One scale for all channels, broadcast from params
            LD1R    {v4.4s}, [x11], 4
            SCVTF   v18.4s, v18.4s
            SCVTF   v19.4s, v19.4s
          $else:
            # Per-channel scales are read from the weights stream
            LDR     q4, [x5], 16
            SCVTF   v18.4s, v18.4s
            SCVTF   v19.4s, v19.4s
            LDR     q5, [x5], 16
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s
          SCVTF   v24.4s, v24.4s
          SCVTF   v25.4s, v25.4s
          SCVTF   v26.4s, v26.4s
          SCVTF   v27.4s, v27.4s
          SCVTF   v28.4s, v28.4s
          SCVTF   v29.4s, v29.4s
          SCVTF   v30.4s, v30.4s
          SCVTF   v31.4s, v31.4s

          $if CHANNELWISE:
            LDR     q6, [x5], 16
            FMUL    v16.4s, v16.4s, v4.4s
            FMUL    v17.4s, v17.4s, v4.4s
            FMUL    v18.4s, v18.4s, v4.4s
            FMUL    v19.4s, v19.4s, v4.4s
            FMUL    v20.4s, v20.4s, v5.4s
            LDR     q4, [x5], 16            // v4 reloaded with the scales for v28-v31
            FMUL    v21.4s, v21.4s, v5.4s
            FMUL    v22.4s, v22.4s, v5.4s
            FMUL    v23.4s, v23.4s, v5.4s
            FMUL    v24.4s, v24.4s, v6.4s
            FMUL    v25.4s, v25.4s, v6.4s
            FMUL    v26.4s, v26.4s, v6.4s
            FMUL    v27.4s, v27.4s, v6.4s
            FMUL    v28.4s, v28.4s, v4.4s
            FMUL    v29.4s, v29.4s, v4.4s
            FMUL    v30.4s, v30.4s, v4.4s
            FMUL    v31.4s, v31.4s, v4.4s
          $else:
            FMUL    v16.4s, v16.4s, v4.4s
            FMUL    v17.4s, v17.4s, v4.4s
            FMUL    v18.4s, v18.4s, v4.4s
            FMUL    v19.4s, v19.4s, v4.4s
            FMUL    v20.4s, v20.4s, v4.4s
            FMUL    v21.4s, v21.4s, v4.4s
            FMUL    v22.4s, v22.4s, v4.4s
            FMUL    v23.4s, v23.4s, v4.4s
            FMUL    v24.4s, v24.4s, v4.4s
            FMUL    v25.4s, v25.4s, v4.4s
            FMUL    v26.4s, v26.4s, v4.4s
            FMUL    v27.4s, v27.4s, v4.4s
            FMUL    v28.4s, v28.4s, v4.4s
            FMUL    v29.4s, v29.4s, v4.4s
            FMUL    v30.4s, v30.4s, v4.4s
            FMUL    v31.4s, v31.4s, v4.4s

          FCVTNS  v16.4s, v16.4s
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s
          FCVTNS  v24.4s, v24.4s
          FCVTNS  v25.4s, v25.4s
          FCVTNS  v26.4s, v26.4s
          FCVTNS  v27.4s, v27.4s
          FCVTNS  v28.4s, v28.4s
          FCVTNS  v29.4s, v29.4s
          FCVTNS  v30.4s, v30.4s
          FCVTNS  v31.4s, v31.4s

        # Saturating narrow 32 -> 16 bits, low halves first
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // 16-bit bias, broadcast

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // lower clamp bound

        # Saturating narrow 16 -> 8 bits: one 16-byte vector per output row
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // upper clamp bound (no post-increment)
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        LDR     x0, [sp]                // x0 = cn_stride

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, ${REWIND_DECREMENT}             // params pointer back to its start
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO and B.HI below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f                      // fewer than 16 columns left

        # Full-width store: 16 bytes per row, pointers advance by cn_stride
        ST1     {v3.16b}, [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b}, [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop: continue while columns remain
        B.HI    0b
        RET

        # Leftover K handling, entered with k in x0
        .p2align 3
4:
        # Bit 3 of k clear: no 8-byte piece, go straight to the 4-byte piece
        TBZ     x0, 3, 5f

        LDR     d0, [x13], 8
        LDR     q4, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SDOT    v31.4s, v7.16b, v3.4b[1]
        # Bit 2 of k clear: no 4-byte piece, return to the ks loop
        TBZ     x0, 2, 3b

        # Final 4 bytes of A
5:
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        B       3b

        # Partial-width store: write 8, 4, 2 and 1 columns as selected by the
        # low bits of nc, moving the next piece into the low lanes after each
        # store
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        RET

END_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif