// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t** restrict a, x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x20  v3
# B    x5  v4  v5
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15

// Overview (derived from the instructions below).
//
// Produces an output tile of up to 4 rows by 16 int8 columns per pass of the
// nc loop (label 0). One pass does the following:
//   - Seeds four rows of int32 accumulators from 16 int32 values read from w.
//     For row r the accumulators v(16+r), v(20+r), v(24+r), v(28+r) hold
//     columns 0-3, 4-7, 8-11 and 12-15.
//   - ks loop (label 1): reads 4 row pointers from a (32 bytes per iteration,
//     which is why ks is decremented by 32). A pointer equal to zero is used
//     unchanged, any other pointer has a_offset added.
//   - kc loop (label 2, remainder at label 4): each byte of A is paired with
//     16 int8 weights from w; both are sign-extended to 16 bits and combined
//     with SMLAL/SMLAL2 by-lane multiply-accumulate.
//   - Requantization: accumulators are converted to float, multiplied by 16
//     per-column floats read from w, rounded back to int32 with FCVTNS,
//     narrowed with saturation to int16, offset by a 16-bit value from
//     params, narrowed with saturation to int8 and clamped to the min and max
//     bytes from params.
//   - Stores 16 bytes per row and advances each C pointer by cn_stride, or
//     stores the 1 to 15 remaining columns at label 5 and returns.
//
// Layout of w as consumed here, per group of 16 columns:
//   16 x int32 bias, then 16 x int8 for every byte of kc in every ks step,
//   then 16 x float scale.
// Layout of params as consumed here (x11):
//   +0 int16 added after the first narrowing, +2 int8 min, +3 int8 max.
//
// v6 is used only during requantization (third scale vector, then the 16-bit
// offset). v4/v5 are reused there for scales and then for the min/max bytes,
// and v0-v3 are reused for the int8 results of rows 0-3.

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64

        # Clamp C pointers
        // Rows at or beyond mr alias the previous row so their stores land
        // on valid memory. Both stack-argument loads are issued before the
        // pre-indexed STR below moves sp, so their offsets are relative to
        // the incoming sp.
        CMP     x0, 2                   // if mr < 2
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        STR     x20, [sp, -16]!         // Save x20 on stack
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2


        .p2align 3
0:
        # Load initial bias from w into accumulators
        // The 16 bias values are shared by all four rows, so rows 1-3 are
        // copies of row 0.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x20, [x4], 16

        // For each row: the ADD is computed unconditionally and the CSEL
        // discards it when the pointer equals zero.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x20, x12                // if a3 == zero
        ADD     x20, x20, x8            // a3 += a_offset
        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 8 bytes for main loop?
        // x0 (mr) is dead after the clamping above and is reused as the
        // kc counter from here on.
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    4f

        # Main loop - 8 bytes of A
        // Each iteration consumes 8 bytes from every row and 8 x 16 bytes of
        // weights. Lane h[i] of v0-v3 is byte i of rows 0-3 after widening.
        .p2align 3
2:
        LD1     {v0.8b}, [x13], 8
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x14], 8
        LD1     {v2.8b}, [x15], 8
        LD1     {v3.8b}, [x20], 8
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        SUBS    x0, x0, 8               // loop while at least 8 bytes remain
        B.HS    2b

        AND     x0, x2, 7               // kc remainder 0 to 7
        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 4f

3:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Requantization. The four LDR loads fetch the 16 per-column float
        // scales that follow the weights in w; v4 is reloaded with the fourth
        // scale vector once its last use for columns 0-3 has been issued.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        LDR     q6, [x5], 16            // scales for columns 8-11
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16            // scales for columns 12-15
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // Float to int32, rounding to nearest with ties to even.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow int32 -> int16. After the SQXTN2 group, v16-v19
        // hold columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        // Saturating add of the 16-bit params value loaded above.
        // NOTE(review): presumably the output zero point - confirm against
        // the params struct definition.
        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> int8; v0-v3 become the 16 output bytes
        // of rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 3             // rewind params pointer (2 + 1 bytes of post-increment)

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags feed both B.LO and B.HI below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns left

        # Store full 4 x 16
        // Each C pointer is post-incremented by cn_stride (x10).
        ST1     {v3.16b}, [x7], x10
        ST1     {v2.16b}, [x17], x10
        ST1     {v1.16b}, [x16], x10
        ST1     {v0.16b}, [x6], x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // Still the flags from SUBS x1 above: the stores and the SUB do not
        // write the condition flags.
        B.HI    0b

        # Restore x20 from stack
        LDR     x20, [sp], 16
        RET

        # Remainder- 1 to 7 bytes of A
        // Each LD1 below reads a full 8 bytes from the row but advances the
        // pointer by the remainder only, so up to 7 bytes past the consumed
        // data are read and left unused.
        // NOTE(review): presumably the caller guarantees these bytes are
        // readable - confirm.
        // Weights are loaded 16 bytes at a time only for the bytes of A that
        // are actually consumed.
        .p2align 3
4:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b}, [x13], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x14], x0
        LD1     {v2.8b}, [x15], x0
        LD1     {v3.8b}, [x20], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    3b                      // remainder == 1

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    3b                      // remainder == 2 (flags from CMP x0, 2)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    3b                      // remainder == 3

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    3b                      // remainder == 4 (flags from CMP x0, 4)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    3b                      // remainder == 5

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    3b                      // remainder == 6 (flags from CMP x0, 6)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       3b                      // remainder == 7

        # Store odd width
        // x1 holds nc - 16 here. Subtracting 16 leaves the low four bits
        // unchanged, so bits 3..0 of x1 select stores of 8, 4, 2 and 1
        // columns. After each partial store the remaining bytes are moved
        // down to lane 0 with DUP so the next store starts at the right
        // column.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
9:
        # Restore x20 from stack
        LDR     x20, [sp], 16
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif