// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// Signed 8-bit indirect GEMM micro-kernel producing an output tile of up to
// 4 rows by 16 columns per pass. Accumulation is done in 32-bit integer lanes
// with SDOT (4 bytes of K per lane per instruction); the result is converted
// to float, scaled, rounded back to integer, narrowed with saturation to
// int8, offset, and clamped to [min, max] before being stored.
//
// Arguments and the registers they live in (second register = reuse later on):

# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// None of those registers are touched below, so nothing is saved or restored.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x10  v3
# B    x5  v4  v5  v6  v7
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused  v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

        # Clamp C pointers
        // Rows beyond mr alias the previous row pointer, so the kernel can
        // always compute and store 4 rows without writing past the tile.
        // Argument loads and the kc round-up are interleaved with the clamp.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3 (completed by BIC below)

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // kc is now a multiple of 4

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 64 bytes = 16 x 32-bit values, one per output column; the same
        // values are copied into the accumulators of all 4 rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        // A pointer equal to `zero` is used as-is; any other pointer is
        // advanced by a_offset. The ADD is done unconditionally and CSEL
        // discards its result when the pointer matched `zero`.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset

        # Is there at least 16 bytes for main loop?
        // x0 (mr) is no longer needed and becomes the k counter.
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f

        # Main loop - 16 bytes of A
        // Each iteration consumes 16 bytes from each of the 4 A rows and
        // 256 bytes of B (4 groups of 4 K-bytes x 16 columns). B loads are
        // interleaved with the SDOT chains; v4/v5 and v6/v7 are reloaded
        // alternately once their last use has been issued.
        .p2align 3
2:
        LDR     q0, [x13], 16
        LDR     q4, [x5], 16
        LDR     q1, [x14], 16
        LDR     q2, [x15], 16
        LDR     q3, [x10], 16
        LDR     q5, [x5], 16
        // K bytes 0-3
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]

        // K bytes 4-7
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SDOT    v31.4s, v7.16b, v3.4b[1]

        // K bytes 8-11
        SDOT    v16.4s, v4.16b, v0.4b[2]
        SDOT    v17.4s, v4.16b, v1.4b[2]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[2]
        SDOT    v19.4s, v4.16b, v3.4b[2]
        SDOT    v20.4s, v5.16b, v0.4b[2]
        SDOT    v21.4s, v5.16b, v1.4b[2]
        SDOT    v22.4s, v5.16b, v2.4b[2]
        SDOT    v23.4s, v5.16b, v3.4b[2]
        SDOT    v24.4s, v6.16b, v0.4b[2]
        SDOT    v25.4s, v6.16b, v1.4b[2]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[2]
        SDOT    v27.4s, v6.16b, v3.4b[2]
        SDOT    v28.4s, v7.16b, v0.4b[2]
        SDOT    v29.4s, v7.16b, v1.4b[2]
        SDOT    v30.4s, v7.16b, v2.4b[2]
        SDOT    v31.4s, v7.16b, v3.4b[2]

        // K bytes 12-15
        SDOT    v16.4s, v4.16b, v0.4b[3]
        SDOT    v17.4s, v4.16b, v1.4b[3]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[3]
        SDOT    v19.4s, v4.16b, v3.4b[3]
        SDOT    v20.4s, v5.16b, v0.4b[3]
        SDOT    v21.4s, v5.16b, v1.4b[3]
        SDOT    v22.4s, v5.16b, v2.4b[3]
        SDOT    v23.4s, v5.16b, v3.4b[3]
        SDOT    v24.4s, v6.16b, v0.4b[3]
        SDOT    v25.4s, v6.16b, v1.4b[3]
        SDOT    v26.4s, v6.16b, v2.4b[3]
        SDOT    v27.4s, v6.16b, v3.4b[3]
        SUBS    x0, x0, 16              // k -= 16; sets the flags used by B.HS below
        SDOT    v28.4s, v7.16b, v0.4b[3]
        SDOT    v29.4s, v7.16b, v1.4b[3]
        SDOT    v30.4s, v7.16b, v2.4b[3]
        SDOT    v31.4s, v7.16b, v3.4b[3]
        B.HS    2b                      // loop while the subtraction did not borrow

        # Is there a remainder?- 4 to 12 bytes of A
        // kc was rounded to a multiple of 4 and only multiples of 16 have
        // been subtracted, so the low 4 bits of x0 are the bytes left over.
        TST     x0, 15
        B.NE    4f

3:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Requantization: int32 -> float, multiply by scale, round back to
        // int32, then saturating narrow to int16 and int8.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, bias and clamp
        LD1R    {v4.4s}, [x11], 4       // broadcast 32-bit scale, params += 4
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // Float -> int32, rounding to nearest with ties to even.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Pack columns 0-3 / 4-7 into v16-v19 and columns 8-11 / 12-15 into
        // v24-v27 as saturated int16.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // broadcast 16-bit offset added below, params += 2
                                        // NOTE(review): presumably the output zero point - confirm against the params struct

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value, params += 1

        // Final narrow to int8: v0-v3 hold the 16 output bytes of rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value (no post-increment)
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        LDR     x0, [sp]                // cn_stride (x0 is free again after the k loop)

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, 7             // rewind params pointer (4 + 2 + 1 bytes advanced above)
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags consumed by B.LO and B.HI below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f                      // fewer than 16 columns left: partial store

        # Store full 4 x 16
        // Each C pointer is post-incremented by cn_stride. Rows are written
        // from C3 down to C0.
        ST1     {v3.16b}, [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b}, [x6], x0

        SUB     x4, x4, x3              // a -= ks (rewind the A pointer list for the next column block)

        # nc loop
        // Flags are still those of SUBS x1 above; the vector ops, stores and
        // the plain SUB in between do not modify them.
        B.HI    0b
        RET

        # Remainder- 8 bytes of A
        .p2align 3
4:
        # Is there a remainder?- 8 bytes of A
        TBZ     x0, 3, 5f               // bit 3 clear: only 4 bytes remain

        LDR     d0, [x13], 8
        LDR     q4, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SDOT    v31.4s, v7.16b, v3.4b[1]
        # Is there a remainder?- 4 bytes of A
        TBZ     x0, 2, 3b               // bit 2 clear: done, back to the ks loop

        # Remainder- 4 bytes of A
5:
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        B       3b

        # Store odd width
        // x1 went negative in SUBS x1, x1, 16, but its low 4 bits still equal
        // the number of columns left (1-15). Each set bit stores that many
        // bytes per row, then DUP moves the next unstored lanes down to lane 0
        // for the following, smaller store.
        .p2align 3
6:
        TBZ     x1, 3, 7f               // 8 columns
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f               // 4 columns
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f               // 2 columns
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f              // 1 column
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        RET

END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif