// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# Overview, as implemented below
#   Each pass of the outer loop (label 0) produces one tile of 4 rows by
#   16 columns of int8 output:
#     Label 0 loads 16 int32 bias values from w and copies them into the
#       accumulators of all 4 rows.
#     Label 1 consumes 16 bytes of every A row and 256 bytes of w per
#       iteration using SDOT (4 int8 products summed into each int32 lane).
#     Labels 3 and 4 handle a remaining 8 and/or 4 bytes of A.
#     Label 2 converts the int32 sums to int8 using the params fields and
#       stores 16 bytes per row, or branches to label 5 to store a partial
#       tile of fewer than 16 columns.
#   After mr has been used for pointer clamping, x0 is reused as the
#   remaining k counter (hence "x2 / x0" in the kc row above).
#   kc is rounded up to a multiple of 4, so A is always read in whole
#   4 byte groups.

# params structure is 8 bytes
#  struct {
#    float scale;
#    int16_t output_zero_point;
#    int8_t output_min;
#    int8_t output_max;
#  } fp32_neonv8;

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1 x15  v1
# A2 x13  v2
# A3  x4  v3
# B   x5  v4  v5  v6  v7
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

        # Clamp A and C pointers
        # Rows at or beyond mr reuse the pointers of the previous row, so they
        # load the same A data and store the same bytes to the same C address.
        # The flags from CMP x0, 2 feed both the LO (mr < 2) and the LS
        # (mr <= 2) selects below. ADD and BIC do not modify the flags.
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        BIC     x2, x2, 3               // completes the round-up of kc

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1

        LDP     x12, x11, [sp]          // cn_stride, params

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2

        # Outer loop - one 4 x 16 output tile per pass
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # The 64 bytes read here fill row 0 (v16 v20 v24 v28). The MOVs copy
        # them to rows 1-3.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 16 bytes?
        B.LO    3f                      // kc < 16: remainder code only

        # Main loop - 16 bytes of A
        # Each group of 16 SDOTs below multiplies one 4 byte element of every
        # A row (the index in v0-v3) by 64 bytes of B held in v4-v7. The B
        # loads for the next group are interleaved with the current SDOTs.
        .p2align 3
1:
        LDR     q0, [x3], 16
        LDR     q4, [x5], 16
        LDR     q1, [x15], 16
        LDR     q2, [x13], 16
        LDR     q3, [x4], 16
        LDR     q5, [x5], 16
        # A bytes 0-3 of each row
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]

        # A bytes 4-7 of each row
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SDOT    v31.4s, v7.16b, v3.4b[1]

        # A bytes 8-11 of each row
        SDOT    v16.4s, v4.16b, v0.4b[2]
        SDOT    v17.4s, v4.16b, v1.4b[2]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[2]
        SDOT    v19.4s, v4.16b, v3.4b[2]
        SDOT    v20.4s, v5.16b, v0.4b[2]
        SDOT    v21.4s, v5.16b, v1.4b[2]
        SDOT    v22.4s, v5.16b, v2.4b[2]
        SDOT    v23.4s, v5.16b, v3.4b[2]
        SDOT    v24.4s, v6.16b, v0.4b[2]
        SDOT    v25.4s, v6.16b, v1.4b[2]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[2]
        SDOT    v27.4s, v6.16b, v3.4b[2]
        SDOT    v28.4s, v7.16b, v0.4b[2]
        SDOT    v29.4s, v7.16b, v1.4b[2]
        SDOT    v30.4s, v7.16b, v2.4b[2]
        SDOT    v31.4s, v7.16b, v3.4b[2]

        # A bytes 12-15 of each row
        SDOT    v16.4s, v4.16b, v0.4b[3]
        SDOT    v17.4s, v4.16b, v1.4b[3]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[3]
        SDOT    v19.4s, v4.16b, v3.4b[3]
        SDOT    v20.4s, v5.16b, v0.4b[3]
        SDOT    v21.4s, v5.16b, v1.4b[3]
        SDOT    v22.4s, v5.16b, v2.4b[3]
        SDOT    v23.4s, v5.16b, v3.4b[3]
        SDOT    v24.4s, v6.16b, v0.4b[3]
        SDOT    v25.4s, v6.16b, v1.4b[3]
        SDOT    v26.4s, v6.16b, v2.4b[3]
        SDOT    v27.4s, v6.16b, v3.4b[3]
        SUBS    x0, x0, 16              // k -= 16, sets flags for B.HS below
        SDOT    v28.4s, v7.16b, v0.4b[3]
        SDOT    v29.4s, v7.16b, v1.4b[3]
        SDOT    v30.4s, v7.16b, v2.4b[3]
        SDOT    v31.4s, v7.16b, v3.4b[3]
        B.HS    1b                      // loop while 16 more bytes remain

        # Is there a remainder?- 4 to 12 bytes of A
        # Here x0 has gone below zero by 16 minus the bytes still left, so
        # its low 4 bits equal the number of bytes of A not yet consumed.
        TST     x0, 15
        B.NE    3f

        # Requantize the int32 accumulators to int8 and store the tile
2:
        SCVTF   v16.4s, v16.4s          // int32 -> float
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, bias and clamp
        LD1R    {v4.4s}, [x11], 4       // scale, broadcast to all lanes
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        FMUL    v16.4s, v16.4s, v4.4s   // multiply by scale
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s          // float -> int32, round to nearest
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        # Saturating narrow to int16. After the SQXTN2 step v16-v19 hold
        # columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // output_zero_point

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h   // add output_zero_point, saturating
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value (output_min)

        # Saturating narrow to int8. v0-v3 end up holding the 16 output
        # bytes of rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value (output_max)
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 7             // rewind params pointer (4 + 2 + 1)

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16, flags used by B.LO and B.NE
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns were left

        # Store full 4 x 16
        # Each C pointer advances by cn_stride and each A pointer is rewound
        # by the rounded kc so the next tile rereads the same A rows.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // flags still from SUBS x1: nc != 0
        RET

        # Remainder- 8 bytes of A
        .p2align 3
3:
        # Is there a remainder?- 8 bytes of A
        TBZ     x0, 3, 4f               // bit 3 clear: only 4 bytes remain

        LDR     d0, [x3], 8
        LDR     q4, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SDOT    v31.4s, v7.16b, v3.4b[1]
        # Is there a remainder?- 4 bytes of A
        TBZ     x0, 2, 2b               // bit 2 clear: done, requantize

        # Remainder- 4 bytes of A
4:
        LDR     s0, [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        SDOT    v16.4s, v4.16b, v0.4b[0]
        LDR     q5, [x5], 16
        SDOT    v17.4s, v4.16b, v1.4b[0]
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        B       2b

        # Store odd width
        # Reached with x1 = nc - 16 (below zero). Its low 4 bits equal the
        # number of columns still to store, so bits 3, 2, 1 and 0 select
        # stores of 8, 4, 2 and 1 bytes per row. After each partial store the
        # DUP moves the next unstored bytes down to the bottom of v0-v3.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
9:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif