// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> x11

// "x2 / x0": kc stays in x2 (rounded up to a multiple of 8); x0 is reused as
// the running k counter once mr has been consumed by the C-pointer clamp.
// "x3 / x9": ks stays in x3; x9 is the per-tile countdown copy (p).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0  v6
# A1  x15  v1  v7
# B    x5  v4  v5  v8  v9
# C0   x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1   x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data

// Overview (derived from the code below):
//   * Each pass of the nc loop (label 0) produces a 2-row x 8-column tile of
//     int8 output: row 0 is stored through x6 (c0), row 1 through x7 (c1).
//   * Data read through x5 for one tile, in order:
//       - 8 x 32-bit bias values (one per output column),
//       - for every A-pointer pair of the ks loop: 128-byte panels per 16
//         bytes of K, plus one 64-byte panel when kc has an 8-byte remainder,
//       - 8 x 32-bit float per-channel scales.
//   * 128-byte panel layout as consumed here: bytes [8*c, 8*c + 8) hold column
//     c for the first 8 K bytes, bytes [64 + 8*c, 64 + 8*c + 8) hold column c
//     for the next 8 K bytes.
//   * Accumulator pair for column c: v(16 + 2*c) for row 0, v(17 + 2*c) for
//     row 1. Each holds 4 partial sums that are reduced at label 4.

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

        # Clamp C pointers
        // The stack-argument loads, the c1 clamp, the kc round-up and the
        // callee-saved register spills are interleaved. None of the
        // instructions between CMP and CSEL modify the condition flags.
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d8, d9, [sp, -80]!      // Allocate 80 bytes of stack, save d8,d9
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        STP     d10, d11, [sp, 16]
        CSEL    x7, x6, x7, LO          //   c1 = c0
        STP     d12, d13, [sp, 32]
        BIC     x2, x2, 7               // second half of the kc round-up
        STP     d14, d15, [sp, 48]
        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack

        // nc loop entry: one pass per 8 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        // Each scalar load writes lane 0 of its accumulator and zeroes the
        // remaining lanes; the MOVs copy the row 0 bias into the row 1
        // accumulators.
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks

        // ks loop entry: one pass per pair of A row pointers.
        .p2align 3
1:
        # Load next 2 A pointers
        // A pointer equal to `zero` is used as-is; any other pointer gets
        // a_offset added.
        LDP     x13, x15, [x4], 16
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x8            // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset

        # Are there at least 16 bytes for the epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f                      // kc < 16: only the 8-byte remainder runs

        # Prologue: load A0, A1 and 2 B's
        // State expected on entry to the main loop and to the epilogue:
        //   v4, v5 = B columns 0, 1 (first 8 K bytes)
        //   v0, v1 = first 8 K bytes of A0, A1;  v6, v7 = next 8 K bytes
        //   x17    = B column 0 (second 8 K bytes), moved into v8 later
        //   x16    = B column 2 (first 8 K bytes), moved into v4 later
        LDP     d4, d5, [x5]            // Read B
        LDP     d0, d6, [x13], 16
        LDP     d1, d7, [x15], 16
        // The pair load "LDP d8, d9, [x5, 64]" is split: the low half goes
        // through x17 here and is inserted with INS, the high half is loaded
        // with "LDR d9, [x5, 72]" at the top of BLOCK 0.
        LDR     x17, [x5, 64]           // Read B
        LDR     x16, [x5, 16]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
        # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        // Each BLOCK handles two output columns for both rows:
        //   SMULL  : B(first 8 K bytes)  * A(first 8 K bytes) -> 16-bit products
        //   SMLAL  : B(second 8 K bytes) * A(second 8 K bytes) added to them
        //   SADALP : adjacent pairs summed into the 32-bit accumulators
        // B loads for the following block are interleaved with the arithmetic.

        .p2align 3
2:
        # BLOCK 0 - 18 cycles - includes prfm
        // Columns 0 and 1 -> v16/v17 and v18/v19.
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x5, 448]    // Prefetch B ahead of the current panel
        SADALP  v16.4s, v2.8h
        SADALP  v17.4s, v3.8h
        PRFM    PLDL1KEEP, [x5, 512]
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        // Columns 2 and 3 -> v20/v21 and v22/v23.
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x13, 128]   // Prefetch A0
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        PRFM    PLDL1KEEP, [x15, 128]   // Prefetch A1
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        // Columns 4 and 5 -> v24/v25 and v26/v27. Also starts loading the next
        // iteration: x16 = next panel column 0, x20/x21 = next first 8 K bytes
        // of A0/A1.
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 128]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s, v2.8h
        LDR     x20, [x13], 8           // Read A0
        SADALP  v25.4s, v3.8h
        LDR     x21, [x15], 8           // Read A1
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS at the end of BLOCK 3

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        // Columns 6 and 7 -> v28/v29 and v30/v31. Re-establishes the loop-entry
        // state for the next 128-byte panel (offsets 128, 136, 144 and 192 are
        // offsets 0, 8, 16 and 64 of the next panel). Nothing between the SUBS
        // above and B.HS below modifies the condition flags.
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 192]          // Read B
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 144]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        LDR     d6, [x13], 8            // Read A0
        INS     v0.d[0], x20
        LDR     d7, [x15], 8            // Read A1
        INS     v1.d[0], x21
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128             // Advance to the next B panel
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop
        .p2align 3
3:
        # BLOCK 0 - 18 cycles
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s, v2.8h
        SADALP  v17.4s, v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s, v2.8h
        SADALP  v25.4s, v3.8h
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder?- 8 bytes of A
        // k is negative here; since kc is a multiple of 8, bit 3 of k is set
        // exactly when 8 bytes of K are still unprocessed.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 16              // p -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        // Pairwise adds collapse the 4 partial sums of every accumulator.
        // Result: v0 = row 0 columns 0-3, v1 = row 0 columns 4-7,
        //         v2 = row 1 columns 0-3, v3 = row 1 columns 4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Load per channel scale values from weights
        // int32 -> float, multiply by the per-column scale (v4 = columns 0-3,
        // v5 = columns 4-7; the same scales apply to both rows).
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v5.4s

        // float -> int32, rounding to nearest with ties to even.
        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        // params is read as one 16-bit value followed by two bytes:
        //   +0: 16-bit value added with saturation after narrowing to int16
        //   +2: byte used as the lower clamp (SMAX)
        //   +3: byte used as the upper clamp (SMIN)
        // NOTE(review): presumably output zero point, output min and output
        // max - confirm against the params union definition.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s            // v0 = row 0 as 8 x int16
        SQXTN2  v2.8h, v3.4s            // v2 = row 1 as 8 x int16
        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO 6f and B.HI 0b below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h            // v0 bytes 0-7  = row 0
        SQXTN2  v0.16b, v1.8h           // v0 bytes 8-15 = row 1
        LD1R    {v1.16b}, [x11], 1
        LD1R    {v2.16b}, [x11]
        SMAX    v0.16b, v0.16b, v1.16b
        SUB     x11, x11, 3             // rewind params pointer
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    6f                      // fewer than 8 columns left: partial store

        # Store full 2 x 8
        ST1     {v0.d}[1], [x7], x10    // row 1 -> c1, c1 += cn_stride
        ST1     {v0.8b}, [x6], x10      // row 0 -> c0, c0 += cn_stride

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // Still using the flags from "SUBS x1, x1, 8": the vector, store and
        // SUB instructions in between do not modify them.
        B.HI    0b

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

        # Remainder - 8 bytes of A
        // Consumes 8 bytes of A0/A1 and a 64-byte B panel (8 bytes per column).
        // Only SMULL + SADALP are needed because there is a single 8-byte K
        // group.
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x15], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # ks loop
        SUBS    x9, x9, 16              // p -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        // x1 holds nc - 8 here; its low 3 bits equal those of the remaining
        // column count. Row 0 is taken from the low half of v0 and row 1 from
        // the high half (lanes s[2], h[4], b[8]); EXT rotates v0 so the next
        // unwritten bytes of both rows move into those same lanes.
        .p2align 3
6:
        TBZ     x1, 2, 7f               // 4 columns?
        ST1     {v0.s}[2], [x7], 4
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f               // 2 columns?
        ST1     {v0.h}[4], [x7], 2
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f               // 1 column?
        ST1     {v0.b}[8], [x7]
        STR     b0, [x6]
9:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif