// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11

# Overview
#   Produces a tile of up to 2 rows by 8 columns of int8 output per pass of
#   the nc loop.  Row pointers are fetched two at a time from the array a
#   (one pair per ks step), signed 8-bit products are summed into 32-bit
#   lanes, and the sums are converted to float, multiplied by a scale read
#   from params, rounded back to integer, offset by a 16-bit value from
#   params and clamped to an 8-bit [min, max] range from params.
#
#   Layout of w as consumed by this code: 8 x 32-bit initial accumulator
#   values, followed by blocks of 64 bytes, each holding 8 bytes of K for
#   each of the 8 columns (column n at byte offset 8 * n of its block).
#
#   Bytes read from params via x11: 4 bytes at +0 (scale), 2 bytes at +4
#   (added after narrowing to 16 bits), 1 byte at +6 (lower clamp), 1 byte
#   at +7 (upper clamp).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0  v6
# A1  x15  v1  v7
# B    x5  v4  v5  v8  v9
# C0   x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1   x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data

BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

        # Clamp C pointers
        # Stack arguments are read before the frame is pushed, so the offsets
        # below are relative to the incoming sp.  The register saves are
        # interleaved with the pointer and kc setup.
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d8, d9, [sp, -80]!      // Allocate 80 bytes, save d8,d9
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        STP     d10, d11, [sp, 16]
        CSEL    x7, x6, x7, LO          //   c1 = c0
        STP     d12, d13, [sp, 32]
        BIC     x2, x2, 7               // (completes the round up of kc to a multiple of 8)
        STP     d14, d15, [sp, 48]
        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack

        # nc loop entry: one pass per group of 8 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each scalar load fills lane 0 of a row-0 accumulator and clears the
        # other lanes; the MOVs duplicate it into the row-1 accumulator.
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks

        # ks loop entry: one pass per pair of A row pointers.
        .p2align 3
1:
        # Load next 2 A pointers
        # A pointer equal to zero is used unchanged; any other pointer has
        # a_offset added to it.
        LDP     x13, x15, [x4], 16
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x8            // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset

        # Is there at least 16 bytes for epilogue?
        # With kc below 16 the 8 byte remainder at label 5 does all the work.
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Prologue: load A0, A1 and 2 B's
        # v0/v1 hold the first 8 bytes of K for rows 0/1 and v6/v7 the second
        # 8 bytes.  v4/v5 hold columns 0/1 of the first 64 byte B block.
        LDP     d4, d5, [x5]            // Read B
        LDP     d0, d6, [x13], 16
        LDP     d1, d7, [x15], 16
        # The pair load of d8, d9 from [x5, 64] is split in two: the d8 half
        # is staged in x17 here and inserted at the top of BLOCK 0, where d9
        # is loaded directly from [x5, 72].
        LDR     x17, [x5, 64]           // Read B
        LDR     x16, [x5, 16]           // Staged for v4 in BLOCK 0

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A
        # Cycle estimate: 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
        # Plus 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        #
        # Each block handles two columns.  SMULL forms 16-bit products for
        # the first 8 bytes of K, SMLAL adds the products for the second
        # 8 bytes, and SADALP pairwise-adds the 16-bit lanes into the 32-bit
        # accumulators.  Half of the 64-bit loads go to a general register
        # (x16, x17, x20, x21) and are moved into a vector with INS.
        # NOTE(review): presumably done to suit the Cortex-A53 load pipeline;
        # confirm against the A53 optimization guide.

        .p2align 3
2:
        # BLOCK 0 - 18 cycles - includes prfm
        # Columns 0 and 1 -> v16..v19.  Prefetches B ahead of x5.
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x5, 448]
        SADALP  v16.4s, v2.8h
        SADALP  v17.4s, v3.8h
        PRFM    PLDL1KEEP, [x5, 512]
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        # Columns 2 and 3 -> v20..v23.  Prefetches both A rows.
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x13, 128]
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        PRFM    PLDL1KEEP, [x15, 128]
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        # Columns 4 and 5 -> v24..v27.  Starts fetching the next 16 bytes of
        # A (into x20, x21) and the next B block (offset 128 into x16).
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 128]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s, v2.8h
        LDR     x20, [x13], 8           // Read A0
        SADALP  v25.4s, v3.8h
        LDR     x21, [x15], 8           // Read A1
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16              // Flags are consumed by B.HS at the end of BLOCK 3

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        # Columns 6 and 7 -> v28..v31.  Reloads v0, v1, v4..v7 and x16, x17
        # for the next 16 bytes of K, then advances x5 past the two 64 byte
        # B blocks that were just consumed.
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 192]          // Read B
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 144]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        LDR     d6, [x13], 8            // Read A0
        INS     v0.d[0], x20
        LDR     d7, [x15], 8            // Read A1
        INS     v1.d[0], x21
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HS    2b

        # Epilogue
        # Same as main loop except no loads at end of loop
        # Consumes the 16 bytes of A already held in v0, v1, v6, v7.
        .p2align 3
3:
        # BLOCK 0 - 18 cycles
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s, v2.8h
        SADALP  v17.4s, v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s, v2.8h
        SADALP  v25.4s, v3.8h
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder?- 8 bytes of A
        # x0 is now either -16 (bit 3 clear, nothing left) or -8 (bit 3 set,
        # 8 bytes of K left), because kc was rounded up to a multiple of 8.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        # Two rounds of pairwise adds collapse each accumulator to one sum
        # per column: v0 = row 0 columns 0-3, v1 = row 0 columns 4-7,
        # v2 = row 1 columns 0-3, v3 = row 1 columns 4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - scale, bias and clamp
        # Convert the int32 sums to float, multiply by the scale broadcast
        # from params, and convert back to int32 with FCVTNS.
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4       // Broadcast 32-bit scale, x11 += 4
        SCVTF   v1.4s, v1.4s
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v4.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        # Narrow with saturation to 16 bits (v0 = row 0, v2 = row 1), add the
        # 16-bit params value with saturation, then narrow to 8 bits so that
        # v0 holds row 0 in its low 8 bytes and row 1 in its high 8 bytes.
        LD1R    {v5.8h}, [x11], 2       // Broadcast 16-bit addend, x11 += 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags stay live until B.LO / B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1      // Broadcast 8-bit lower clamp, x11 += 1
        LD1R    {v2.16b}, [x11]         // Broadcast 8-bit upper clamp
        SMAX    v0.16b, v0.16b, v1.16b
        SUB     x11, x11, 7             // rewind params pointer
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    6f                      // Fewer than 8 columns left: partial store

        # Store full 2 x 8
        # Row 1 is written before row 0, so when c1 was clamped to c0
        # (mr < 2) the row 0 data is what remains in memory.
        ST1     {v0.d}[1], [x7], x10    // c1 += cn_stride
        ST1     {v0.8b}, [x6], x10      // c0 += cn_stride

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80        // Release the 80 byte frame
        RET

        # Remainder - 8 bytes of A
        # Reached directly from label 1 when kc < 16, or after the epilogue
        # when 8 bytes of K are left.  Consumes one 64 byte B block with
        # SMULL only, since there is no second 8 bytes of K to add.
        .p2align 3
5:
        LDR     d0, [x13], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x15], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        # x1 holds nc - 8 here; bits 2, 1 and 0 select a 4, 2 and 1 byte
        # store per row.  After each store EXT rotates v0 so the next
        # unwritten bytes of row 0 sit at byte 0 and those of row 1 at
        # byte 8.  Row 1 is again written before row 0.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        ST1     {v0.s}[2], [x7], 4
        STR     s0, [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        ST1     {v0.h}[4], [x7], 2
        STR     h0, [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        ST1     {v0.b}[8], [x7]
        STR     b0, [x6]
9:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80        // Release the 80 byte frame
        RET

END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif