// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v6
# A1  x4  v1  v7
# B   x5  v4  v5  v8  v9
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data

# Overview (derived from the code below)
#   Two rows of A (A0, A1) are multiplied against 8 groups of B. Each group
#   has its own pair of 4 x int32 accumulators (one per row), which are only
#   reduced to a single int32 by the ADDP tree at label 3.
#   B is consumed 128 bytes per 16 bytes of A: offsets 0..63 are multiplied
#   with A bytes 0..7 (v0/v1) and offsets 64..127 with A bytes 8..15 (v6/v7).
#   Half of the B and A values are loaded into x16/x17/x20/x21 and moved into
#   the low 64 bits of a vector register with INS afterwards.
#   NOTE(review): presumably this split is a Cortex-A53 scheduling choice; the
#   rationale is not stated in this file.
#
# Stack frame (80 bytes, allocated by the first STP)
#   [sp +  0] d8,  d9
#   [sp + 16] d10, d11
#   [sp + 32] d12, d13
#   [sp + 48] d14, d15
#   [sp + 64] x20, x21
#   [sp + 80] cn_stride, [sp + 88] params (stack arguments of the caller)

BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

        # Clamp A and C pointers
        # The callee-saved register spills are interleaved with the pointer setup.
        CMP     x0, 2                   // if mr < 2
        STP     d8, d9, [sp, -80]!      // allocate the 80-byte frame and save d8,d9
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d10, d11, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d12, d13, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        STP     d14, d15, [sp, 48]
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 7
        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack

        # Outer loop: one pass per 8 output values of each row.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Every LDP reads two 32-bit values into lane 0 of two row-0 accumulators
        # and each MOV copies that accumulator to its row-1 counterpart.
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        LDP     x10, x11, [sp, 80]      // cn_stride, params
        MOV     v31.16b, v30.16b
        # Is there at least 16 bytes for epilogue?
        B.LO    4f                      // kc < 16: only the 8-byte remainder runs

        # Prologue: load A0, A1 and 2 B's
        LDP     d4, d5, [x5]            // Read B
        LDP     d0, d6, [x3], 16        // Read A0
        LDR     x17, [x5, 64]           // Read B
        LDP     d1, d7, [x4], 16        // Read A1
        LDR     x16, [x5, 16]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        # Main loop - 16 bytes of A
        # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
        # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        # Every block multiplies two B groups by both A rows, then loads the B
        # values used by the following block.

        .p2align 3
1:
        # BLOCK 0 - 18 cycles - includes prfm
        # Accumulates into v16-v19.
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x5, 448]    // prefetch B ahead of the current position
        SADALP  v16.4s, v2.8h
        SADALP  v17.4s, v3.8h
        PRFM    PLDL1KEEP, [x5, 512]
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        # Accumulates into v20-v23.
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        PRFM    PLDL1KEEP, [x3, 128]    // prefetch A0 ahead of the current position
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        PRFM    PLDL1KEEP, [x4, 128]    // prefetch A1 ahead of the current position
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        # Accumulates into v24-v27 and starts loading the next 16 bytes of A.
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 128]          // first 8 bytes of the next 128-byte B group
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s, v2.8h
        LDR     x20, [x3], 8            // Read A0
        SADALP  v25.4s, v3.8h
        LDR     x21, [x4], 8            // Read A1
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16              // flags are consumed by B.HS at the end of BLOCK 3

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        # Accumulates into v28-v31. Offsets 136, 144 and 192 address the next
        # 128-byte B group because x5 is only advanced near the end of the block.
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 192]          // Read B
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 144]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        LDR     d6, [x3], 8             // Read A0
        INS     v0.d[0], x20
        LDR     d7, [x4], 8             // Read A1
        INS     v1.d[0], x21
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128             // ADD leaves the flags from SUBS untouched
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HS    1b                      // loop while the SUBS in BLOCK 2 did not borrow

        # Epilogue
        # Same as main loop except no loads at end of loop

        .p2align 3
2:
        # BLOCK 0 - 18 cycles
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s, v2.8h
        SADALP  v17.4s, v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s, v2.8h
        SADALP  v25.4s, v3.8h
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder?- 8 bytes of A
        # k is -16 or -8 at this point; bit 3 is set only when 8 bytes remain.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        # Add columns
        # Three levels of pairwise adds reduce each 4-lane accumulator to one
        # int32: v0/v1 hold the 8 sums of row 0 and v2/v3 those of row 1.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - scale, bias and clamp
        # x11 walks through params: a 32-bit multiplier, a 16-bit addend and two
        # bytes used as lower and upper clamp. x11 is reloaded at label 0.
        SCVTF   v0.4s, v0.4s            // int32 -> float
        LD1R    {v4.4s}, [x11], 4       // broadcast the 32-bit scale
        SCVTF   v1.4s, v1.4s
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v4.4s

        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest, ties to even
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        LD1R    {v5.8h}, [x11], 2       // broadcast the 16-bit addend
        SQXTN   v0.4h, v0.4s            // saturating narrow to int16, row 0 in v0
        SQXTN   v2.4h, v2.4s            // row 1 in v2
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h            // saturating narrow to int8: row 0 in bytes 0-7
        SQXTN2  v0.16b, v1.8h           // row 1 in bytes 8-15
        LD1R    {v1.16b}, [x11], 1      // lower clamp
        LD1R    {v2.16b}, [x11]         // upper clamp
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    5f                      // fewer than 8 values left: partial store

        # Store full 2 x 8
        ST1     {v0.8b}, [x6], x10      // row 0, then c0 += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10    // row 1, then c1 += cn_stride
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b                      // loop while nc is still above zero

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80        // also releases the 80-byte frame
        RET

        # Remainder - 8 bytes of A
        # Consumes 64 bytes of B and accumulates into all 16 accumulators using
        # SMULL only, then joins the reduction at label 3.
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x4], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B       3b

        # Store odd width
        # x1 holds nc - 8 here, so its low three bits equal those of nc. Bits 2,
        # 1 and 0 select a 4-, 2- and 1-byte store per row; EXT rotates the
        # stored bytes out so the next store reads from the same lanes.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4             // row 0, 4 bytes
        ST1     {v0.s}[2], [x7], 4      // row 1, 4 bytes (starts at byte 8 of v0)
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2             // row 0, 2 bytes
        ST1     {v0.h}[4], [x7], 2      // row 1, 2 bytes
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]                // row 0, 1 byte
        ST1     {v0.b}[8], [x7]         // row 1, 1 byte
8:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80        // also releases the 80-byte frame
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif