// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11

# Overview (as implemented below):
#   - Each pass of the outer loop (label 0) produces up to 2 rows x 8 columns
#     of int8 output and consumes 32 bytes of bias plus 64 bytes of B per
#     8 bytes of K from w.
#   - Products are formed with SMULL/SMLAL (int8 x int8 -> int16) and
#     pair-accumulated into int32 lanes with SADALP. ADDP then reduces the
#     four lanes of each accumulator to one int32 per output element.
#   - The int32 sums are converted to float, multiplied by a scale read from
#     params, rounded back with FCVTNS, saturated to int16, offset by a 16-bit
#     value from params, saturated to int8 and clamped by two bytes from params.
#   - x0 carries mr only for the initial CMP. Afterwards it is reused as the
#     K loop counter, hence "x2 / x0" next to kc above.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v6
# A1  x4  v1  v7
# B   x5  v4  v5  v8  v9
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15
# x16, x17, x20, x21 temporary a53 gpr load data


BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53

        # Clamp A and C pointers
        # Row 1 pointers alias row 0 when mr < 2 (the CSELs use the flags set by
        # the CMP, none of the instructions in between modify them).
        # Interleaved with the pointer setup, an 80-byte frame is pushed:
        #   [sp, 0] d8,d9  [sp, 16] d10,d11  [sp, 32] d12,d13  [sp, 48] d14,d15
        #   [sp, 64] x20,x21
        # so the stack arguments are found at [sp, 80] from here on.
        CMP     x0, 2                   // if mr < 2
        STP     d8, d9, [sp, -80]!
        ADD     x4, x3, x4              // a1 = a0 + a_stride
        STP     d10, d11, [sp, 16]
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d12, d13, [sp, 32]
        CSEL    x4, x3, x4, LO          //   a1 = a0
        STP     d14, d15, [sp, 48]
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        CSEL    x7, x6, x7, LO          //   c1 = c0
        BIC     x2, x2, 7
        STP     x20, x21, [sp, 64]      // Save x20,x21 on stack

        .p2align 3
0:
        # Outer loop over nc, 8 output columns per pass.
        # Load initial bias from w into accumulators
        # LDP into S registers writes lane 0 and zeroes the remaining lanes, so
        # every accumulator starts as {bias, 0, 0, 0}. The row 1 accumulators
        # (odd registers) are copies of the row 0 ones.
        # x10/x11 are reloaded on every pass because the requantization code
        # below post-increments x11.
        SUBS    x0, x2, 16              // k = kc - 16
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        LDP     x10, x11, [sp, 80]      // cn_stride, params
        MOV     v31.16b, v30.16b
        # Are there at least 16 bytes for the epilogue?
        # The branch consumes the flags of the SUBS above (kc < 16 goes straight
        # to the 8-byte remainder).
        B.LO    4f

        # Prologue: load 16 bytes of A0 and A1, and the first B values.
        # B is read as 64 bytes for K bytes 0-7 of the 8 columns, followed by
        # 64 bytes for K bytes 8-15. v4/v5 take the former (paired with v0/v1),
        # v8/v9 the latter (paired with v6/v7). x16 and x17 hold the next v4 and
        # v8 values until they are moved across with INS.
        LDP     d4, d5, [x5]            // Read B
        LDP     d0, d6, [x3], 16        // Read A0
        LDR     x17, [x5, 64]           // Read B
        LDP     d1, d7, [x4], 16        // Read A1
        LDR     x16, [x5, 16]

        # Are there at least 16 bytes for the main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    2f

        # Main loop - 16 bytes of A
        # 4 groups of 4 mul/mla/adalp + 2 load = 18 cycles.
        # 2 loads for A0 = +2 cycles.  Total 18 * 4 + 2 = 74 cycles.
        # Each block handles two output columns for both rows and preloads the
        # B values of the following block.

        .p2align 3
1:
        # BLOCK 0 - 18 cycles (no PRFM is issued in this variant)
        # Columns 0-1: accumulates into v16/v17 and v18/v19.
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        # Columns 2-3: accumulates into v20/v21 and v22/v23.
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        # Columns 4-5: accumulates into v24/v25 and v26/v27.
        # Also starts fetching the next 16 bytes of A (low halves go through
        # x20/x21) and B for the next iteration ([x5, 128]), and decrements k.
        # The flags of the SUBS stay live until the B.HS that closes the loop.
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 128]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        LDR     x20, [x3], 8            // Read A0
        SADALP  v25.4s,  v3.8h
        LDR     x21, [x4], 8            // Read A1
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h
        SUBS    x0, x0, 16

        # BLOCK 3 - includes 2 cycles to read A0, A1 = 20 cycles
        # Columns 6-7: accumulates into v28/v29 and v30/v31.
        # Offsets 136, 144 and 192 belong to the next 128-byte group of B, so
        # after the ADD below v4, v5, x16 and x17 hold what the prologue would
        # have loaded for the next iteration.
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 192]          // Read B
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 136]           // Read B
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 144]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        LDR     d6, [x3], 8             // Read A0
        INS     v0.d[0], x20
        LDR     d7, [x4], 8             // Read A1
        INS     v1.d[0], x21
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B.HS    1b

        # Epilogue
        # Same as main loop except no loads at end of loop

        .p2align 3
2:
        # BLOCK 0 - 18 cycles
        LDR     d9, [x5, 72]            // Read B
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 80]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 24]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        LDR     x16, [x5, 32]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v16.4s,  v2.8h
        SADALP  v17.4s,  v3.8h
        SADALP  v18.4s, v10.8h
        SADALP  v19.4s, v11.8h

        # BLOCK 1- 18 cycles
        LDR     d9, [x5, 88]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        LDR     x17, [x5, 96]
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        LDR     d5, [x5, 40]
        INS     v4.d[0], x16
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        LDR     x16, [x5, 48]
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v20.4s, v12.8h
        SADALP  v21.4s, v13.8h
        SADALP  v22.4s, v14.8h
        SADALP  v23.4s, v15.8h

        # BLOCK 2 - 18 cycles
        LDR     d9, [x5, 104]
        INS     v8.d[0], x17
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        LDR     x17, [x5, 112]
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDR     d5, [x5, 56]
        INS     v4.d[0], x16
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b
        SADALP  v24.4s,  v2.8h
        SADALP  v25.4s,  v3.8h
        SADALP  v26.4s, v10.8h
        SADALP  v27.4s, v11.8h

        # BLOCK 3 - 17 cycles
        LDR     d9, [x5, 120]
        INS     v8.d[0], x17
        SMULL   v12.8h, v4.8b, v0.8b
        SMULL   v13.8h, v4.8b, v1.8b
        SMULL   v14.8h, v5.8b, v0.8b
        SMULL   v15.8h, v5.8b, v1.8b
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        ADD     x5, x5, 128
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder?- 8 bytes of A
        # kc was rounded up to a multiple of 8 and k only ever drops by 16, so
        # bit 3 of k is set exactly when 8 bytes of K are still pending.
        TBNZ    x0, 3, 4f

        .p2align 3
3:
        # Add columns
        # Two rounds of pairwise adds collapse the four lanes of every
        # accumulator. Result: v0 = row 0 columns 0-3, v1 = row 0 columns 4-7,
        # v2 = row 1 columns 0-3, v3 = row 1 columns 4-7.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - scale, bias and clamp
        # x11 walks the params block: a 32-bit float scale, a 16-bit addend,
        # then two single bytes used as the lower (SMAX) and upper (SMIN) bounds.
        # NOTE(review): presumably this is the fp32 layout of
        # xnn_qs8_conv_minmax_params - confirm against the struct definition.
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4
        SCVTF   v1.4s, v1.4s
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v4.4s

        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        # Narrow with saturation: int32 -> int16 (row 0 in v0, row 1 in v2),
        # add the 16-bit value, then int16 -> int8 with row 0 in bytes 0-7 and
        # row 1 in bytes 8-15 of v0.
        # The flags of the SUBS are consumed by B.LO below and, on the
        # full-width path, by B.HI. Nothing in between modifies them.
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h
        SQXTN2  v0.16b, v1.8h
        LD1R    {v1.16b}, [x11], 1
        LD1R    {v2.16b}, [x11]
        SMAX    v0.16b, v0.16b, v1.16b
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    5f                      // fewer than 8 columns left

        # Store full 2 x 8
        # Both C pointers advance by cn_stride, and both A pointers are rewound
        # by the rounded kc so the next pass re-reads the same rows.
        ST1     {v0.8b}, [x6], x10
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v0.d}[1], [x7], x10
        SUB     x4, x4, x2              // a1 -= kc
        B.HI    0b                      // more columns remain

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

        # Remainder - 8 bytes of A
        # Entered from the top of the nc loop when kc < 16, or after the
        # epilogue when 8 bytes of K are pending. Reads 8 bytes of each A row
        # and 64 bytes of B (one 8-byte vector per column), then rejoins the
        # reduction at label 3.
        .p2align 3
4:
        LDR     d0, [x3], 8
        LDP     d4, d5, [x5]
        LDR     d1, [x4], 8
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s,  v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s,  v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h
        B       3b

        # Store odd width
        # x1 holds nc - 8 here, so its low three bits still equal those of the
        # remaining column count. Bits 2, 1 and 0 select a 4-, 2- and 1-byte
        # store per row. Row 0 is taken from the bottom of v0 and row 1 from
        # byte 8 upwards. After each partial store EXT rotates v0 so the next
        # unwritten elements of both rows land back at bytes 0 and 8.
        .p2align 3
5:
        TBZ     x1, 2, 6f
        STR     s0, [x6], 4
        ST1     {v0.s}[2], [x7], 4
        EXT     v0.16b, v0.16b, v0.16b, 4

6:
        TBZ     x1, 1, 7f
        STR     h0, [x6], 2
        ST1     {v0.h}[4], [x7], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
7:
        TBZ     x1, 0, 8f
        STR     b0, [x6]
        ST1     {v0.b}[8], [x7]
8:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 80
        RET

END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
