// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_minmax_params params [sp + 24] -> x11

# Produces a tile of 2 rows by 8 columns of int8 output per pass of the nc loop.
# The A rows are reached through the pointer array a (two pointers are consumed
# per ks step); a pointer equal to zero is used as-is, any other pointer has
# a_offset added. Signed 8-bit products are summed into 32-bit accumulators that
# start from bias words read from w. The sums are converted to float, multiplied
# by per-channel scales read from w right after the packed weights, rounded back
# to integers, offset by a 16-bit value from params, narrowed to int8 with
# saturation and clamped between two bytes from params.
# NOTE(review): the three params fields are presumably output zero point,
# output min and output max, in that order - confirm against the params union.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x13  v0  v6
# A1 x15  v1  v7
# B   x5  v4  v5  v8  v9
# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0   v2 v10 v12 v14
# temp1   v3 v11 v13 v15

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm

        # Clamp C pointers
        # The stack arguments are read before sp is moved by the first STP, so
        # their offsets are relative to the sp value on entry.
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d8, d9, [sp, -64]!      // Reserve 64 bytes and save d8-d9
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        STP     d10, d11, [sp, 16]      // Save d10-d11
        CSEL    x7, x6, x7, LO          //   c1 = c0 (flags still from the CMP above)
        STP     d12, d13, [sp, 32]      // Save d12-d13
        BIC     x2, x2, 7               // Second half of the kc round-up to a multiple of 8
        STP     d14, d15, [sp, 48]      // Save d14-d15

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # Each scalar load fills lane 0 and clears the other lanes, so every
        # bias word is counted once after the lanes are summed at label 4.
        LDP     s16, s18, [x5], 8       // bias for channels 0, 1 (row 0)
        MOV     v17.16b, v16.16b        // row 1 starts from the same bias
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8       // bias for channels 2, 3
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8       // bias for channels 4, 5
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8       // bias for channels 6, 7
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 2 A pointers
        LDP     x13, x15, [x4], 16      // a0, a1; advance a by two pointers
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x8            // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f                      // fewer than 16 bytes: only the 8 byte remainder runs

        # Prologue: load A0, A1 and 2 B's
        LDP     d4, d5, [x5]            // B: channels 0-1, first 8 bytes of k
        LDP     d0, d6, [x13], 16       // A0: first and second 8 bytes of k
        LDP     d1, d7, [x15], 16       // A1: first and second 8 bytes of k
        LDP     d8, d9, [x5, 64]        // B: channels 0-1, second 8 bytes of k

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f                      // fewer than 32 bytes in total: go straight to the epilogue

        # Main loop - 16 bytes of A
        # Each pass consumes 128 bytes of B (8 channels x 16 bytes of k).
        # SMULL/SMLAL build 16-bit sums of two products per lane; SADALP then
        # adds adjacent pairs of those into the 32-bit accumulators. Loads for
        # the next channel pair are interleaved with the arithmetic.
        .p2align 3
2:
        SMULL   v2.8h, v4.8b, v0.8b             // channel 0, row 0
        SMULL   v3.8h, v4.8b, v1.8b             // channel 0, row 1
        PRFM    PLDL1KEEP, [x5, 448]            // prefetch B ahead of the read position
        SMULL   v10.8h, v5.8b, v0.8b            // channel 1, row 0
        SMULL   v11.8h, v5.8b, v1.8b            // channel 1, row 1
        LDP     d4, d5, [x5, 16]                // B: channels 2-3, first 8 bytes of k
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        PRFM    PLDL1KEEP, [x5, 512]
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 80]                // B: channels 2-3, second 8 bytes of k
        SMULL   v12.8h, v4.8b, v0.8b            // channel 2, row 0
        SADALP  v16.4s, v2.8h                   // fold channel 0-1 sums into v16-v19
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]                // B: channels 4-5, first 8 bytes of k
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        PRFM    PLDL1KEEP, [x13, 128]           // prefetch A0 ahead of the read position
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 96]                // B: channels 4-5, second 8 bytes of k
        SMULL   v2.8h, v4.8b, v0.8b             // channel 4, row 0
        SADALP  v20.4s, v12.8h                  // fold channel 2-3 sums into v20-v23
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]                // B: channels 6-7, first 8 bytes of k
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        PRFM    PLDL1KEEP, [x15, 128]           // prefetch A1 ahead of the read position
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 112]               // B: channels 6-7, second 8 bytes of k
        SMULL   v12.8h, v4.8b, v0.8b            // channel 6, row 0
        ADD     x5, x5, 128                     // w advances past this 16 byte k block
        SADALP  v24.4s, v2.8h                   // fold channel 4-5 sums into v24-v27
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        LDP     d4, d5, [x5]                    // Read B
        SMLAL   v13.8h, v8.8b, v7.8b
        SUBS    x0, x0, 16                      // k -= 16; flags are consumed by B.HS below
        SMLAL   v14.8h, v9.8b, v6.8b
        LDP     d0, d6, [x13], 16               // Read A0
        SMLAL   v15.8h, v9.8b, v7.8b

        SADALP  v28.4s, v12.8h                  // fold channel 6-7 sums into v28-v31
        LDP     d1, d7, [x15], 16               // Read A1
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        LDP     d8, d9, [x5, 64]                // Read B
        SADALP  v31.4s, v15.8h
        B.HS    2b                              // loop while the SUBS above did not borrow

        # Epilogue
        # Same as main loop except no loads at end of loop
        .p2align 3
3:
        SMULL   v2.8h, v4.8b, v0.8b             // channels 0-1
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 80]
        SMULL   v12.8h, v4.8b, v0.8b            // channels 2-3
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 96]
        SMULL   v2.8h, v4.8b, v0.8b             // channels 4-5
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 112]
        SMULL   v12.8h, v4.8b, v0.8b            // channels 6-7
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        ADD     x5, x5, 128                     // w advances past this 16 byte k block

        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder? - 8 bytes of A
        # kc is a multiple of 8, so k is -16 (nothing left, bit 3 clear) or
        # -8 (8 bytes left, bit 3 set) at this point.
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Add columns
        # Each accumulator holds four partial sums for one row and channel.
        # Two rounds of pairwise adds reduce them to one value per channel.
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s   // row 0, channels 0-3
        ADDP    v1.4s, v24.4s, v28.4s   // row 0, channels 4-7
        ADDP    v2.4s, v17.4s, v21.4s   // row 1, channels 0-3
        ADDP    v3.4s, v25.4s, v29.4s   // row 1, channels 4-7

        # Load per channel scale values from weights
        SCVTF   v0.4s, v0.4s            // int32 -> float
        LDR     q4, [x5], 16            // scales for channels 0-3
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16            // scales for channels 4-7
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s     // both rows use the same per-channel scales
        FMUL    v1.4s, v1.4s, v5.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v5.4s

        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest with ties to even
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        LD1R    {v5.8h}, [x11], 2       // broadcast 16-bit value at params + 0 (presumably output zero point)
        SQXTN   v0.4h, v0.4s            // saturating narrow to 16 bits
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s            // v0 = row 0, channels 0-7
        SQXTN2  v2.8h, v3.4s            // v2 = row 1, channels 0-7
        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO and B.HI below
        SQADD   v0.8h, v0.8h, v5.8h     // saturating add of the broadcast value
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h            // bytes 0-7 of v0 = row 0
        SQXTN2  v0.16b, v1.8h           // bytes 8-15 of v0 = row 1
        LD1R    {v1.16b}, [x11], 1      // broadcast byte at params + 2 (lower clamp bound)
        LD1R    {v2.16b}, [x11]         // broadcast byte at params + 3 (upper clamp bound)
        SMAX    v0.16b, v0.16b, v1.16b
        SUB     x11, x11, 3             // rewind params pointer
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    6f                      // fewer than 8 columns remained: partial store

        # Store full 2 x 8
        # c1 is written before c0, so when c1 aliases c0 (mr < 2) the row 0
        # data is what remains in memory.
        ST1     {v0.d}[1], [x7], x10    // row 1; c1 += cn_stride
        ST1     {v0.8b}, [x6], x10      // row 0; c0 += cn_stride

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b                      // continue while columns remain after the subtraction above

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64        // also releases the 64 byte save area
        RET

        # Remainder - 8 bytes of A
        # One SMULL per row and channel, accumulated directly; 64 bytes of B
        # (8 channels x 8 bytes of k) are consumed.
        .p2align 3
5:
        LDR     d0, [x13]               // A0: last 8 bytes of k
        LDP     d4, d5, [x5]            // B: channels 0-1
        LDR     d1, [x15]               // A1: last 8 bytes of k
        LDP     d6, d7, [x5, 16]        // B: channels 2-3
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]        // B: channels 4-5
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]        // B: channels 6-7
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64              // w advances past this 8 byte k block
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width
        # x1 holds nc - 8 here; its low three bits equal the number of columns
        # left. After each partial store EXT rotates v0 so the next unwritten
        # row 0 byte sits at byte 0 and the next row 1 byte at byte 8.
        .p2align 3
6:
        TBZ     x1, 2, 7f               // skip unless 4 columns are to be written
        ST1     {v0.s}[2], [x7], 4      // row 1, 4 bytes
        STR     s0, [x6], 4             // row 0, 4 bytes
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f               // skip unless 2 columns are to be written
        ST1     {v0.h}[4], [x7], 2      // row 1, 2 bytes
        STR     h0, [x6], 2             // row 0, 2 bytes
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f               // skip unless 1 column is to be written
        ST1     {v0.b}[8], [x7]         // row 1, 1 byte
        STR     b0, [x6]                // row 0, 1 byte
9:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64        // also releases the 64 byte save area
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm

// On ELF targets, emit an empty .note.GNU-stack section (conventionally marks
// the object as not requiring an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif