// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// void xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal(
//     size_t mr,                  x0
//     size_t nc,                  x1
//     size_t kc,                  x2 / x0
//     size_t ks,                  x3 / x9
//     const int8_t** restrict a,  x4
//     const int8_t* restrict w,   x5
//     int8_t* restrict c,         x6
//     size_t cm_stride,           x7
//     size_t cn_stride,           [sp] -> x10
//     size_t a_offset,            [sp + 8] -> x8
//     const int8_t* zero,         [sp + 16] -> x12
//     const union xnn_qs8_minmax_params params  [sp + 24] -> x11
//
// Indirect GEMM micro-kernel producing a 2-row x 8-column tile of int8
// output per pass of the nc loop.  K is consumed 8 bytes at a time per
// column ("c8"); products are widened int8 -> int16 with SMULL/SMLAL and
// folded into int32 accumulators with SADALP.  After accumulation the sums
// are converted to float, multiplied by per-column scales read from w,
// converted back to integers, offset, narrowed to int8 and clamped.

// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This kernel writes v8-v15, so d8-d15 are spilled to a 64-byte stack frame.

// Register usage
// A0     x13  v0  v6
// A1     x15  v1  v7
// B       x5  v4  v5  v8  v9
// C0      x6 v16 v18 v20 v22 v24 v26 v28 v30
// C1      x7 v17 v19 v21 v23 v25 v27 v29 v31
// temp0   v2 v10 v12 v14
// temp1   v3 v11 v13 v15

BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal

        // Fetch the stack-passed arguments (before sp moves), clamp the
        // second output row pointer, round kc up and save d8-d15.
        LDP     x10, x8, [sp]           // x10 = cn_stride, x8 = a_offset
        CMP     x0, 2                   // flags for mr < 2 (consumed by CSEL below)
        LDP     x12, x11, [sp, 16]      // x12 = zero, x11 = params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d8, d9, [sp, -64]!      // open 64-byte frame, save d8/d9
        ADD     x2, x2, 7               // kc + 7 (first half of round-up to 8)
        STP     d10, d11, [sp, 16]
        CSEL    x7, x6, x7, LO          // mr < 2: c1 aliases c0
        STP     d12, d13, [sp, 32]
        BIC     x2, x2, 7               // kc = (kc + 7) & ~7
        STP     d14, d15, [sp, 48]

        .p2align 3
0:
        // nc loop entry: seed the accumulators with the 8 bias words from w.
        // Each scalar load fills lane 0 of a row-0 accumulator (upper lanes
        // are zeroed); the MOVs duplicate it into the row-1 accumulator.
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks (bytes of indirection entries left)

        .p2align 3
1:
        // ks loop entry: pull the next pair of A row pointers.  A pointer
        // equal to `zero` is used as-is; any other gets a_offset added.
        LDP     x13, x15, [x4], 16
        CMP     x13, x12                // a0 == zero ?
        ADD     x13, x13, x8            // tentative a0 + a_offset
        CSEL    x13, x12, x13, EQ       // a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x15, x12                // a1 == zero ?
        ADD     x15, x15, x8            // tentative a1 + a_offset
        CSEL    x15, x12, x15, EQ       // a1 = (a1 == zero) ? zero : a1 + a_offset

        // Fewer than 16 bytes of K: only the 8-byte remainder path applies.
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        // Prologue: first 16 bytes of A0/A1 and the first two B column pairs
        // (B for the second 8 bytes of K sits 64 bytes after the first).
        LDP     d4, d5, [x5]
        LDP     d0, d6, [x13], 16
        LDP     d1, d7, [x15], 16
        LDP     d8, d9, [x5, 64]

        // Fewer than 32 bytes of K: skip the main loop, run the epilogue once.
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        // Main loop - 16 bytes of A per row and 128 bytes of B per pass.
        // Loads for the next step are interleaved with the arithmetic.
        .p2align 3
2:
        // Columns 0-1
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        // Columns 2-3 (accumulating columns 0-1 into v16-v19 meanwhile)
        LDP     d8, d9, [x5, 80]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        // Columns 4-5 (accumulating columns 2-3 into v20-v23 meanwhile)
        LDP     d8, d9, [x5, 96]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        // Columns 6-7 (accumulating columns 4-5 into v24-v27 meanwhile),
        // then reload A and B for the next pass.
        LDP     d8, d9, [x5, 112]
        SMULL   v12.8h, v4.8b, v0.8b
        ADD     x5, x5, 128             // advance w past this 16-byte K slice
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        LDP     d4, d5, [x5]            // next B, columns 0-1, first 8 bytes of K
        SMLAL   v13.8h, v8.8b, v7.8b
        SUBS    x0, x0, 16              // k -= 16; flags feed B.HS at loop end
        SMLAL   v14.8h, v9.8b, v6.8b
        LDP     d0, d6, [x13], 16       // next 16 bytes of A0
        SMLAL   v15.8h, v9.8b, v7.8b

        SADALP  v28.4s, v12.8h
        LDP     d1, d7, [x15], 16       // next 16 bytes of A1
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        LDP     d8, d9, [x5, 64]        // next B, columns 0-1, second 8 bytes of K
        SADALP  v31.4s, v15.8h
        B.HS    2b

        // Epilogue
        // Same arithmetic as the main loop, minus the loads for a next pass.
        .p2align 3
3:
        // Columns 0-1
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        // Columns 2-3
        LDP     d8, d9, [x5, 80]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        // Columns 4-5
        LDP     d8, d9, [x5, 96]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        // Columns 6-7
        LDP     d8, d9, [x5, 112]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        ADD     x5, x5, 128             // advance w past this 16-byte K slice

        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        // kc is a multiple of 8, so bit 3 of the (now negative) k counter is
        // set exactly when one 8-byte slice of A is still outstanding.
        TBNZ    x0, 3, 5f

        // ks loop: two 8-byte pointers were consumed from the indirection buffer.
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        // Horizontal reduction: collapse the 4 partial sums held in each
        // accumulator so that v0/v1 hold row 0 (columns 0-3 / 4-7) and
        // v2/v3 hold row 1 (columns 0-3 / 4-7).
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        // Requantize: int32 -> float, multiply by the 8 per-column scales
        // stored in w right after the packed weights, then back to int32.
        SCVTF   v0.4s, v0.4s
        LDR     q4, [x5], 16            // scales for columns 0-3
        SCVTF   v1.4s, v1.4s
        LDR     q5, [x5], 16            // scales for columns 4-7
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v5.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v5.4s

        // Float -> signed integer, rounding to nearest with ties to even.
        FCVTNS  v0.4s, v0.4s
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        // Narrow with saturation to int16, add the 16-bit value at params[0]
        // (presumably the output zero point - confirm against the params
        // layout), narrow to int8, then clamp between the bytes at
        // params[2] (lower bound) and params[3] (upper bound).
        LD1R    {v5.8h}, [x11], 2
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO / B.HI below
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h            // row 0 -> low 8 bytes of v0
        SQXTN2  v0.16b, v1.8h           // row 1 -> high 8 bytes of v0
        LD1R    {v1.16b}, [x11], 1
        LD1R    {v2.16b}, [x11]
        SMAX    v0.16b, v0.16b, v1.16b
        SUB     x11, x11, 3             // rewind params pointer for the next nc pass
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    6f                      // fewer than 8 columns were left

        // Store full 2 x 8; both row pointers step by cn_stride.
        ST1     {v0.d}[1], [x7], x10    // row 1
        ST1     {v0.8b}, [x6], x10      // row 0

        SUB     x4, x4, x3              // a -= ks (replay the indirection buffer)

        // nc loop: flags still come from SUBS x1 above.
        B.HI    0b

        // Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64        // also releases the 64-byte frame
        RET

        // Remainder - 8 bytes of A per row against 64 bytes of B.
        // The A pointers are not advanced; they are reloaded at label 1.
        .p2align 3
5:
        LDR     d0, [x13]
        LDP     d4, d5, [x5]
        LDR     d1, [x15]
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64              // advance w past this 8-byte K slice
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        // ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        // Store odd width: write 4, 2 and 1 columns according to bits 2, 1
        // and 0 of nc.  After each partial store v0 is rotated so the next
        // unwritten columns sit at byte 0 (row 0) and byte 8 (row 1).
        .p2align 3
6:
        TBZ     x1, 2, 7f
        ST1     {v0.s}[2], [x7], 4      // row 1, 4 columns
        STR     s0, [x6], 4             // row 0, 4 columns
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        ST1     {v0.h}[4], [x7], 2      // row 1, 2 columns
        STR     h0, [x6], 2             // row 0, 2 columns
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        ST1     {v0.b}[8], [x7]         // row 1, last column
        STR     b0, [x6]                // row 0, last column
9:
        // Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64        // also releases the 64-byte frame
        RET

END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif