// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/2x8c8-aarch64-neon-mlal.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x10
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0 x13 v0 v6
# A1 x15 v1 v7
# B  x5 v4 v5 v8 v9
# C0 x6 v16 v18 v20 v22 v24 v26 v28 v30
# C1 x7 v17 v19 v21 v23 v25 v27 v29 v31
# temp0 v2 v10 v12 v14
# temp1 v3 v11 v13 v15

BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal

        # Clamp C pointers.
        # Callee-saved d8-d15 are spilled here because the kernel uses v8/v9
        # (and v10-v15 as temporaries); the STPs are interleaved with scalar
        # setup so both can issue in the same cycle.
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        CMP     x0, 2                   // if mr < 2
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        ADD     x7, x6, x7              // c1 = c0 + cm_stride
        STP     d8, d9, [sp, -64]!      // allocate 64 bytes, save d8-d15
        ADD     x2, x2, 7               // kc = (kc + 7) & ~7
        STP     d10, d11, [sp, 16]
        CSEL    x7, x6, x7, LO          //   c1 = c0 when mr < 2 (row 1 writes alias row 0)
        STP     d12, d13, [sp, 32]
        BIC     x2, x2, 7               // second half of kc round-up to a multiple of 8
        STP     d14, d15, [sp, 48]

        .p2align 3
0:
        # Outer (nc) loop head.
        # Load initial bias from w into accumulators.
        # Each LDP loads two consecutive int32 biases (8 bytes); the row-1
        # accumulators (odd v-regs) start from the same bias as row 0.
        LDP     s16, s18, [x5], 8
        MOV     v17.16b, v16.16b
        MOV     v19.16b, v18.16b
        LDP     s20, s22, [x5], 8
        MOV     v21.16b, v20.16b
        MOV     v23.16b, v22.16b
        LDP     s24, s26, [x5], 8
        MOV     v25.16b, v24.16b
        MOV     v27.16b, v26.16b
        LDP     s28, s30, [x5], 8
        MOV     v29.16b, v28.16b
        MOV     v31.16b, v30.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # ks loop head: load next 2 A pointers from the indirection buffer.
        # A pointer equal to `zero` selects the zero buffer and is NOT offset.
        LDP     x13, x15, [x4], 16
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x8            // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 += a_offset

        # Is there at least 16 bytes for epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f                      // kc == 8: single 8-byte remainder block

        # Prologue: load A0, A1 and 2 B's ahead of the software-pipelined loop.
        LDP     d4, d5, [x5]
        LDP     d0, d6, [x13], 16
        LDP     d1, d7, [x15], 16
        LDP     d8, d9, [x5, 64]

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x0, 16              // k = k - 16
        B.LO    3f

        # Main loop - 16 bytes of A per iteration (2 groups of 8, MLAL-fused).
        # SMULL/SMLAL produce 16-bit partial products; SADALP pairwise-
        # accumulates them into the 32-bit accumulators. B loads are
        # interleaved with arithmetic to hide load latency.
        .p2align 3
2:
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 80]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 96]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 112]
        SMULL   v12.8h, v4.8b, v0.8b
        ADD     x5, x5, 128             // advance w by 16 k-bytes x 8 columns
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        LDP     d4, d5, [x5]            // Read B (for next iteration)
        SMLAL   v13.8h, v8.8b, v7.8b
        SUBS    x0, x0, 16
        SMLAL   v14.8h, v9.8b, v6.8b
        LDP     d0, d6, [x13], 16       // Read A0 (for next iteration)
        SMLAL   v15.8h, v9.8b, v7.8b

        SADALP  v28.4s, v12.8h
        LDP     d1, d7, [x15], 16       // Read A1 (for next iteration)
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        LDP     d8, d9, [x5, 64]        // Read B (for next iteration)
        SADALP  v31.4s, v15.8h
        B.HS    2b

        # Epilogue - last 16 bytes of A.
        # Same as main loop except no loads at end of loop (nothing to pipeline
        # into).
        .p2align 3
3:
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        LDP     d4, d5, [x5, 16]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 80]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 96]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d4, d5, [x5, 48]
        SMLAL   v2.8h, v8.8b, v6.8b
        SMLAL   v3.8h, v8.8b, v7.8b
        SMLAL   v10.8h, v9.8b, v6.8b
        SMLAL   v11.8h, v9.8b, v7.8b

        LDP     d8, d9, [x5, 112]
        SMULL   v12.8h, v4.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v4.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v5.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v5.8b, v1.8b
        SADALP  v27.4s, v11.8h
        SMLAL   v12.8h, v8.8b, v6.8b
        SMLAL   v13.8h, v8.8b, v7.8b
        SMLAL   v14.8h, v9.8b, v6.8b
        SMLAL   v15.8h, v9.8b, v7.8b
        ADD     x5, x5, 128

        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # Is there a remainder? - 8 bytes of A
        # (bit 3 of k is set when kc % 16 == 8, since kc was rounded to 8)
        TBNZ    x0, 3, 5f

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

4:
        # Reduce: each accumulator holds 4 horizontal partial sums per output.
        # Three rounds of pairwise ADDP collapse them so that
        # v0/v1 = row 0 (8 int32), v2/v3 = row 1 (8 int32).
        ADDP    v16.4s, v16.4s, v18.4s
        ADDP    v20.4s, v20.4s, v22.4s
        ADDP    v24.4s, v24.4s, v26.4s
        ADDP    v28.4s, v28.4s, v30.4s
        ADDP    v17.4s, v17.4s, v19.4s
        ADDP    v21.4s, v21.4s, v23.4s
        ADDP    v25.4s, v25.4s, v27.4s
        ADDP    v29.4s, v29.4s, v31.4s
        ADDP    v0.4s, v16.4s, v20.4s
        ADDP    v1.4s, v24.4s, v28.4s
        ADDP    v2.4s, v17.4s, v21.4s
        ADDP    v3.4s, v25.4s, v29.4s

        # Apply params - scale, bias and clamp
        # (fp32 requantization: int32 -> float, multiply by scale, round back
        # to int32 with FCVTNS, narrow with saturation).
        SCVTF   v0.4s, v0.4s
        LD1R    {v4.4s}, [x11], 4       // fp32 scale (first params field)
        SCVTF   v1.4s, v1.4s
        SCVTF   v2.4s, v2.4s
        SCVTF   v3.4s, v3.4s
        FMUL    v0.4s, v0.4s, v4.4s
        FMUL    v1.4s, v1.4s, v4.4s
        FMUL    v2.4s, v2.4s, v4.4s
        FMUL    v3.4s, v3.4s, v4.4s

        FCVTNS  v0.4s, v0.4s            // round-to-nearest-even, float -> int32
        FCVTNS  v1.4s, v1.4s
        FCVTNS  v2.4s, v2.4s
        FCVTNS  v3.4s, v3.4s

        LD1R    {v5.8h}, [x11], 2       // int16 bias added below — presumably the output zero point; confirm against xnn_qs8_conv_minmax_params layout
        SQXTN   v0.4h, v0.4s
        SQXTN   v2.4h, v2.4s
        SQXTN2  v0.8h, v1.4s
        SQXTN2  v2.8h, v3.4s
        SUBS    x1, x1, 8               // nc -= 8; sets LO when partial tile
        SQADD   v0.8h, v0.8h, v5.8h
        SQADD   v1.8h, v2.8h, v5.8h
        SQXTN   v0.8b, v0.8h            // row 0 -> low 8 bytes of v0
        SQXTN2  v0.16b, v1.8h           // row 1 -> high 8 bytes of v0
        LD1R    {v1.16b}, [x11], 1      // int8 lower clamp (used by SMAX)
        LD1R    {v2.16b}, [x11]         // int8 upper clamp (used by SMIN)
        SMAX    v0.16b, v0.16b, v1.16b
        SUB     x11, x11, 7             // rewind params pointer for next nc iteration
        SMIN    v0.16b, v0.16b, v2.16b
        B.LO    6f                      // nc < 8: store odd width

        # Store full 2 x 8
        ST1     {v0.d}[1], [x7], x10    // row 1 (high half of v0)
        ST1     {v0.8b}, [x6], x10      // row 0 (low half of v0)

        SUB     x4, x4, x3              // a -= ks (rewind indirection buffer)

        # nc loop
        B.HI    0b

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Remainder - 8 bytes of A (last k-group when kc % 16 == 8).
        # No MLAL fusion here: plain SMULL per 8-byte B column pair, folded
        # into the accumulators with SADALP.
        .p2align 3
5:
        LDR     d0, [x13]
        LDP     d4, d5, [x5]
        LDR     d1, [x15]
        LDP     d6, d7, [x5, 16]
        SMULL   v2.8h, v4.8b, v0.8b
        SMULL   v3.8h, v4.8b, v1.8b
        SMULL   v10.8h, v5.8b, v0.8b
        SMULL   v11.8h, v5.8b, v1.8b
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v16.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v17.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v18.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v19.4s, v11.8h
        LDP     d4, d5, [x5, 32]
        SMULL   v2.8h, v4.8b, v0.8b
        SADALP  v20.4s, v12.8h
        SMULL   v3.8h, v4.8b, v1.8b
        SADALP  v21.4s, v13.8h
        SMULL   v10.8h, v5.8b, v0.8b
        SADALP  v22.4s, v14.8h
        SMULL   v11.8h, v5.8b, v1.8b
        SADALP  v23.4s, v15.8h
        LDP     d6, d7, [x5, 48]
        SMULL   v12.8h, v6.8b, v0.8b
        SADALP  v24.4s, v2.8h
        SMULL   v13.8h, v6.8b, v1.8b
        SADALP  v25.4s, v3.8h
        SMULL   v14.8h, v7.8b, v0.8b
        SADALP  v26.4s, v10.8h
        SMULL   v15.8h, v7.8b, v1.8b
        SADALP  v27.4s, v11.8h
        ADD     x5, x5, 64              // advance w by 8 k-bytes x 8 columns
        SADALP  v28.4s, v12.8h
        SADALP  v29.4s, v13.8h
        SADALP  v30.4s, v14.8h
        SADALP  v31.4s, v15.8h

        # ks loop
        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       4b

        # Store odd width (nc in 1..7): emit 4, then 2, then 1 byte per row,
        # rotating v0 with EXT so the next narrower store sees fresh lanes.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        ST1     {v0.s}[2], [x7], 4      // row 1: 4 bytes
        STR     s0, [x6], 4             // row 0: 4 bytes
        EXT     v0.16b, v0.16b, v0.16b, 4

7:
        TBZ     x1, 1, 8f
        ST1     {v0.h}[4], [x7], 2      // row 1: 2 bytes
        STR     h0, [x6], 2             // row 0: 2 bytes
        EXT     v0.16b, v0.16b, v0.16b, 2
8:
        TBZ     x1, 0, 9f
        ST1     {v0.b}[8], [x7]         // row 1: 1 byte
        STR     b0, [x6]                // row 0: 1 byte
9:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif