// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// Signed 8-bit indirect GEMM micro-kernel for a tile of up to 4 rows by
// 16 columns. Products are accumulated with SDOT (4 bytes of K per 32-bit
// lane), then requantized through single precision: convert to float,
// multiply by the scale, round back to integer, narrow with saturation, add
// the 16-bit value from params, narrow to 8 bits and clamp to [min, max].

# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x10  v3
# B    x5  v4  v5  v6  v7
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64

        # Clamp C pointers
        // Rows beyond mr alias the previous row, so stores to them are
        // harmless duplicates. Argument loads and the kc round-up are
        // interleaved with the pointer arithmetic.
        CMP     x0, 2                   // flags: mr versus 2
        LDR     x8, [sp, 8]             // x8 = a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        // mr < 2: c1 = c0
        ADD     x2, x2, 3               // first half of kc = (kc + 3) & ~3

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // x12 = zero, x11 = params
                                        // flags still hold mr versus 2
        CSEL    x17, x16, x17, LS       // mr <= 2: c2 = c1
        BIC     x2, x2, 3               // second half: clear low two bits

        CMP     x0, 4                   // flags: mr versus 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         // mr < 4: c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 64 bytes (sixteen 32-bit lanes) are read from w and replicated
        // into the accumulators of all four rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks (bytes of A pointers left)

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        // A pointer equal to the zero buffer is used unchanged; every other
        // pointer is advanced by a_offset. ADD leaves the flags from CMP
        // intact for the CSEL that follows.
        CMP     x13, x12                // a0 == zero ?
        ADD     x13, x13, x8            // candidate a0 + a_offset
        CSEL    x13, x12, x13, EQ       // a0 = zero, otherwise a0 + a_offset
        CMP     x14, x12                // a1 == zero ?
        ADD     x14, x14, x8            // candidate a1 + a_offset
        CSEL    x14, x12, x14, EQ       // a1 = zero, otherwise a1 + a_offset
        CMP     x15, x12                // a2 == zero ?
        ADD     x15, x15, x8            // candidate a2 + a_offset
        CSEL    x15, x12, x15, EQ       // a2 = zero, otherwise a2 + a_offset
        CMP     x10, x12                // a3 == zero ?
        ADD     x10, x10, x8            // candidate a3 + a_offset
        CSEL    x10, x12, x10, EQ       // a3 = zero, otherwise a3 + a_offset

        # Are there at least 8 bytes for the main loop?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    4f                      // fewer than 8: remainder only

        # Main loop - 8 bytes of A
        // Each pass reads 8 bytes from every A row and 128 bytes of weights.
        // Lane [0] of v0-v3 holds the first 4 bytes of K, lane [1] the next 4.
        .p2align 3
2:
        LDR     d0, [x13], 8
        LDR     q4, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SUBS    x0, x0, 8               // k -= 8
        SDOT    v31.4s, v7.16b, v3.4b[1]
        B.HS    2b                      // repeat while no borrow occurred

        # Is there a remainder?- 4 bytes of A
        // kc is a multiple of 4, so x0 is -8 (nothing left) or -4 (4 bytes
        // left) at this point; bit 2 distinguishes the two values.
        TBNZ    x0, 2, 4f

        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

3:
        // Requantization. Loads from params are interleaved with the
        // arithmetic; params is consumed as 4 + 2 + 1 + 1 bytes.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Apply params - scale, bias and clamp
        LD1R    {v4.4s}, [x11], 4       // broadcast 32-bit scale
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v4.4s
        FMUL    v21.4s, v21.4s, v4.4s
        FMUL    v22.4s, v22.4s, v4.4s
        FMUL    v23.4s, v23.4s, v4.4s
        FMUL    v24.4s, v24.4s, v4.4s
        FMUL    v25.4s, v25.4s, v4.4s
        FMUL    v26.4s, v26.4s, v4.4s
        FMUL    v27.4s, v27.4s, v4.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Narrow 32 -> 16 bits with saturation. v16-v19 end up holding
        // columns 0-7 of rows 0-3, v24-v27 columns 8-15.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // broadcast 16-bit bias to add

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // broadcast clamp minimum

        // Narrow 16 -> 8 bits with saturation; v0-v3 hold rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // broadcast clamp maximum
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        LDR     x0, [sp]                // x0 = cn_stride
        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SUB     x11, x11, 7             // params back to its start (4 + 2 + 1)
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags reused further down
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    5f                      // fewer than 16 columns were left

        # Store full 4 x 16
        ST1     {v3.16b}, [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b}, [x6], x0

        SUB     x4, x4, x3              // a -= ks (rewind the pointer array)

        # nc loop
        // ST1 and SUB do not write flags, so this still tests nc -= 16.
        B.HI    0b
        RET

        # Remainder- 4 bytes of A
        // Reads 4 bytes from every A row and 64 bytes of weights.
        .p2align 3
4:
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]

        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b
        B       3b

        # Store odd width
        // x1 holds nc - 16 here; its low four bits match those of nc, so
        // bits 3..0 select stores of 8, 4, 2 and 1 columns. After each
        // partial store the next elements are moved down to lane 0.
        .p2align 3
5:
        TBZ     x1, 3, 6f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
6:
        TBZ     x1, 2, 7f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
7:
        TBZ     x1, 1, 8f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
8:
        TBZ     x1, 0, 9f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
9:
        RET

END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif