// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x0)
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params  [sp + 24] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x10  v3
# B    x5  v4  v5  v6  v7
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64

        # Clamp C pointers
        CMP x0, 2                // if mr < 2
        LDR x8, [sp, 8]          // Load a_offset
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0
        ADD x2, x2, 3            // kc = (kc + 3) & ~3

        ADD x17, x16, x7         // c2 = c1 + cm_stride
        LDP x12, x11, [sp, 16]   // Load zero, params pointer
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1
        BIC x2, x2, 3

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP q16, q20, [x5], 32
        MOV v17.16b, v16.16b
        MOV v18.16b, v16.16b
        LDP q24, q28, [x5], 32
        MOV v19.16b, v16.16b
        MOV v21.16b, v20.16b
        MOV v22.16b, v20.16b
        MOV v23.16b, v20.16b
        MOV v25.16b, v24.16b
        MOV v26.16b, v24.16b
        MOV v27.16b, v24.16b
        MOV v29.16b, v28.16b
        MOV v30.16b, v28.16b
        MOV v31.16b, v28.16b
        MOV x9, x3               // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP x13, x14, [x4], 16
        LDP x15, x10, [x4], 16

        CMP x13, x12             // if a0 == zero
        ADD x13, x13, x8         // a0 += a_offset
        CSEL x13, x12, x13, EQ   //   a0 = zero, else a0 += a_offset
        CMP x14, x12             // if a1 == zero
        ADD x14, x14, x8         // a1 += a_offset
        CSEL x14, x12, x14, EQ   //   a1 = zero, else a1 += a_offset
        CMP x15, x12             // if a2 == zero
        ADD x15, x15, x8         // a2 += a_offset
        CSEL x15, x12, x15, EQ   //   a2 = zero, else a2 += a_offset
        CMP x10, x12             // if a3 == zero
        ADD x10, x10, x8         // a3 += a_offset
        CSEL x10, x12, x10, EQ   //   a3 = zero, else a3 += a_offset
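
        # Overview of the compute phase below (descriptive comment): each pass
        # of the main loop at label 2 consumes 8 bytes (two groups of 4 int8)
        # from each of the four A rows and 128 bytes of packed weights, using
        # SDOT by 4-byte lanes to accumulate into the 16 int32 accumulators of
        # each row; a 4-byte remainder of A is handled at label 4.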

        # Is there at least 8 bytes for main loop?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 4f

        # Main loop - 8 bytes of A
        .p2align 3
2:
        LDR d0, [x13], 8
        LDR q4, [x5], 16
        LDR d1, [x14], 8
        LDR d2, [x15], 8
        LDR d3, [x10], 8
        LDR q5, [x5], 16
        SDOT v16.4s, v4.16b, v0.4b[0]
        SDOT v17.4s, v4.16b, v1.4b[0]
        LDP q6, q7, [x5], 32
        SDOT v18.4s, v4.16b, v2.4b[0]
        SDOT v19.4s, v4.16b, v3.4b[0]
        SDOT v20.4s, v5.16b, v0.4b[0]
        SDOT v21.4s, v5.16b, v1.4b[0]
        SDOT v22.4s, v5.16b, v2.4b[0]
        SDOT v23.4s, v5.16b, v3.4b[0]
        SDOT v24.4s, v6.16b, v0.4b[0]
        SDOT v25.4s, v6.16b, v1.4b[0]
        LDP q4, q5, [x5], 32
        SDOT v26.4s, v6.16b, v2.4b[0]
        SDOT v27.4s, v6.16b, v3.4b[0]
        SDOT v28.4s, v7.16b, v0.4b[0]
        SDOT v29.4s, v7.16b, v1.4b[0]
        SDOT v30.4s, v7.16b, v2.4b[0]
        SDOT v31.4s, v7.16b, v3.4b[0]
        SDOT v16.4s, v4.16b, v0.4b[1]
        SDOT v17.4s, v4.16b, v1.4b[1]
        LDP q6, q7, [x5], 32
        SDOT v18.4s, v4.16b, v2.4b[1]
        SDOT v19.4s, v4.16b, v3.4b[1]
        SDOT v20.4s, v5.16b, v0.4b[1]
        SDOT v21.4s, v5.16b, v1.4b[1]
        SDOT v22.4s, v5.16b, v2.4b[1]
        SDOT v23.4s, v5.16b, v3.4b[1]
        SDOT v24.4s, v6.16b, v0.4b[1]
        SDOT v25.4s, v6.16b, v1.4b[1]
        SDOT v26.4s, v6.16b, v2.4b[1]
        SDOT v27.4s, v6.16b, v3.4b[1]
        SDOT v28.4s, v7.16b, v0.4b[1]
        SDOT v29.4s, v7.16b, v1.4b[1]
        SDOT v30.4s, v7.16b, v2.4b[1]
        SUBS x0, x0, 8
        SDOT v31.4s, v7.16b, v3.4b[1]
        B.HS 2b

        # Is there a remainder? - 4 bytes of A
        TBNZ x0, 2, 4f

        # ks loop
        SUBS x9, x9, 32          // ks -= MR * sizeof(int8_t*)
        B.HI 1b

3:
        # Apply params - preshift, scale, postshift, bias and clamp
        LD1R {v4.4s}, [x11], 4
        SQSHL v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL v17.4s, v17.4s, v4.4s
        SQSHL v18.4s, v18.4s, v4.4s
        SQSHL v19.4s, v19.4s, v4.4s
        SQSHL v20.4s, v20.4s, v4.4s
        SQSHL v21.4s, v21.4s, v4.4s
        SQSHL v22.4s, v22.4s, v4.4s
        SQSHL v23.4s, v23.4s, v4.4s
        LD1R {v5.4s}, [x11], 4
        SQSHL v24.4s, v24.4s, v4.4s
        SQSHL v25.4s, v25.4s, v4.4s
        SQSHL v26.4s, v26.4s, v4.4s
        SQSHL v27.4s, v27.4s, v4.4s
        SQSHL v28.4s, v28.4s, v4.4s
        SQSHL v29.4s, v29.4s, v4.4s
        SQSHL v30.4s, v30.4s, v4.4s
        SQSHL v31.4s, v31.4s, v4.4s
        LD1R {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL v17.4s, v17.4s, v6.4s
        SRSHL v18.4s, v18.4s, v6.4s
        SRSHL v19.4s, v19.4s, v6.4s
        SRSHL v20.4s, v20.4s, v6.4s
        SRSHL v21.4s, v21.4s, v6.4s
        SRSHL v22.4s, v22.4s, v6.4s
        SRSHL v23.4s, v23.4s, v6.4s
        SRSHL v24.4s, v24.4s, v6.4s
        SRSHL v25.4s, v25.4s, v6.4s
        SRSHL v26.4s, v26.4s, v6.4s
        SRSHL v27.4s, v27.4s, v6.4s
        SRSHL v28.4s, v28.4s, v6.4s
        SRSHL v29.4s, v29.4s, v6.4s
        SRSHL v30.4s, v30.4s, v6.4s
        SRSHL v31.4s, v31.4s, v6.4s
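
        # Informal sketch of the rndnu requantization performed above and
        # completed below (parameter names are descriptive; the saturating
        # NEON instructions are the authoritative definition):
        #   acc = sqdmulh(sqshl(acc, pre_shift), scale)
        #   acc = srshl(acc, post_shift)   // post_shift is expected to be
        #                                  // <= 0, i.e. a rounding right shift
        #   out = clamp(sat_s8(sat_s16(acc) + bias), min, max)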

        SQXTN v16.4h, v16.4s
        SQXTN v17.4h, v17.4s
        SQXTN v18.4h, v18.4s
        SQXTN v19.4h, v19.4s
        SQXTN v24.4h, v24.4s
        SQXTN v25.4h, v25.4s
        SQXTN v26.4h, v26.4s
        SQXTN v27.4h, v27.4s
        LD1R {v6.8h}, [x11], 2   // add bias

        SQXTN2 v16.8h, v20.4s
        SQXTN2 v17.8h, v21.4s
        SQXTN2 v18.8h, v22.4s
        SQXTN2 v19.8h, v23.4s
        SQXTN2 v24.8h, v28.4s
        SQXTN2 v25.8h, v29.4s
        SQXTN2 v26.8h, v30.4s
        SQXTN2 v27.8h, v31.4s

        SQADD v16.8h, v16.8h, v6.8h
        SQADD v17.8h, v17.8h, v6.8h
        SQADD v18.8h, v18.8h, v6.8h
        SQADD v19.8h, v19.8h, v6.8h
        SQADD v24.8h, v24.8h, v6.8h
        SQADD v25.8h, v25.8h, v6.8h
        SQADD v26.8h, v26.8h, v6.8h
        SQADD v27.8h, v27.8h, v6.8h
        LD1R {v4.16b}, [x11], 1  // clamp min value

        SQXTN v0.8b, v16.8h
        SQXTN v1.8b, v17.8h
        SQXTN v2.8b, v18.8h
        SQXTN v3.8b, v19.8h
        LD1R {v5.16b}, [x11]     // clamp max value
        SQXTN2 v0.16b, v24.8h
        SQXTN2 v1.16b, v25.8h
        SQXTN2 v2.16b, v26.8h
        SQXTN2 v3.16b, v27.8h
        LDR x0, [sp]             // cn_stride
        SMAX v0.16b, v0.16b, v4.16b
        SMAX v1.16b, v1.16b, v4.16b
        SUB x11, x11, 15         // rewind params pointer
        SMAX v2.16b, v2.16b, v4.16b
        SMAX v3.16b, v3.16b, v4.16b
        SUBS x1, x1, 16
        SMIN v0.16b, v0.16b, v5.16b
        SMIN v1.16b, v1.16b, v5.16b
        SMIN v2.16b, v2.16b, v5.16b
        SMIN v3.16b, v3.16b, v5.16b
        B.LO 5f

        # Store full 4 x 16
        ST1 {v3.16b}, [x7], x0
        ST1 {v2.16b}, [x17], x0
        ST1 {v1.16b}, [x16], x0
        ST1 {v0.16b}, [x6], x0

        SUB x4, x4, x3           // a -= ks

        # nc loop
        B.HI 0b
        RET

        # Remainder - 4 bytes of A
        .p2align 3
4:
        LDR s0, [x13], 4
        LDR q4, [x5], 16
        LDR s1, [x14], 4
        LDR s2, [x15], 4
        LDR s3, [x10], 4
        LDR q5, [x5], 16
        SDOT v16.4s, v4.16b, v0.4b[0]
        SDOT v17.4s, v4.16b, v1.4b[0]
        LDP q6, q7, [x5], 32
        SDOT v18.4s, v4.16b, v2.4b[0]
        SDOT v19.4s, v4.16b, v3.4b[0]
        SDOT v20.4s, v5.16b, v0.4b[0]
        SDOT v21.4s, v5.16b, v1.4b[0]
        SDOT v22.4s, v5.16b, v2.4b[0]
        SDOT v23.4s, v5.16b, v3.4b[0]
        SDOT v24.4s, v6.16b, v0.4b[0]
        SDOT v25.4s, v6.16b, v1.4b[0]
        SDOT v26.4s, v6.16b, v2.4b[0]
        SDOT v27.4s, v6.16b, v3.4b[0]
        SDOT v28.4s, v7.16b, v0.4b[0]
        SDOT v29.4s, v7.16b, v1.4b[0]
        SDOT v30.4s, v7.16b, v2.4b[0]
        SDOT v31.4s, v7.16b, v3.4b[0]

        # ks loop
        SUBS x9, x9, 32          // ks -= MR * sizeof(int8_t*)
        B.HI 1b
        B 3b

        # Store odd width
        .p2align 3
5:
        TBZ x1, 3, 6f
        STR d3, [x7], 8
        STR d2, [x17], 8
        DUP d3, v3.d[1]
        DUP d2, v2.d[1]
        STR d1, [x16], 8
        STR d0, [x6], 8
        DUP d1, v1.d[1]
        DUP d0, v0.d[1]
6:
        TBZ x1, 2, 7f
        STR s3, [x7], 4
        STR s2, [x17], 4
        DUP s3, v3.s[1]
        DUP s2, v2.s[1]
        STR s1, [x16], 4
        STR s0, [x6], 4
        DUP s1, v1.s[1]
        DUP s0, v0.s[1]
7:
        TBZ x1, 1, 8f
        STR h3, [x7], 2
        STR h2, [x17], 2
        DUP h3, v3.h[1]
        DUP h2, v2.h[1]
        STR h1, [x16], 2
        STR h0, [x6], 2
        DUP h1, v1.h[1]
        DUP h0, v0.h[1]
8:
        TBZ x1, 0, 9f
        STR b3, [x7]
        STR b2, [x17]
        STR b1, [x16]
        STR b0, [x6]
9:
        RET

END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif