// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]

#include <xnnpack/assembly.h>

// REWIND_DECREMENT is the number of bytes the params pointer (x11) advances
// during requantization in one nc iteration, i.e. the sum of the post-increments
// of the LD1R loads that follow the kernel zero point:
//   RNDNU: 4 + 4 + 4 + 2 + 1 = 15,  FP32: 4 + 2 + 1 = 7.
// The final LD1R (clamp max) does not post-increment.
$REWIND_DECREMENT = {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
# void xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x0
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qu8_conv_minmax_params params) [sp + 24] -> x11
#
# NOTE(review): the prototype above spells the element type as int8_t, but the
# kernel body uses unsigned operations throughout (UDOT, SQXTUN, UMAX, UMIN).
# The C declaration presumably uses uint8_t - confirm against the header.
#
# Overview: each nc iteration produces up to 4 rows x 8 columns of output.
# Accumulators v16-v23 start from the 8 bias words read from w. For every group
# of 4 A pointers (ks is consumed 32 bytes at a time) the kernel accumulates
# UDOT products of A bytes against packed B over kc bytes (kc rounded up to a
# multiple of 4). In parallel v24-v27 accumulate UDOT(kernel_zero_point, A) per
# row, which is subtracted from the accumulators before requantization.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x10  v3
# B    x5  v4  v5  v6
# C0   x6 v16 v20
# C1  x16 v17 v21
# C2  x17 v18 v22
# C3   x7 v19 v23
# zero_point  v7 v24 v25 v26 v27
# unused v8 v9 v10 v11 v12 v13 v14 v15 v28 v29 v30 v31

BEGIN_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_ld128

        # Clamp C pointers
        # Rows at or beyond mr alias the previous row pointer. The flags from
        # CMP x0, 2 stay live until the LS select below; none of the
        # instructions in between set flags.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // completes the round-up of kc to a multiple of 4

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point, loaded once; x11 now points past it

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # q16 seeds columns 0-3 and q20 seeds columns 4-7 of all four rows.
        LDP     q16, q20, [x5], 32
        MOV     x9, x3                  // p = ks
        MOVI    v24.16b, 0              // clear per-row zero-point accumulators
        MOVI    v25.16b, 0
        MOVI    v26.16b, 0
        MOVI    v27.16b, 0
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        # A pointer equal to the zero pointer is used as is; any other pointer
        # gets a_offset added.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    40f

        # Main loop - 16 bytes of A
        # Per row: 4 groups of 4 A bytes, each multiplied against 2 x 16 bytes
        # of B (columns 0-3 then columns 4-7). B loads rotate through v4/v5/v6
        # and each reload is placed after the last use of the previous value.
        .p2align 3
2:
        LDR     q0, [x13], 16
        LDR     q4, [x5], 16
        LDR     q1, [x14], 16
        LDR     q2, [x15], 16
        LDR     q3, [x10], 16
        LDR     q5, [x5], 16
        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
        UDOT    v25.4s, v7.16b, v1.16b
        UDOT    v26.4s, v7.16b, v2.16b
        UDOT    v27.4s, v7.16b, v3.16b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDR     q6, [x5], 16
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        LDR     q4, [x5], 16
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v16.4s, v6.16b, v0.4b[1]
        UDOT    v17.4s, v6.16b, v1.4b[1]
        LDR     q5, [x5], 16
        UDOT    v18.4s, v6.16b, v2.4b[1]
        UDOT    v19.4s, v6.16b, v3.4b[1]
        UDOT    v20.4s, v4.16b, v0.4b[1]
        UDOT    v21.4s, v4.16b, v1.4b[1]
        LDR     q6, [x5], 16
        UDOT    v22.4s, v4.16b, v2.4b[1]
        UDOT    v23.4s, v4.16b, v3.4b[1]
        UDOT    v16.4s, v5.16b, v0.4b[2]
        UDOT    v17.4s, v5.16b, v1.4b[2]
        LDR     q4, [x5], 16
        UDOT    v18.4s, v5.16b, v2.4b[2]
        UDOT    v19.4s, v5.16b, v3.4b[2]
        UDOT    v20.4s, v6.16b, v0.4b[2]
        UDOT    v21.4s, v6.16b, v1.4b[2]
        LDR     q5, [x5], 16
        UDOT    v22.4s, v6.16b, v2.4b[2]
        UDOT    v23.4s, v6.16b, v3.4b[2]
        UDOT    v16.4s, v4.16b, v0.4b[3]
        UDOT    v17.4s, v4.16b, v1.4b[3]
        UDOT    v18.4s, v4.16b, v2.4b[3]
        UDOT    v19.4s, v4.16b, v3.4b[3]
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below
        UDOT    v20.4s, v5.16b, v0.4b[3]
        UDOT    v21.4s, v5.16b, v1.4b[3]
        UDOT    v22.4s, v5.16b, v2.4b[3]
        UDOT    v23.4s, v5.16b, v3.4b[3]
        B.HS    2b

        # x0 is now (bytes remaining - 16). kc is a multiple of 4, so bits 3 and 2
        # of x0 give the remainder: bit 3 selects 8 bytes, bit 2 selects 4 bytes.
        # Is there a remainder?- 8 bytes of A
        TBNZ    x0, 3, 4f
        # Is there a remainder?- 4 bytes of A
        TBNZ    x0, 2, 5f

3:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Reduce the four partial zero-point sums of each row to a single total.
        # Two pairwise adds leave the total replicated in every 32-bit lane.
        ADDP    v0.4s, v24.4s, v24.4s
        ADDP    v1.4s, v25.4s, v25.4s
        ADDP    v2.4s, v26.4s, v26.4s
        ADDP    v3.4s, v27.4s, v27.4s
        ADDP    v24.4s, v0.4s, v0.4s
        ADDP    v25.4s, v1.4s, v1.4s
        ADDP    v26.4s, v2.4s, v2.4s
        ADDP    v27.4s, v3.4s, v3.4s

        # Subtract zero point from accumulators
        SUB     v16.4s, v16.4s, v24.4s
        SUB     v17.4s, v17.4s, v25.4s
        SUB     v18.4s, v18.4s, v26.4s
        SUB     v19.4s, v19.4s, v27.4s
        SUB     v20.4s, v20.4s, v24.4s
        SUB     v21.4s, v21.4s, v25.4s
        SUB     v22.4s, v22.4s, v26.4s
        SUB     v23.4s, v23.4s, v27.4s

        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          # Three 32-bit params are read in order: v4 (pre-shift), v5
          # (multiplier), v6 (post-shift).
          LD1R    {v4.4s}, [x11], 4
          SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
          SSHL    v17.4s, v17.4s, v4.4s
          SSHL    v18.4s, v18.4s, v4.4s
          SSHL    v19.4s, v19.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4
          SSHL    v20.4s, v20.4s, v4.4s
          SSHL    v21.4s, v21.4s, v4.4s
          SSHL    v22.4s, v22.4s, v4.4s
          SSHL    v23.4s, v23.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          # Apply params - scale, bias and clamp
          # Convert to float, multiply by the 32-bit scale param (v4), then
          # convert back to int32 rounding to nearest (FCVTNS).
          SCVTF   v16.4s, v16.4s
          SCVTF   v17.4s, v17.4s
          LD1R    {v4.4s}, [x11], 4
          SCVTF   v18.4s, v18.4s
          SCVTF   v19.4s, v19.4s
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s

          FMUL    v16.4s, v16.4s, v4.4s
          FMUL    v17.4s, v17.4s, v4.4s
          FMUL    v18.4s, v18.4s, v4.4s
          FMUL    v19.4s, v19.4s, v4.4s
          FMUL    v20.4s, v20.4s, v4.4s
          FMUL    v21.4s, v21.4s, v4.4s
          FMUL    v22.4s, v22.4s, v4.4s
          FMUL    v23.4s, v23.4s, v4.4s

          FCVTNS  v16.4s, v16.4s
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s

        # Common tail for both requantization variants: narrow each row to
        # 8 x int16 with saturation (columns 0-3 low half, columns 4-7 high half).
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        LDR     x0, [sp]                // Load cn_stride (x0 is free: the k counter is done)

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Narrow to unsigned 8-bit with saturation.
        // v0 = row 0 (low 8 bytes) and row 1 (high 8 bytes); v1 = rows 2 and 3.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v18.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v17.8h
        SQXTUN2 v1.16b, v19.8h
        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO here and B.HI after the stores
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        B.LO    6f

        # Store full 4 x 8
        # Each store post-increments its C pointer by cn_stride (x0).
        ST1     {v1.d}[1],  [x7], x0
        ST1     {v1.8b}, [x17], x0
        ST1     {v0.d}[1], [x16], x0
        ST1     {v0.8b},  [x6], x0
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # The stores and the SUB above do not modify the flags set by SUBS x1.
        B.HI    0b
        RET

        # Remainder - 4 to 12 bytes of A
        # Label 40 is the entry when kc < 16 (main loop skipped); label 4 is the
        # entry after the main loop when 8 or 12 bytes remain.
        .p2align 3
40:     TBZ     x0, 3, 5f
4:
        // 8 bytes of A per row. LDR d zero-fills the upper half of the vector
        // register, so the full-width zero-point UDOT adds nothing for the
        // upper lanes.
        LDR     d0, [x13], 8
        LDR     q4, [x5]
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDR     q5, [x5, 16]
        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
        UDOT    v25.4s, v7.16b, v1.16b
        UDOT    v26.4s, v7.16b, v2.16b
        UDOT    v27.4s, v7.16b, v3.16b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDR     q6, [x5, 32]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        LDR     q4, [x5, 48]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v16.4s, v6.16b, v0.4b[1]
        UDOT    v17.4s, v6.16b, v1.4b[1]
        UDOT    v18.4s, v6.16b, v2.4b[1]
        UDOT    v19.4s, v6.16b, v3.4b[1]
        ADD     x5, x5, 64              // consumed 4 x 16 bytes of B
        UDOT    v20.4s, v4.16b, v0.4b[1]
        UDOT    v21.4s, v4.16b, v1.4b[1]
        UDOT    v22.4s, v4.16b, v2.4b[1]
        UDOT    v23.4s, v4.16b, v3.4b[1]
        TBZ     x0, 2, 3b               // no further 4-byte remainder: back to ks loop
5:
        // 4 bytes of A per row; LDR s zero-fills the rest of the register.
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16
        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
        UDOT    v25.4s, v7.16b, v1.16b
        UDOT    v26.4s, v7.16b, v2.16b
        UDOT    v27.4s, v7.16b, v3.16b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        B       3b

        # Store odd width
        # Fewer than 8 columns remain. Bits 2, 1, 0 of x1 select 4-, 2- and 1-byte
        # stores. After each partial store EXT rotates v0/v1 so the next columns
        # sit at byte 0 (rows 0 and 2) and byte 8 (rows 1 and 3).
        .p2align 3
6:
        TBZ     x1, 2, 7f
        ST1     {v1.s}[2],  [x7], 4
        STR     s1, [x17], 4
        ST1     {v0.s}[2], [x16], 4
        STR     s0,  [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4
        EXT     v1.16b, v1.16b, v1.16b, 4
7:
        TBZ     x1, 1, 8f
        ST1     {v1.h}[4],  [x7], 2
        STR     h1, [x17], 2
        ST1     {v0.h}[4], [x16], 2
        STR     h0,  [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
        EXT     v1.16b, v1.16b, v1.16b, 2
8:
        TBZ     x1, 0, 9f
        ST1     {v1.b}[8],  [x7]
        STR     b1, [x17]
        ST1     {v0.b}[8], [x16]
        STR     b0,  [x6]
9:
        RET

END_FUNCTION xnn_qu8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif