// Auto-generated file. Do not edit!
//   Template: src/qu8-igemm/4x8c4-aarch64-neondot-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x0
#     size_t a_offset,           [sp + 8] -> x8
#     const int8_t* zero,        [sp + 16] -> x12
#     const union xnn_qu8_conv_minmax_params params)  [sp + 24] -> x11

// NOTE(review): the pointer element types above are written as int8_t, but the
// body uses unsigned instructions (UDOT, SQXTUN, UMAX, UMIN). Confirm the
// types against the C declaration of this kernel.
//
// Structure of the code below:
//   label 0  - nc loop: one pass per 8 output columns; accumulators v16-v23
//              are seeded from 32 bytes read at w.
//   label 1  - ks loop: one pass per group of 4 A pointers (32 bytes of a).
//   label 2  - main kc loop: 16 bytes of every A row and 128 bytes of w.
//   label 40 / 4 / 5 - kc remainder of 8 and/or 4 bytes per A row.
//   label 3  - end of one ks pass; after the last pass the accumulators are
//              requantized, clamped to bytes and stored.
//   label 6-9 - store of a final tile narrower than 8 columns.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x10  v3
# B    x5  v4  v5  v6
# C0   x6 v16 v20
# C1  x16 v17 v21
# C2  x17 v18 v22
# C3   x7 v19 v23
# zero_point  v7 v24 v25 v26 v27
# unused v8 v9 v10 v11 v12 v13 v14 v15 v28 v29 v30 v31

BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128

        # Clamp C pointers
        // Rows at or beyond mr alias the previous row pointer, so all four
        // row pointers are always valid store targets.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero pointer, params
                                        // if mr <= 2
        // LS reuses the flags of CMP x0, 2 above; nothing in between sets flags.
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // completes the round-up of kc to a multiple of 4

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        // One 32-bit word of params replicated to all lanes; x11 advances by 4.
        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     x9, x3                  // p = ks
        // v24-v27 accumulate the zero point term for rows 0-3; start at 0.
        MOVI    v24.16b, 0
        MOVI    v25.16b, 0
        MOVI    v26.16b, 0
        MOVI    v27.16b, 0
        // Every row starts from the same 8 bias values.
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        // a_offset is applied to every pointer except one equal to zero,
        // which is used unmodified.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 16 bytes for main loop?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    40f

        # Main loop - 16 bytes of A
        // Each pass also consumes 8 x 16 = 128 bytes of w through x5. The w
        // loads are interleaved with the UDOTs, so v4/v5/v6 are reused as soon
        // as their previous contents have been consumed.
        .p2align 3
2:
        LDR     q0, [x13], 16
        LDR     q4, [x5], 16
        LDR     q1, [x14], 16
        LDR     q2, [x15], 16
        LDR     q3, [x10], 16
        LDR     q5, [x5], 16
        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
        UDOT    v25.4s, v7.16b, v1.16b
        UDOT    v26.4s, v7.16b, v2.16b
        UDOT    v27.4s, v7.16b, v3.16b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDR     q6, [x5], 16
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        LDR     q4, [x5], 16
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v16.4s, v6.16b, v0.4b[1]
        UDOT    v17.4s, v6.16b, v1.4b[1]
        LDR     q5, [x5], 16
        UDOT    v18.4s, v6.16b, v2.4b[1]
        UDOT    v19.4s, v6.16b, v3.4b[1]
        UDOT    v20.4s, v4.16b, v0.4b[1]
        UDOT    v21.4s, v4.16b, v1.4b[1]
        LDR     q6, [x5], 16
        UDOT    v22.4s, v4.16b, v2.4b[1]
        UDOT    v23.4s, v4.16b, v3.4b[1]
        UDOT    v16.4s, v5.16b, v0.4b[2]
        UDOT    v17.4s, v5.16b, v1.4b[2]
        LDR     q4, [x5], 16
        UDOT    v18.4s, v5.16b, v2.4b[2]
        UDOT    v19.4s, v5.16b, v3.4b[2]
        UDOT    v20.4s, v6.16b, v0.4b[2]
        UDOT    v21.4s, v6.16b, v1.4b[2]
        LDR     q5, [x5], 16
        UDOT    v22.4s, v6.16b, v2.4b[2]
        UDOT    v23.4s, v6.16b, v3.4b[2]
        UDOT    v16.4s, v4.16b, v0.4b[3]
        UDOT    v17.4s, v4.16b, v1.4b[3]
        UDOT    v18.4s, v4.16b, v2.4b[3]
        UDOT    v19.4s, v4.16b, v3.4b[3]
        SUBS    x0, x0, 16
        UDOT    v20.4s, v5.16b, v0.4b[3]
        UDOT    v21.4s, v5.16b, v1.4b[3]
        UDOT    v22.4s, v5.16b, v2.4b[3]
        UDOT    v23.4s, v5.16b, v3.4b[3]
        B.HS    2b

        // x0 is now negative. Because kc was rounded up to a multiple of 4,
        // bits 3 and 2 of x0 give the number of bytes left (8 and/or 4).
        # Is there a remainder? - 8 bytes of A
        TBNZ    x0, 3, 4f
        # Is there a remainder? - 4 bytes of A
        TBNZ    x0, 2, 5f

3:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        // Horizontal reduction: two rounds of pairwise adds leave the sum of
        // all four lanes of v24-v27 in every lane of the same register.
        ADDP    v0.4s, v24.4s, v24.4s
        ADDP    v1.4s, v25.4s, v25.4s
        ADDP    v2.4s, v26.4s, v26.4s
        ADDP    v3.4s, v27.4s, v27.4s
        ADDP    v24.4s, v0.4s, v0.4s
        ADDP    v25.4s, v1.4s, v1.4s
        ADDP    v26.4s, v2.4s, v2.4s
        ADDP    v27.4s, v3.4s, v3.4s

        # Subtract zero point from accumulators
        // The reduced value of each row is removed from both accumulator
        // registers of that row.
        SUB     v16.4s, v16.4s, v24.4s
        SUB     v17.4s, v17.4s, v25.4s
        SUB     v18.4s, v18.4s, v26.4s
        SUB     v19.4s, v19.4s, v27.4s
        SUB     v20.4s, v20.4s, v24.4s
        SUB     v21.4s, v21.4s, v25.4s
        SUB     v22.4s, v22.4s, v26.4s
        SUB     v23.4s, v23.4s, v27.4s

        # Apply params - preshift, scale, postshift, bias and clamp
        // The params fields are read in order through x11 with sizes
        // 4, 4, 4, 2, 1 and 1 bytes; the loads are interleaved with the math.
        LD1R    {v4.4s}, [x11], 4
        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
        SSHL    v17.4s, v17.4s, v4.4s
        SSHL    v18.4s, v18.4s, v4.4s
        SSHL    v19.4s, v19.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SSHL    v20.4s, v20.4s, v4.4s
        SSHL    v21.4s, v21.4s, v4.4s
        SSHL    v22.4s, v22.4s, v4.4s
        SSHL    v23.4s, v23.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s

        // Narrow 32-bit to 16-bit with signed saturation; each row ends up in
        // a single register (v16-v19).
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        LDR     x0, [sp]                // Load cn_stride

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Narrow to unsigned bytes: v0 = row 0 (low half) and row 1 (high
        // half), v1 = row 2 (low half) and row 3 (high half).
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v18.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v17.8h
        SQXTUN2 v1.16b, v19.8h
        // 4 + 4 + 4 + 2 + 1 = 15 bytes of post-increment since the top of this
        // section; x11 again points just past kernel_zero_point.
        SUB     x11, x11, 15            // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        SUBS    x1, x1, 8               // nc -= 8; flags are used by B.LO and B.HI below
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        B.LO    6f

        # Store full 4 x 8
        // Every row pointer advances by cn_stride (x0).
        ST1     {v1.d}[1], [x7], x0
        ST1     {v1.8b}, [x17], x0
        ST1     {v0.d}[1], [x16], x0
        ST1     {v0.8b},  [x6], x0
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // Still the flags of SUBS x1, x1, 8: the stores and SUB leave them intact.
        B.HI    0b
        RET

        # Remainder - 4 to 12 bytes of A
        // Label 40 is the entry used when kc < 16; label 4 is entered directly
        // after the main loop when bit 3 of x0 is already known to be set.
        .p2align 3
40:     TBZ     x0, 3, 5f
4:
        // 8 bytes of every A row and 64 bytes of w. The d-register loads clear
        // the upper half of v0-v3, so the full-width zero point UDOTs below
        // only add into the low two lanes.
        LDR     d0, [x13], 8
        LDR     q4, [x5]
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        LDR     q5, [x5, 16]
        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
        UDOT    v25.4s, v7.16b, v1.16b
        UDOT    v26.4s, v7.16b, v2.16b
        UDOT    v27.4s, v7.16b, v3.16b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        LDR     q6, [x5, 32]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        LDR     q4, [x5, 48]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        UDOT    v16.4s, v6.16b, v0.4b[1]
        UDOT    v17.4s, v6.16b, v1.4b[1]
        UDOT    v18.4s, v6.16b, v2.4b[1]
        UDOT    v19.4s, v6.16b, v3.4b[1]
        ADD     x5, x5, 64
        UDOT    v20.4s, v4.16b, v0.4b[1]
        UDOT    v21.4s, v4.16b, v1.4b[1]
        UDOT    v22.4s, v4.16b, v2.4b[1]
        UDOT    v23.4s, v4.16b, v3.4b[1]
        TBZ     x0, 2, 3b               // no 4-byte tail: back to the ks loop
5:
        // 4 bytes of every A row and 32 bytes of w.
        LDR     s0, [x13], 4
        LDR     q4, [x5], 16
        LDR     s1, [x14], 4
        LDR     s2, [x15], 4
        LDR     s3, [x10], 4
        LDR     q5, [x5], 16
        UDOT    v24.4s, v7.16b, v0.16b  // update zero point
        UDOT    v25.4s, v7.16b, v1.16b
        UDOT    v26.4s, v7.16b, v2.16b
        UDOT    v27.4s, v7.16b, v3.16b
        UDOT    v16.4s, v4.16b, v0.4b[0]
        UDOT    v17.4s, v4.16b, v1.4b[0]
        UDOT    v18.4s, v4.16b, v2.4b[0]
        UDOT    v19.4s, v4.16b, v3.4b[0]
        UDOT    v20.4s, v5.16b, v0.4b[0]
        UDOT    v21.4s, v5.16b, v1.4b[0]
        UDOT    v22.4s, v5.16b, v2.4b[0]
        UDOT    v23.4s, v5.16b, v3.4b[0]
        B       3b

        # Store odd width
        // x1 holds nc - 8 here; its low three bits equal those of the
        // remaining column count. Columns are written in chunks of 4, 2 and 1
        // bytes, and after each chunk EXT rotates v0/v1 so the next unwritten
        // bytes of both rows in a register sit at byte 0 and byte 8.
        .p2align 3
6:
        TBZ     x1, 2, 7f
        ST1     {v1.s}[2], [x7], 4
        STR     s1, [x17], 4
        ST1     {v0.s}[2], [x16], 4
        STR     s0,  [x6], 4
        EXT     v0.16b, v0.16b, v0.16b, 4
        EXT     v1.16b, v1.16b, v1.16b, 4
7:
        TBZ     x1, 1, 8f
        ST1     {v1.h}[4], [x7], 2
        STR     h1, [x17], 2
        ST1     {v0.h}[4], [x16], 2
        STR     h0,  [x6], 2
        EXT     v0.16b, v0.16b, v0.16b, 2
        EXT     v1.16b, v1.16b, v1.16b, 2
8:
        TBZ     x1, 0, 9f
        ST1     {v1.b}[8], [x7]
        STR     b1, [x17]
        ST1     {v0.b}[8], [x16]
        STR     b0,  [x6]
9:
        RET

END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif