// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// Summary (derived from the code below):
//   Indirect GEMM micro-kernel producing a tile of up to 4 rows x 16 columns
//   of uint8 output per pass of the nc loop.
//   - The 16 int32 accumulators of every row are seeded with the 64 bytes of
//     bias read from w.
//   - For every group of 4 row pointers read from a (32 bytes of pointers per
//     ks step), kc bytes of each row are multiplied with the weights that
//     follow in w.  Weights are consumed 16 bytes per k (8 for columns 0-7,
//     then 8 for columns 8-15) and have kernel_zero_point subtracted while
//     being widened to 16 bit.
//   - A row pointer equal to zero is used as is; any other pointer has
//     a_offset added.
//   - The int32 sums are requantized with the values read from params,
//     narrowed with saturation, clamped to [output_min, output_max] and
//     stored as uint8.

# void xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const uint8_t** restrict a, x4
#     const uint8_t* restrict w,  x5
#     uint8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const uint8_t* zero,               [sp + 16] -> x12
#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
// NOTE(review): the params type is spelled qs8 although this is the qu8
// kernel; the layout actually read by the code is the struct shown below.
// Presumably inherited from the shared template - confirm.
// The stack offsets above are relative to sp on entry.  After x20-x21 are
// pushed (16 bytes) the params slot is found at [sp + 40].

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point[4];
#    int32_t right_pre_shift;
#    int32_t multiplier;
#    int32_t right_post_shift;
#    int16_t output_zero_point;
#    uint8_t output_min;
#    uint8_t output_max;
#  } rndnu_neon;
#
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x20  v3
# B    x5  v4  v5  v6
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# zero_point  v7
# unused v8 v9 v10 v11 v12 v13 v14 v15
# x11, x21 temp for Cortex-A53 loads
// Accumulator layout per row, as established by the narrowing sequence in
// the requantization code: first register = columns 0-3, second = columns
// 4-7, third = columns 8-11, fourth = columns 12-15 (int32 lanes).
// x11 has two roles: params pointer outside the k loops, and scratch
// register for 64-bit loads inside them.  It is therefore reloaded from the
// stack at the end of the epilogue.

BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

        # Clamp C pointers
        // Rows at index mr and above alias the previous row pointer, so the
        // stores below always target valid memory.  The stack argument loads
        // and the register save are interleaved with the pointer setup; none
        // of them alters the condition flags consumed by the CSELs.
        CMP     x0, 2                   // if mr < 2
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        LD1R    {v7.4s}, [x11]          // kernel_zero_point (4 bytes, replicated); x11 not advanced

        .p2align 3
0:
        // Start of one pass of the nc loop (16 output columns).
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32      // row 0, columns 0-3 and 4-7
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32      // row 0, columns 8-11 and 12-15
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        ADD     x11, x11, 4             // adjust params pointer: skip kernel_zero_point, x11 = &right_pre_shift
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        // Start of one ks step: fetch 4 row pointers from the indirection
        // buffer and advance a by 32 bytes.
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x20, [x4], 16

        // The compare is done on the pointer as loaded; the offset is added
        // unconditionally and then discarded when the pointer equals zero.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x20, x12                // if a3 == zero
        ADD     x20, x20, x8            // a3 += a_offset
        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 8 bytes for epilogue?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f                      // kc < 8: only the remainder code runs

        # Prologue
        // Load the first 8 bytes of every A row and the weights for k = 0,
        // widen A to 16 bit and subtract the zero point from the weights.
        LDR     d0, [x13], 8
        LDP     d4, d6, [x5]            // B k=0: columns 0-7 -> v4, columns 8-15 -> v6
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x20], 8
        UXTL    v0.8h, v0.8b
        LDR     x11, [x5, 16]           // B k=1 columns 0-7, held in x11 until the INS below (params pointer is lost)
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    3f

        # Main loop - 8 bytes of A
        // Each pass consumes 8 k values: 8 bytes from every A row and 128
        // bytes of B.  The weights rotate through v4, v5 and v6.  Weights for
        // columns 8-15 are loaded directly with LDR d; weights for columns
        // 0-7 are loaded into x11 and moved over with INS (see the header
        // note about temporaries for Cortex-A53 loads).  The loads for the
        // next pass are issued during the last two k values of this pass.
        .p2align 3
2:
        // k = 0, columns 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]            // B k=1 columns 8-15
        INS     v5.d[0], x11            // B k=1 columns 0-7
        // k = 0, columns 8-15 (v6)
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]           // B k=2 columns 0-7
        // k = 1, columns 0-7 (v5)
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]            // B k=2 columns 8-15
        INS     v6.d[0], x11            // B k=2 columns 0-7
        // k = 1, columns 8-15 (v4)
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]           // B k=3 columns 0-7
        // k = 2, columns 0-7 (v6)
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]            // B k=3 columns 8-15
        INS     v4.d[0], x11            // B k=3 columns 0-7
        // k = 2, columns 8-15 (v5)
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]           // B k=4 columns 0-7
        // k = 3, columns 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]            // B k=4 columns 8-15
        INS     v5.d[0], x11            // B k=4 columns 0-7
        // k = 3, columns 8-15 (v6)
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]           // B k=5 columns 0-7
        // k = 4, columns 0-7 (v5)
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]            // B k=5 columns 8-15
        INS     v6.d[0], x11            // B k=5 columns 0-7
        // k = 4, columns 8-15 (v4)
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]           // B k=6 columns 0-7
        // k = 5, columns 0-7 (v6)
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]           // B k=6 columns 8-15
        INS     v4.d[0], x11            // B k=6 columns 0-7
        // k = 5, columns 8-15 (v5)
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        LDR     x11, [x5, 112]          // B k=7 columns 0-7
        // k = 6, columns 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]           // B k=7 columns 8-15
        INS     v4.d[0], x11            // B k=7 columns 0-7
        // k = 6, columns 8-15 (v6)
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        USUBL   v4.8h, v4.8b, v7.8b
        ADD     x5, x5, 128             // w advances past the 8 k values of this pass

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x11, [x5]               // next pass: B k=0 columns 0-7
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        USUBL   v5.8h, v5.8b, v7.8b
        LDR     x21, [x13], 8           // next pass: 8 bytes of A0, moved into v0 once v0.h[7] is consumed

        // k = 7, columns 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]             // next pass: B k=0 columns 8-15
        INS     v4.d[0], x11            // next pass: B k=0 columns 0-7
        // k = 7, columns 8-15 (v5); each A register is replaced right after
        // its last use
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x11, [x15], 8           // next pass: 8 bytes of A2
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d1, [x14], 8            // next pass: 8 bytes of A1
        INS     v0.d[0], x21            // next pass: A0
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x20], 8            // next pass: 8 bytes of A3
        INS     v2.d[0], x11            // next pass: A2

        // Widen the freshly loaded A and B data, leaving the registers in the
        // state the prologue produces.
        UXTL    v0.8h, v0.8b
        UXTL    v1.8h, v1.8b
        LDR     x11, [x5, 16]           // B k=1 columns 0-7
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8               // k -= 8
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b
        B.HS    2b

        # Epilogue. Same as main loop but no preloads in final group
        // Consumes the last full group of 8 k values.  Nothing is loaded for
        // a following pass; instead the kc remainder is computed and the
        // params pointer is restored into x11.

        .p2align 3
3:
        // k = 0, columns 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]            // B k=1 columns 8-15
        INS     v5.d[0], x11            // B k=1 columns 0-7
        // k = 0, columns 8-15 (v6)
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]           // B k=2 columns 0-7
        // k = 1, columns 0-7 (v5)
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]            // B k=2 columns 8-15
        INS     v6.d[0], x11            // B k=2 columns 0-7
        // k = 1, columns 8-15 (v4)
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]           // B k=3 columns 0-7
        // k = 2, columns 0-7 (v6)
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]            // B k=3 columns 8-15
        INS     v4.d[0], x11            // B k=3 columns 0-7
        // k = 2, columns 8-15 (v5)
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]           // B k=4 columns 0-7
        // k = 3, columns 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]            // B k=4 columns 8-15
        INS     v5.d[0], x11            // B k=4 columns 0-7
        // k = 3, columns 8-15 (v6)
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]           // B k=5 columns 0-7
        // k = 4, columns 0-7 (v5)
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]            // B k=5 columns 8-15
        INS     v6.d[0], x11            // B k=5 columns 0-7
        // k = 4, columns 8-15 (v4)
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]           // B k=6 columns 0-7
        // k = 5, columns 0-7 (v6)
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]           // B k=6 columns 8-15
        INS     v4.d[0], x11            // B k=6 columns 0-7
        // k = 5, columns 8-15 (v5)
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        // k = 6, columns 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x11, [x5, 112]          // B k=7 columns 0-7
        // k = 6, columns 8-15 (v6)
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]           // B k=7 columns 8-15
        INS     v4.d[0], x11            // B k=7 columns 0-7
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        // k = 7, columns 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128             // w advances past the 8 k values of this group
        // k = 7, columns 8-15 (v5)
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        LDR     x11, [sp, 40]           // reload params pointer (entry [sp + 24] plus the 16 bytes pushed)
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        ADD     x11, x11, 4             // skip kernel_zero_point again, x11 = &right_pre_shift

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 5f

4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(uint8_t*)
        B.HI    1b

        # Apply params - preshift, scale, postshift, bias and clamp
        // x11 points at right_pre_shift here.  The five post-incremented
        // loads below advance it by 4 + 4 + 4 + 2 + 1 = 15 bytes; the final
        // load of output_max does not advance it.
        LD1R    {v4.4s}, [x11], 4       // v4 = right_pre_shift
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4       // v5 = multiplier
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4       // v6 = right_post_shift
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Saturating narrow int32 -> int16: columns 0-3 and 8-11 into the
        // lower halves, columns 4-7 and 12-15 into the upper halves.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias (v6 = output_zero_point)

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> uint8: v0-v3 end up holding the 16
        // output bytes of rows 0-3.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h
        SUB     x11, x11, 19            // rewind params pointer (15 bytes read here + 4 skipped at the nc loop top)

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; the flags feed both B.LO below and B.HI after the stores
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f                      // fewer than 16 columns left

        # Store full 4 x 16
        ST1     {v3.16b}, [x7], x10     // each C pointer advances by cn_stride
        ST1     {v2.16b}, [x17], x10
        ST1     {v1.16b}, [x16], x10
        ST1     {v0.16b}, [x6], x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

        # Remainder- 1 to 7 bytes of A
        // Entered either directly when kc < 8 or from the epilogue when kc is
        // not a multiple of 8.  Each LD1 reads 8 bytes from an A row but
        // advances the pointer by the remainder only; lanes at or beyond the
        // remainder are loaded but never multiplied.  One k value is
        // processed per group, consuming 16 bytes of B each.
        .p2align 3
5:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b}, [x13], x0
        LDP     d4, d5, [x5], 16        // B: columns 0-7 -> v4, columns 8-15 -> v5
        LD1     {v1.8b}, [x14], x0
        LD1     {v2.8b}, [x15], x0
        LD1     {v3.8b}, [x20], x0
        UXTL    v0.8h, v0.8b
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    4b                      // remainder == 1

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    4b                      // remainder == 2 (flags still from CMP x0, 2)

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    4b                      // remainder == 3

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    4b                      // remainder == 4 (flags still from CMP x0, 4)

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    4b                      // remainder == 5

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    4b                      // remainder == 6 (flags still from CMP x0, 6)

        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       4b                      // remainder == 7

        # Store odd width
        // Reached when fewer than 16 columns remain.  x1 holds nc - 16 at
        // this point, whose low 4 bits equal those of the remaining column
        // count.  Bits 3..0 select stores of 8, 4, 2 and 1 bytes per row;
        // after each partial store the not yet written bytes are moved down
        // to the bottom of the register.  The function returns afterwards
        // without looping.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif