// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// NOTE(review): this file is generated; lasting changes to the comments below
// belong in the template named above.


#include <xnnpack/assembly.h>

# void xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const uint8_t** restrict a, x4
#     const uint8_t* restrict w,  x5
#     uint8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const uint8_t* zero,               [sp + 16] -> x12
#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
//
// NOTE(review): the params type above is spelled "qs8" (inherited from the
// template), but the layout this kernel actually reads is the unsigned
// rndnu_neon struct documented below - confirm the C prototype.

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point[4];
#    int32_t right_pre_shift;
#    int32_t multiplier;
#    int32_t right_post_shift;
#    int16_t output_zero_point;
#    uint8_t output_min;
#    uint8_t output_max;
#  } rndnu_neon;
#
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x20  v3
# B    x5  v4  v5  v6
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# zero_point  v7
# unused v8 v9 v10 v11 v12 v13 v14 v15
# x11, x21 temp for Cortex-A53 loads

// Overview (derived from the code below):
//   * Computes a 4-row by 16-column tile of 8-bit outputs per pass of the
//     nc loop (label 0). Each row keeps four int32x4 accumulators: outputs
//     0-3, 4-7, 8-11 and 12-15 (e.g. v16, v20, v24, v28 for row 0).
//   * The ks loop (label 1) fetches 4 row pointers from the indirection
//     buffer `a` per pass; a pointer equal to `zero` is used as is, any
//     other pointer has a_offset added.
//   * Weights are read as 16 bytes per k step. Each weight byte has the
//     kernel zero point (v7) subtracted while being widened to 16 bits
//     (USUBL); activation bytes are zero-extended (UXTL). Products are
//     accumulated with SMLAL/SMLAL2 by lane.
//   * v4/v5/v6 rotate as weight registers. Half of the 64-bit weight loads go
//     through a general-purpose register (LDR x11 followed by INS) and half
//     are direct LDR d loads; the register table calls x11/x21 "temp for
//     Cortex-A53 loads". The scheduling rationale is not stated in this file.
//   * x11 therefore doubles as the params pointer and as a load temporary;
//     the params pointer is reloaded from the stack after the epilogue.

BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53

        # Clamp C pointers
        // Rows beyond mr alias the previous row pointer, so their stores
        // land on the same memory as a valid row.
        CMP     x0, 2                   // if mr < 2
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack (sp -= 16)
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        LD1R    {v7.4s}, [x11]          // kernel_zero_point (params + 0), x11 not advanced

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 16 int32 bias values are read from w and copied to all 4 rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        ADD     x11, x11, 4             // adjust params pointer: skip kernel_zero_point
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x20, [x4], 16

        // ADD does not modify the flags, so each CSEL still sees the result
        // of the CMP against the unmodified pointer.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x20, x12                // if a3 == zero
        ADD     x20, x20, x8            // a3 += a_offset
        CSEL    x20, x12, x20, EQ       //   a3 = (a3 == zero) ? zero : a3 + a_offset

        # Is there at least 8 bytes for epilogue?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f                      // kc < 8: only the 1 to 7 byte remainder runs

        # Prologue
        // Loads the first 8 bytes of each A row and the k = 0 weights
        // (d4 = outputs 0-7, d6 = outputs 8-15); x11 holds the k = 1 weights
        // for outputs 0-7 and no longer holds the params pointer.
        LDR     d0, [x13], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x20], 8
        UXTL    v0.8h, v0.8b
        LDR     x11, [x5, 16]
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    3f

        # Main loop - 8 bytes of A
        // Each pass consumes 8 bytes from every A row and 128 bytes of
        // weights (8 k steps of 16 bytes), and preloads the operands of the
        // next pass before branching back.
        .p2align 3
2:
        // k = 0, outputs 0-7 (v4); prefetch A rows
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        PRFM    PLDL1KEEP, [x13, 128]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        PRFM    PLDL1KEEP, [x14, 128]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        PRFM    PLDL1KEEP, [x15, 128]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        PRFM    PLDL1KEEP, [x20, 128]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x11
        // k = 0, outputs 8-15 (v6); prefetch weights
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        PRFM    PLDL1KEEP, [x5, 448]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        PRFM    PLDL1KEEP, [x5, 512]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]
        // k = 1, outputs 0-7 (v5)
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x11
        // k = 1, outputs 8-15 (v4)
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]
        // k = 2, outputs 0-7 (v6)
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x11
        // k = 2, outputs 8-15 (v5)
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]
        // k = 3, outputs 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x11
        // k = 3, outputs 8-15 (v6)
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]
        // k = 4, outputs 0-7 (v5)
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x11
        // k = 4, outputs 8-15 (v4)
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]
        // k = 5, outputs 0-7 (v6)
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x11
        // k = 5, outputs 8-15 (v5)
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        LDR     x11, [x5, 112]
        // k = 6, outputs 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x11
        // k = 6, outputs 8-15 (v6); w advances to the next 128-byte group
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        USUBL   v4.8h, v4.8b, v7.8b
        ADD     x5, x5, 128

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x11, [x5]               // next pass: k = 0 weights, outputs 0-7
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        USUBL   v5.8h, v5.8b, v7.8b
        LDR     x21, [x13], 8           // next pass: 8 bytes of A0

        // k = 7, outputs 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]             // next pass: k = 0 weights, outputs 8-15
        INS     v4.d[0], x11
        // k = 7, outputs 8-15 (v5); each A register is reloaded only after
        // its last use in this pass
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x11, [x15], 8           // next pass: 8 bytes of A2
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d1, [x14], 8            // next pass: 8 bytes of A1
        INS     v0.d[0], x21
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x20], 8            // next pass: 8 bytes of A3
        INS     v2.d[0], x11

        // Widen the freshly loaded operands, same as the prologue.
        UXTL    v0.8h, v0.8b
        UXTL    v1.8h, v1.8b
        LDR     x11, [x5, 16]
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8               // k -= 8
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b
        B.HS    2b                      // loop while another 8 bytes remain after this group

        # Epilogue. Same as main loop but no preloads in final group

        .p2align 3
3:
        // k = 0, outputs 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x11
        // k = 0, outputs 8-15 (v6)
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]
        // k = 1, outputs 0-7 (v5)
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x11
        // k = 1, outputs 8-15 (v4)
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]
        // k = 2, outputs 0-7 (v6)
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x11
        // k = 2, outputs 8-15 (v5)
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]
        // k = 3, outputs 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x11
        // k = 3, outputs 8-15 (v6)
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]
        // k = 4, outputs 0-7 (v5)
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x11
        // k = 4, outputs 8-15 (v4)
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]
        // k = 5, outputs 0-7 (v6)
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x11
        // k = 5, outputs 8-15 (v5)
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        // k = 6, outputs 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x11, [x5, 112]
        // k = 6, outputs 8-15 (v6)
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x11
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        // k = 7, outputs 0-7 (v4)
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128             // w past the 128 bytes consumed by this group
        // k = 7, outputs 8-15 (v5)
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        // x11 was used as a load temporary above. params was passed at
        // [sp + 24] on entry and sp has since dropped by 16, hence [sp, 40].
        LDR     x11, [sp, 40]           // reload params pointer
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        ADD     x11, x11, 4             // skip kernel_zero_point again

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 5f

4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(uint8_t*)
        B.HI    1b

        # Apply params - preshift, scale, postshift, bias and clamp
        // x11 points at params + 4 here; the fields are consumed in struct
        // order with post-incrementing loads.
        // NOTE(review): SQSHL with a register operand shifts right when the
        // shift value is negative; presumably right_pre_shift is stored
        // negated - verify against the params initialisation code.
        LD1R    {v4.4s}, [x11], 4       // right_pre_shift
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4       // multiplier
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4       // right_post_shift
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Saturating narrow int32 -> int16: outputs 0-3 and 8-11 first ...
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias (output_zero_point)

        // ... then outputs 4-7 and 12-15 into the upper halves.
        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Unsigned-saturating narrow int16 -> uint8; v0-v3 hold rows 0-3.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h
        // 19 = 4 (skip at label 0 or after the epilogue) + 4 + 4 + 4 + 2 + 1
        // (post-increments above), so x11 is back at the start of params.
        SUB     x11, x11, 19            // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO and B.HI below
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f                      // fewer than 16 columns left

        # Store full 4 x 16
        // Each C row pointer advances by cn_stride.
        ST1     {v3.16b}, [x7], x10
        ST1     {v2.16b}, [x17], x10
        ST1     {v1.16b}, [x16], x10
        ST1     {v0.16b}, [x6], x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // ST1 and SUB leave the flags untouched, so this still tests the
        // SUBS x1, x1, 16 above: loop while columns remain.
        B.HI    0b

        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

        # Remainder- 1 to 7 bytes of A
        // Reached with kc < 8 or after the epilogue with (kc & 7) != 0.
        // x11 is not modified on this path.
        .p2align 3
5:
        AND     x0, x2, 7               // kc remainder 1 to 7

        // Each LD1 reads a full 8 bytes but advances the pointer by only x0
        // bytes; lanes at or beyond the remainder are never multiplied.
        // NOTE(review): this reads up to 7 bytes past the valid A data -
        // confirm callers guarantee that over-read is safe.
        LD1     {v0.8b}, [x13], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x14], x0
        LD1     {v2.8b}, [x15], x0
        LD1     {v3.8b}, [x20], x0
        UXTL    v0.8h, v0.8b
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        // k = 0
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    4b                      // remainder == 1

        // k = 1
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    4b                      // remainder == 2 (flags from CMP x0, 2)

        // k = 2
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    4b                      // remainder == 3

        // k = 3
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    4b                      // remainder == 4 (flags from CMP x0, 4)

        // k = 4
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    4b                      // remainder == 5

        // k = 5
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    4b                      // remainder == 6 (flags from CMP x0, 6)

        // k = 6
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       4b                      // remainder == 7

        # Store odd width
        // x1 holds nc - 16 here (the SUBS borrowed); its low 4 bits equal
        // those of the remaining column count. Bits 3..0 select stores of
        // 8, 4, 2 and 1 bytes per row; after each partial store the DUPs move
        // the next unwritten bytes down to lane 0.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif