// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

// Purpose: QU8 indirect GEMM micro-kernel producing a tile of up to 4 rows by
// 16 columns of uint8 output. For every group of 4 A pointers taken from the
// indirection buffer `a`, kc bytes of each row are multiplied against packed
// weights `w` (after subtracting the kernel zero point from the weights) and
// accumulated in int32. The accumulators are then requantized (pre-shift,
// multiply, post-shift, add output zero point), narrowed to uint8, clamped to
// [output_min, output_max] and stored.

# void xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const uint8_t** restrict a, x4
#     const uint8_t* restrict w,  x5
#     uint8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const uint8_t* zero,               [sp + 16] -> x12
#     const xnn_qs8_conv_minmax_params params [sp + 24] -> x11
//
// NOTE(review): the params type above is spelled "qs8" although this is a qu8
// kernel; presumably inherited from the qs8 template. Confirm against the C
// prototype.
// x0 holds mr only while the C pointers are clamped; afterwards it is reused
// as the k counter (derived from kc in x2).

# params structure is 20 bytes
#  struct {
#    uint8_t kernel_zero_point;
#    uint8_t padding[3];
#    int32_t right_pre_shift;
#    int32_t multiplier;
#    int32_t right_post_shift;
#    int16_t output_zero_point;
#    uint8_t output_min;
#    uint8_t output_max;
#  } rndnu_neon;
#
// NOTE(review): the kernel loads the first 4 bytes of params as one 32-bit
// word replicated across v7 and then subtracts v7 byte-by-byte from the
// weights (USUBL ..., v7.8b). All four bytes are therefore consumed as zero
// point values, so the "padding[3]" bytes presumably must replicate
// kernel_zero_point. Confirm against the params definition.
//
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x20  v3
# B    x5  v4  v5  v6
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# zero_point  v7
# unused v8 v9 v10 v11 v12 v13 v14 v15
//
// Accumulator layout (from the narrowing sequence after the ks loop): for
// row r, v(16+r) holds columns 0-3, v(20+r) columns 4-7, v(24+r) columns 8-11
// and v(28+r) columns 12-15.

BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75

        # Clamp C pointers
        // Rows at or beyond mr alias the previous row pointer, so all four
        // row pointers are always valid store targets.
        CMP     x0, 2                   // if mr < 2
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        // Stack arguments were read above, before sp is moved by this push.
        STR     x20, [sp, -16]!         // Save x20 on stack
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2
        // Loaded once for the whole call; x11 is left pointing at params + 4.
        LD1R    {v7.4s}, [x11], 4       // kernel_zero_point

        // Outer loop over tiles of 16 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 64 bytes (16 x int32) are read from w; row 0 receives them directly
        // and rows 1-3 are initialized as copies.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        // Loop over ks: each pass consumes 4 pointers from the indirection
        // buffer.
        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x20, [x4], 16

        // A pointer equal to `zero` is used as is; every other pointer has
        // a_offset added. The ADD is done unconditionally and CSEL picks the
        // unmodified zero pointer (x12) when the compare matched.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x20, x12                // if a3 == zero
        ADD     x20, x20, x8            // a3 += a_offset
        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 8 bytes for epilogue?
        // kc < 8 goes straight to the 1-7 byte remainder code.
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f

        # Prologue
        // Loads 8 bytes of each A row and the first 16 bytes of B (weights
        // for k index 0: d4 = columns 0-7, d6 = columns 8-15). A is zero
        // extended to 16 bit; B is widened as (b - kernel_zero_point).
        // x5 is not advanced here; the main loop and epilogue address B
        // relative to x5 and add 128 once per 8 k steps.
        LDR     d0, [x13], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x20], 8
        UXTL    v0.8h, v0.8b
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    3f

        # Main loop - 8 bytes of A
        // Each iteration consumes 8 k steps: 8 bytes per A row and 128 bytes
        // of B (16 bytes per k step). v4/v5/v6 rotate as B registers; the
        // load and widening of the next B vectors are interleaved between the
        // multiply-accumulates of the current ones (presumably scheduled for
        // Cortex-A75, per the kernel name). The last group also preloads A
        // and B for the next iteration.
        .p2align 3
2:
        // k index 0: columns 0-7 from v4, columns 8-15 from v6
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d5, [x5, 16]
        SMLAL   v24.4s, v6.4h, v0.h[0]
        LDR     d4, [x5, 24]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        // k index 1: columns 0-7 from v5, columns 8-15 from v4
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d6, [x5, 32]
        SMLAL   v24.4s, v4.4h, v0.h[1]
        LDR     d5, [x5, 40]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        // k index 2: columns 0-7 from v6, columns 8-15 from v5
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d4, [x5, 48]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        LDR     d6, [x5, 56]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        // k index 3: columns 0-7 from v4, columns 8-15 from v6
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d5, [x5, 64]
        SMLAL   v24.4s, v6.4h, v0.h[3]
        LDR     d4, [x5, 72]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        // k index 4: columns 0-7 from v5, columns 8-15 from v4
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d6, [x5, 80]
        SMLAL   v24.4s, v4.4h, v0.h[4]
        LDR     d5, [x5, 88]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        // k index 5: columns 0-7 from v6, columns 8-15 from v5
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d4, [x5, 96]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        LDR     d6, [x5, 104]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        // k index 6: columns 0-7 from v4, columns 8-15 from v6
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d4, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        LDR     d5, [x5, 120]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        USUBL   v4.8h, v4.8b, v7.8b
        // All 128 bytes of B for this iteration have been loaded.
        ADD     x5, x5, 128

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        USUBL   v5.8h, v5.8b, v7.8b

        // k index 7: columns 0-7 from v4, columns 8-15 from v5.
        // Interleaved with the preload of B (k index 0) and A for the next
        // iteration or the epilogue.
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d4, [x5]
        SMLAL   v24.4s, v5.4h, v0.h[7]
        LDR     d6, [x5, 8]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d0, [x13], 8
        SMLAL   v26.4s, v5.4h, v2.h[7]
        LDR     d1, [x14], 8
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d2, [x15], 8

        UXTL    v0.8h, v0.8b
        LDR     d3, [x20], 8
        UXTL    v1.8h, v1.8b
        USUBL   v4.8h, v4.8b, v7.8b
        UXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8               // k -= 8; loop while no borrow
        UXTL    v3.8h, v3.8b
        USUBL   v6.8h, v6.8b, v7.8b
        B.HS    2b

        # Epilogue. Same as main loop but no preloads in final group
        // Processes the 8 k steps whose A bytes and first B vectors were
        // already loaded by the prologue or by the last main loop iteration.

        .p2align 3
3:
        // k index 0: columns 0-7 from v4, columns 8-15 from v6
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d5, [x5, 16]
        SMLAL   v24.4s, v6.4h, v0.h[0]
        LDR     d4, [x5, 24]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        // k index 1: columns 0-7 from v5, columns 8-15 from v4
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d6, [x5, 32]
        SMLAL   v24.4s, v4.4h, v0.h[1]
        LDR     d5, [x5, 40]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        // k index 2: columns 0-7 from v6, columns 8-15 from v5
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d4, [x5, 48]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        LDR     d6, [x5, 56]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        // k index 3: columns 0-7 from v4, columns 8-15 from v6
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d5, [x5, 64]
        SMLAL   v24.4s, v6.4h, v0.h[3]
        LDR     d4, [x5, 72]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        // k index 4: columns 0-7 from v5, columns 8-15 from v4
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d6, [x5, 80]
        SMLAL   v24.4s, v4.4h, v0.h[4]
        LDR     d5, [x5, 88]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        USUBL   v6.8h, v6.8b, v7.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        // k index 5: columns 0-7 from v6, columns 8-15 from v5
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d4, [x5, 96]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        LDR     d6, [x5, 104]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        USUBL   v4.8h, v4.8b, v7.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        USUBL   v6.8h, v6.8b, v7.8b
        // k index 6: columns 0-7 from v4, columns 8-15 from v6
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d4, [x5, 112]
        USUBL   v4.8h, v4.8b, v7.8b
        LDR     d5, [x5, 120]
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        // k index 7: columns 0-7 from v4, columns 8-15 from v5
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 5f

4:
        # ks loop
        // Four 8-byte pointers were consumed in this pass.
        SUBS    x9, x9, 32              // ks -= MR * sizeof(uint8_t*)
        B.HI    1b

        # Apply params - preshift, scale, postshift, bias and clamp
        // x11 points at params + 4 here (right_pre_shift per the struct
        // comment at the top of the file). The parameter loads are
        // interleaved with the arithmetic.
        // NOTE(review): the original comment says "shift to upper bits" while
        // the field is named right_pre_shift; SQSHL with a register count
        // shifts right for negative counts, so the stored value is presumably
        // negative. Confirm against the params initialization.
        LD1R    {v4.4s}, [x11], 4
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4       // multiplier
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4       // right_post_shift
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Saturating narrow int32 -> int16. After SQXTN/SQXTN2, v16-v19 hold
        // columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        // 16-bit value following right_post_shift: output_zero_point per the
        // struct comment. v6 (post shift) is no longer needed at this point.
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> uint8; v0-v3 become the 16 output bytes
        // of rows 0-3.
        SQXTUN  v0.8b, v16.8h
        SQXTUN  v1.8b, v17.8h
        SQXTUN  v2.8b, v18.8h
        SQXTUN  v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTUN2 v0.16b, v24.8h
        SQXTUN2 v1.16b, v25.8h
        SQXTUN2 v2.16b, v26.8h
        SQXTUN2 v3.16b, v27.8h
        // The post-indexed loads above advanced x11 by 4 + 4 + 4 + 2 + 1 = 15
        // bytes; this puts it back at params + 4 for the next column tile.
        SUB     x11, x11, 15            // rewind params pointer

        UMAX    v0.16b, v0.16b, v4.16b
        UMAX    v1.16b, v1.16b, v4.16b
        UMAX    v2.16b, v2.16b, v4.16b
        UMAX    v3.16b, v3.16b, v4.16b
        // Flags set here are consumed by B.LO below and by B.HI after the
        // full-width stores; no instruction in between writes the flags.
        SUBS    x1, x1, 16              // nc -= 16
        UMIN    v0.16b, v0.16b, v5.16b
        UMIN    v1.16b, v1.16b, v5.16b
        UMIN    v2.16b, v2.16b, v5.16b
        UMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f                      // fewer than 16 columns remained

        # Store full 4 x 16
        // Each row pointer is post-incremented by cn_stride (x10).
        ST1     {v3.16b}, [x7], x10
        ST1     {v2.16b}, [x17], x10
        ST1     {v1.16b}, [x16], x10
        ST1     {v0.16b}, [x6], x10

        // Rewind the indirection pointer by ks so the next column tile walks
        // the same A pointers again.
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20 from stack
        LDR     x20, [sp], 16
        RET

        # Remainder- 1 to 7 bytes of A
        // Reached either directly when kc < 8 or after the epilogue when
        // kc is not a multiple of 8. Each LD1 reads a full 8 bytes of A but
        // only advances the pointer by the remainder, and only the first x0
        // lanes are used below. B is consumed 16 bytes per k step.
        .p2align 3
5:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b}, [x13], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x14], x0
        LD1     {v2.8b}, [x15], x0
        LD1     {v3.8b}, [x20], x0
        UXTL    v0.8h, v0.8b
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        UXTL    v1.8h, v1.8b
        UXTL    v2.8h, v2.8b
        UXTL    v3.8h, v3.8b
        // k index 0
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        // Each CMP below serves two exits: B.LO right after it (remainder is
        // odd and done) and B.EQ after the following k step (remainder equals
        // the compared value). The flags stay valid across the k step since
        // none of its instructions write them.
        CMP     x0, 2
        B.LO    4b                      // remainder == 1

        // k index 1
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    4b                      // remainder == 2

        // k index 2
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    4b                      // remainder == 3

        // k index 3
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    4b                      // remainder == 4

        // k index 4
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    4b                      // remainder == 5

        // k index 5
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    4b                      // remainder == 6

        // k index 6 (remainder == 7)
        LDP     d4, d5, [x5], 16
        USUBL   v4.8h, v4.8b, v7.8b
        USUBL   v5.8h, v5.8b, v7.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       4b

        # Store odd width
        // x1 holds (remaining nc - 16) here; subtracting 16 leaves the low
        // four bits unchanged, so bits 3..0 select stores of 8, 4, 2 and 1
        // bytes. After each partial store the not yet stored bytes are moved
        // down to lane 0 with DUP. This is the final tile: the function
        // returns afterwards.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        # Restore x20 from stack
        LDR     x20, [sp], 16
        RET

END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif