// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t** restrict a, x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x20  v3
# B    x5  v4  v5  v6
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
# x11, x21 temp for Cortex-A53 loads

// Overview of the control flow below:
//   0: nc loop  - one pass per block of up to 16 output columns (x1 -= 16)
//   1: ks loop  - one pass per group of 4 A pointers (x9 -= 32 bytes)
//   2: main loop, 8 bytes of each A row per pass, with preloads for the next pass
//   3: epilogue, last full 8 bytes of each A row, no preloads
//   5: remainder, 1 to 7 trailing bytes of each A row
//   4: end of ks pass; after the last one requantize, clamp and store
//   6-10: partial store when fewer than 16 columns remain
// Each row accumulates into four int32x4 vectors: for row 0, v16/v20/v24/v28
// hold columns 0-3, 4-7, 8-11 and 12-15 (see the SQXTN/SQXTN2 pairing below).

BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53

        # Clamp C pointers
        // Rows at index >= mr alias the previous row pointer, so all four
        // rows are always computed and stored through valid pointers.
        CMP     x0, 2                   // if mr < 2
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        // x20/x21 are used below (A3 pointer, load temp), so they are saved
        // here. This push moves sp down by 16: the stack arguments are now
        // 16 bytes further from sp than the offsets listed in the prototype.
        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2


        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 64 bytes = 16 int32 bias values, one per output column; the same
        // bias is copied into the accumulators of all four rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x20, [x4], 16

        // a_offset is applied to every pointer except the shared zero
        // buffer. ADD does not set flags, so each CSEL still tests the CMP.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x20, x12                // if a3 == zero
        ADD     x20, x20, x8            // a3 += a_offset
        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 8 bytes for epilogue?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f

        # Prologue
        // Loads and widens the first 8 bytes of each A row and the first
        // 16 bytes of B (v4, v6); the next 8 bytes of B wait in x11.
        // x11 held the params pointer until here; it is reloaded from the
        // stack at the end of the epilogue.
        LDR     d0, [x13], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x20], 8
        SXTL    v0.8h, v0.8b
        LDR     x11, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    3f

        # Main loop - 8 bytes of A
        // Each pass consumes lanes h[0]..h[7] of v0-v3 (8 bytes of each A
        // row) against 128 bytes of B (16 int8 weights per k step).
        // B is fetched 8 bytes at a time, alternately with LDR d and with
        // LDR x11 followed by INS, rotating through v4/v5/v6; every loaded
        // register is widened with SXTL before it is used as a multiplicand.
        // The tail of the pass loads the A and B data for the next pass.
        // NOTE(review): the load/INS split presumably targets Cortex-A53
        // issue rules, as the register-usage note suggests - not shown here.
        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        PRFM    PLDL1KEEP, [x13, 128]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        PRFM    PLDL1KEEP, [x14, 128]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        PRFM    PLDL1KEEP, [x15, 128]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        PRFM    PLDL1KEEP, [x20, 128]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        PRFM    PLDL1KEEP, [x5, 448]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        PRFM    PLDL1KEEP, [x5, 512]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        LDR     x11, [x5, 112]
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        SXTL    v4.8h, v4.8b
        ADD     x5, x5, 128             // w now points at the B data of the next pass

        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x11, [x5]               // preload next B (first 8 bytes)
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SXTL    v5.8h, v5.8b
        LDR     x21, [x13], 8           // preload next A0 via x21

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x11, [x15], 8           // preload next A2 via x11
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        // Each A register is overwritten only after its last h[7] use above.
        LDR     d1, [x14], 8
        INS     v0.d[0], x21
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x20], 8
        INS     v2.d[0], x11

        SXTL    v0.8h, v0.8b
        SXTL    v1.8h, v1.8b
        LDR     x11, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b
        B.HS    2b

        # Epilogue. Same as main loop but no preloads in final group

        .p2align 3
3:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x11, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x11
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        // x11 was used as a load temp above. params sits at [sp + 24] on
        // entry, plus 16 for the x20-x21 push, hence offset 40.
        LDR     x11, [sp, 40]           // reload params pointer
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 5f

4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Apply params - preshift, scale, postshift, bias and clamp
        // The params are read in order with post-incrementing LD1R loads:
        // three 32-bit values (broadcast to v4, v5, v6), one 16-bit value
        // added after narrowing, then the 8-bit min and max.
        LD1R    {v4.4s}, [x11], 4
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Narrow int32 -> int16 with saturation: columns 0-7 of each row
        // end up in v16-v19, columns 8-15 in v24-v27.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Narrow int16 -> int8 with saturation: v0-v3 hold the 16 output
        // bytes of rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        // The loads above advanced x11 by 4 + 4 + 4 + 2 + 1 = 15 bytes
        // (the last load has no post-increment); undo that for the next pass.
        SUB     x11, x11, 15            // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags reused by B.LO and B.HI below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        // Each C pointer advances by cn_stride (x10) after its store.
        ST1     {v3.16b}, [x7], x10
        ST1     {v2.16b}, [x17], x10
        ST1     {v1.16b}, [x16], x10
        ST1     {v0.16b}, [x6], x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // Still testing the SUBS x1, x1, 16 above: nothing in between sets flags.
        B.HI    0b

        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

        # Remainder- 1 to 7 bytes of A
        // Reached either directly from the ks loop when kc < 8 (x11 still
        // holds params) or from the epilogue when kc has a nonzero remainder.
        // Each LD1 reads a full 8 bytes per row but advances the pointer by
        // the remainder only; just lanes h[0]..h[x0-1] are accumulated.
        // NOTE(review): this reads up to 7 bytes past the valid A data;
        // presumably callers guarantee that padding is readable - confirm.
        .p2align 3
5:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b}, [x13], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x14], x0
        LD1     {v2.8b}, [x15], x0
        LD1     {v3.8b}, [x20], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        // One CMP serves two exits: B.LO here (remainder 1) and the B.EQ
        // after the next group (remainder 2). The same pattern repeats
        // for the CMP against 4 and against 6.
        CMP     x0, 2
        B.LO    4b

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    4b                      // remainder == 2 (flags from CMP x0, 2)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    4b

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    4b                      // remainder == 4 (flags from CMP x0, 4)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    4b

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    4b                      // remainder == 6 (flags from CMP x0, 6)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       4b

        # Store odd width
        // x1 is (remaining nc) - 16 here, so its low 4 bits equal those of
        // the remaining column count. Each set bit stores 8, 4, 2 or 1 bytes
        // per row, and the DUPs move the next unstored bytes down to the
        // bottom of v0-v3 for the following step.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif