// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t** restrict a, x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0
# A1  x14  v1
# A2  x15  v2
# A3  x20  v3
# B    x5  v4  v5  v6
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
# x11, x21 temp for Cortex-A53 loads

// Overview (derived from the code below):
//   - Sixteen 4 x int32 accumulators (v16-v31) hold a tile of 4 rows by 16
//     columns. They are seeded from 64 bytes read at the current w pointer,
//     updated with widening multiply-accumulates of sign-extended int8 A and
//     B values, then narrowed to int8, clamped and stored as one 16-byte row
//     through each of the four C pointers.
//   - Label map:
//       0  = nc loop: start the next 16 output columns
//       1  = ks loop: fetch the next group of 4 A pointers
//       2  = main loop: 8 bytes of K per pass, with the loads for the next
//            step interleaved between the multiply-accumulates
//       3  = epilogue: last full 8 bytes of K, without A preloads
//       4  = ks loop tail, then requantize and store
//       5  = K remainder of 1 to 7 bytes
//       6 to 10 = partial-width stores when fewer than 16 columns remain
//   - x11 is both the params pointer and a scratch register for 64-bit
//     weight loads, which is why it is reloaded from the stack at the end of
//     the epilogue and rewound after the params have been consumed.
//   - x0 holds mr only during the C pointer clamp; afterwards it is reused
//     as the K counter and as the K remainder.

BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

        # Clamp C pointers
        // Rows at index >= mr alias the previous row so their stores stay
        // inside valid memory. The flags from CMP x0, 2 serve both the LO and
        // the LS selection because ADD and LDP do not modify flags.
        CMP     x0, 2                   // if mr < 2
        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6,  x16, LO       //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack (sp moves down by 16)
        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
        CSEL    x7,  x17, x7, LO        //   c3 = c2


        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 64 bytes = 16 x int32, one value per output column; the same values
        // are copied into all four rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x20, [x4], 16

        // A pointer equal to the zero buffer is used as is; any other pointer
        // gets a_offset added. The ADD is done unconditionally and CSEL then
        // discards the sum when the pointer matched zero.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x20, x12                // if a3 == zero
        ADD     x20, x20, x8            // a3 += a_offset
        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 8 bytes for epilogue?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f                      // kc < 8: only the remainder path runs

        # Prologue
        // Preload 8 bytes of each A row and the first 24 bytes of B. The third
        // 8 bytes of B go into x11 (overwriting the params pointer) and are
        // moved into v5 by the first INS of the loop body.
        LDR     d0, [x13], 8
        LDP     d4, d6, [x5]
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x20], 8
        SXTL    v0.8h, v0.8b
        LDR     x11, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b

        SUBS    x0, x0, 8               // k = k - 8
        # Is there at least 8 bytes for main loop?
        B.LO    3f

        # Main loop - 8 bytes of A
        // Each pass consumes 8 K values (lanes h[0]..h[7] of v0-v3) against
        // 128 bytes of B (16 columns x 8 K values). B rotates through v4, v5
        // and v6: while one pair feeds the SMLAL/SMLAL2 group, the next 8
        // bytes arrive either directly (LDR d) or through x11 followed by
        // INS v.d[0], and are sign-extended with SXTL before first use.
        // NOTE(review): the split LDR x11 + INS form is presumably chosen for
        // Cortex-A53 issue behaviour (see the register usage note above) -
        // confirm against the template before changing it.
        .p2align 3
2:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        LDR     x11, [x5, 112]
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        SXTL    v4.8h, v4.8b
        ADD     x5, x5, 128             // w now points at the B data of the next pass

        // From here on the loads fetch data for the NEXT pass: B from the
        // advanced x5 and 8 more bytes of each A row. Writes to v0 and v2 go
        // through INS v.d[0], which replaces only lanes h[0]..h[3]; lane h[7]
        // of the current pass is still read afterwards. v1 and v3 are loaded
        // with LDR d only after their last h[7] use.
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        LDR     x11, [x5]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SXTL    v5.8h, v5.8b
        LDR     x21, [x13], 8

        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        LDR     d6, [x5, 8]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        LDR     x11, [x15], 8
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        LDR     d1, [x14], 8
        INS     v0.d[0], x21
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]
        LDR     d3, [x20], 8
        INS     v2.d[0], x11

        SXTL    v0.8h, v0.8b
        SXTL    v1.8h, v1.8b
        LDR     x11, [x5, 16]
        SXTL    v4.8h, v4.8b
        SXTL    v2.8h, v2.8b
        SUBS    x0, x0, 8               // k -= 8; flags consumed by B.HS below
        SXTL    v3.8h, v3.8b
        SXTL    v6.8h, v6.8b
        B.HS    2b

        # Epilogue.  Same as main loop but no preloads in final group

        .p2align 3
3:
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        LDR     d4, [x5, 24]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[0]
        SMLAL2  v28.4s, v6.8h, v0.h[0]
        SMLAL   v25.4s, v6.4h, v1.h[0]
        SMLAL2  v29.4s, v6.8h, v1.h[0]
        SXTL    v5.8h, v5.8b
        SMLAL   v26.4s, v6.4h, v2.h[0]
        SMLAL2  v30.4s, v6.8h, v2.h[0]
        SMLAL   v27.4s, v6.4h, v3.h[0]
        SMLAL2  v31.4s, v6.8h, v3.h[0]
        LDR     x11, [x5, 32]
        SMLAL   v16.4s, v5.4h, v0.h[1]
        SMLAL2  v20.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v5.4h, v1.h[1]
        SMLAL2  v21.4s, v5.8h, v1.h[1]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[1]
        SMLAL2  v22.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v5.4h, v3.h[1]
        SMLAL2  v23.4s, v5.8h, v3.h[1]
        LDR     d5, [x5, 40]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[1]
        SMLAL2  v28.4s, v4.8h, v0.h[1]
        SMLAL   v25.4s, v4.4h, v1.h[1]
        SMLAL2  v29.4s, v4.8h, v1.h[1]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[1]
        SMLAL2  v30.4s, v4.8h, v2.h[1]
        SMLAL   v27.4s, v4.4h, v3.h[1]
        SMLAL2  v31.4s, v4.8h, v3.h[1]
        LDR     x11, [x5, 48]
        SMLAL   v16.4s, v6.4h, v0.h[2]
        SMLAL2  v20.4s, v6.8h, v0.h[2]
        SMLAL   v17.4s, v6.4h, v1.h[2]
        SXTL    v5.8h, v5.8b
        SMLAL2  v21.4s, v6.8h, v1.h[2]
        SMLAL   v18.4s, v6.4h, v2.h[2]
        SMLAL2  v22.4s, v6.8h, v2.h[2]
        SMLAL   v19.4s, v6.4h, v3.h[2]
        SMLAL2  v23.4s, v6.8h, v3.h[2]
        LDR     d6, [x5, 56]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        LDR     x11, [x5, 64]
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SXTL    v6.8h, v6.8b
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        LDR     d4, [x5, 72]
        INS     v5.d[0], x11
        SMLAL   v24.4s, v6.4h, v0.h[3]
        SMLAL2  v28.4s, v6.8h, v0.h[3]
        SXTL    v5.8h, v5.8b
        SMLAL   v25.4s, v6.4h, v1.h[3]
        SMLAL2  v29.4s, v6.8h, v1.h[3]
        SMLAL   v26.4s, v6.4h, v2.h[3]
        SMLAL2  v30.4s, v6.8h, v2.h[3]
        SMLAL   v27.4s, v6.4h, v3.h[3]
        SMLAL2  v31.4s, v6.8h, v3.h[3]
        LDR     x11, [x5, 80]
        SMLAL   v16.4s, v5.4h, v0.h[4]
        SMLAL2  v20.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v5.4h, v1.h[4]
        SMLAL2  v21.4s, v5.8h, v1.h[4]
        SXTL    v4.8h, v4.8b
        SMLAL   v18.4s, v5.4h, v2.h[4]
        SMLAL2  v22.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v5.4h, v3.h[4]
        SMLAL2  v23.4s, v5.8h, v3.h[4]
        LDR     d5, [x5, 88]
        INS     v6.d[0], x11
        SMLAL   v24.4s, v4.4h, v0.h[4]
        SMLAL2  v28.4s, v4.8h, v0.h[4]
        SMLAL   v25.4s, v4.4h, v1.h[4]
        SMLAL2  v29.4s, v4.8h, v1.h[4]
        SXTL    v6.8h, v6.8b
        SMLAL   v26.4s, v4.4h, v2.h[4]
        SMLAL2  v30.4s, v4.8h, v2.h[4]
        SMLAL   v27.4s, v4.4h, v3.h[4]
        SMLAL2  v31.4s, v4.8h, v3.h[4]
        LDR     x11, [x5, 96]
        SMLAL   v16.4s, v6.4h, v0.h[5]
        SMLAL2  v20.4s, v6.8h, v0.h[5]
        SMLAL   v17.4s, v6.4h, v1.h[5]
        SMLAL2  v21.4s, v6.8h, v1.h[5]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v6.4h, v2.h[5]
        SMLAL2  v22.4s, v6.8h, v2.h[5]
        SMLAL   v19.4s, v6.4h, v3.h[5]
        SMLAL2  v23.4s, v6.8h, v3.h[5]
        LDR     d6, [x5, 104]
        INS     v4.d[0], x11
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        SXTL    v6.8h, v6.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        LDR     x11, [x5, 112]
        SMLAL   v24.4s, v6.4h, v0.h[6]
        SMLAL2  v28.4s, v6.8h, v0.h[6]
        SMLAL   v25.4s, v6.4h, v1.h[6]
        SMLAL2  v29.4s, v6.8h, v1.h[6]
        LDR     d5, [x5, 120]
        INS     v4.d[0], x11
        SXTL    v4.8h, v4.8b
        SMLAL   v26.4s, v6.4h, v2.h[6]
        SMLAL2  v30.4s, v6.8h, v2.h[6]
        SMLAL   v27.4s, v6.4h, v3.h[6]
        SMLAL2  v31.4s, v6.8h, v3.h[6]
        SMLAL   v16.4s, v4.4h, v0.h[7]
        SMLAL2  v20.4s, v4.8h, v0.h[7]
        SMLAL   v17.4s, v4.4h, v1.h[7]
        SMLAL2  v21.4s, v4.8h, v1.h[7]
        SXTL    v5.8h, v5.8b
        SMLAL   v18.4s, v4.4h, v2.h[7]
        SMLAL2  v22.4s, v4.8h, v2.h[7]
        SMLAL   v19.4s, v4.4h, v3.h[7]
        SMLAL2  v23.4s, v4.8h, v3.h[7]
        ADD     x5, x5, 128
        SMLAL   v24.4s, v5.4h, v0.h[7]
        SMLAL2  v28.4s, v5.8h, v0.h[7]
        SMLAL   v25.4s, v5.4h, v1.h[7]
        SMLAL2  v29.4s, v5.8h, v1.h[7]
        AND     x0, x2, 7               // kc remainder 0 to 7
        SMLAL   v26.4s, v5.4h, v2.h[7]
        SMLAL2  v30.4s, v5.8h, v2.h[7]
        // x11 was used for weight loads above. The params pointer was passed
        // at [sp + 24] on entry and sp has since moved down by 16, hence 40.
        LDR     x11, [sp, 40]           // reload params pointer
        SMLAL   v27.4s, v5.4h, v3.h[7]
        SMLAL2  v31.4s, v5.8h, v3.h[7]

        # Is there a remainder?- 1 to 7 bytes of A
        CBNZ    x0, 5f

4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Apply params - preshift, scale, postshift, bias and clamp
        // The params are read sequentially through x11 with post-increment:
        // three 32-bit values, one 16-bit value and two bytes (15 bytes of
        // pointer advance in total, undone by the SUB further down).
        LD1R    {v4.4s}, [x11], 4       // preshift, broadcast to all lanes
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4       // scale, broadcast to all lanes
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4       // postshift, broadcast to all lanes
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Saturating narrow int32 -> int16. Columns 0-3 and 4-7 of each row
        // are packed into v16-v19, columns 8-11 and 12-15 into v24-v27.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> int8; v0-v3 end up holding the 16 output
        // bytes of rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 15            // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        // The flags set here drive both B.LO 6f (fewer than 16 columns left)
        // and, after the stores, B.HI 0b (more columns remain); nothing in
        // between modifies the flags.
        SUBS    x1, x1, 16
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        // Each C pointer advances by cn_stride (x10) after its store.
        ST1     {v3.16b},  [x7], x10
        ST1     {v2.16b}, [x17], x10
        ST1     {v1.16b}, [x16], x10
        ST1     {v0.16b},  [x6], x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

        # Remainder- 1 to 7 bytes of A
        // Reached either directly when kc < 8 or from the epilogue when
        // kc % 8 != 0. Each LD1 reads a full 8 bytes from its A row but only
        // advances the pointer by the remainder in x0.
        // NOTE(review): the 8-byte read can extend past the last valid K byte
        // of a row; presumably the caller guarantees that is readable -
        // confirm against the kernel contract.
        // One group of 16 B bytes (one K step for all 16 columns) is consumed
        // per step below. Each CMP is followed by two exits: B.LO right after
        // it and B.EQ after the next group, which reuses the same flags
        // because the instructions in between do not modify them.
        .p2align 3
5:
        AND     x0, x2, 7               // kc remainder 1 to 7

        LD1     {v0.8b}, [x13], x0
        LDP     d4, d5, [x5], 16
        LD1     {v1.8b}, [x14], x0
        LD1     {v2.8b}, [x15], x0
        LD1     {v3.8b}, [x20], x0
        SXTL    v0.8h, v0.8b
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SXTL    v1.8h, v1.8b
        SXTL    v2.8h, v2.8b
        SXTL    v3.8h, v3.8b
        SMLAL   v16.4s, v4.4h, v0.h[0]
        SMLAL2  v20.4s, v4.8h, v0.h[0]
        SMLAL   v24.4s, v5.4h, v0.h[0]
        SMLAL2  v28.4s, v5.8h, v0.h[0]
        SMLAL   v17.4s, v4.4h, v1.h[0]
        SMLAL2  v21.4s, v4.8h, v1.h[0]
        SMLAL   v25.4s, v5.4h, v1.h[0]
        SMLAL2  v29.4s, v5.8h, v1.h[0]
        SMLAL   v18.4s, v4.4h, v2.h[0]
        SMLAL2  v22.4s, v4.8h, v2.h[0]
        SMLAL   v26.4s, v5.4h, v2.h[0]
        SMLAL2  v30.4s, v5.8h, v2.h[0]
        SMLAL   v19.4s, v4.4h, v3.h[0]
        SMLAL2  v23.4s, v4.8h, v3.h[0]
        SMLAL   v27.4s, v5.4h, v3.h[0]
        SMLAL2  v31.4s, v5.8h, v3.h[0]
        CMP     x0, 2
        B.LO    4b                      // remainder == 1

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[1]
        SMLAL2  v20.4s, v4.8h, v0.h[1]
        SMLAL   v24.4s, v5.4h, v0.h[1]
        SMLAL2  v28.4s, v5.8h, v0.h[1]
        SMLAL   v17.4s, v4.4h, v1.h[1]
        SMLAL2  v21.4s, v4.8h, v1.h[1]
        SMLAL   v25.4s, v5.4h, v1.h[1]
        SMLAL2  v29.4s, v5.8h, v1.h[1]
        SMLAL   v18.4s, v4.4h, v2.h[1]
        SMLAL2  v22.4s, v4.8h, v2.h[1]
        SMLAL   v26.4s, v5.4h, v2.h[1]
        SMLAL2  v30.4s, v5.8h, v2.h[1]
        SMLAL   v19.4s, v4.4h, v3.h[1]
        SMLAL2  v23.4s, v4.8h, v3.h[1]
        SMLAL   v27.4s, v5.4h, v3.h[1]
        SMLAL2  v31.4s, v5.8h, v3.h[1]
        B.EQ    4b                      // remainder == 2 (flags from CMP x0, 2)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[2]
        SMLAL2  v20.4s, v4.8h, v0.h[2]
        SMLAL   v24.4s, v5.4h, v0.h[2]
        SMLAL2  v28.4s, v5.8h, v0.h[2]
        SMLAL   v17.4s, v4.4h, v1.h[2]
        SMLAL2  v21.4s, v4.8h, v1.h[2]
        SMLAL   v25.4s, v5.4h, v1.h[2]
        SMLAL2  v29.4s, v5.8h, v1.h[2]
        SMLAL   v18.4s, v4.4h, v2.h[2]
        SMLAL2  v22.4s, v4.8h, v2.h[2]
        SMLAL   v26.4s, v5.4h, v2.h[2]
        SMLAL2  v30.4s, v5.8h, v2.h[2]
        SMLAL   v19.4s, v4.4h, v3.h[2]
        SMLAL2  v23.4s, v4.8h, v3.h[2]
        SMLAL   v27.4s, v5.4h, v3.h[2]
        SMLAL2  v31.4s, v5.8h, v3.h[2]
        CMP     x0, 4
        B.LO    4b                      // remainder == 3

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[3]
        SMLAL2  v20.4s, v4.8h, v0.h[3]
        SMLAL   v24.4s, v5.4h, v0.h[3]
        SMLAL2  v28.4s, v5.8h, v0.h[3]
        SMLAL   v17.4s, v4.4h, v1.h[3]
        SMLAL2  v21.4s, v4.8h, v1.h[3]
        SMLAL   v25.4s, v5.4h, v1.h[3]
        SMLAL2  v29.4s, v5.8h, v1.h[3]
        SMLAL   v18.4s, v4.4h, v2.h[3]
        SMLAL2  v22.4s, v4.8h, v2.h[3]
        SMLAL   v26.4s, v5.4h, v2.h[3]
        SMLAL2  v30.4s, v5.8h, v2.h[3]
        SMLAL   v19.4s, v4.4h, v3.h[3]
        SMLAL2  v23.4s, v4.8h, v3.h[3]
        SMLAL   v27.4s, v5.4h, v3.h[3]
        SMLAL2  v31.4s, v5.8h, v3.h[3]
        B.EQ    4b                      // remainder == 4 (flags from CMP x0, 4)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[4]
        SMLAL2  v20.4s, v4.8h, v0.h[4]
        SMLAL   v24.4s, v5.4h, v0.h[4]
        SMLAL2  v28.4s, v5.8h, v0.h[4]
        SMLAL   v17.4s, v4.4h, v1.h[4]
        SMLAL2  v21.4s, v4.8h, v1.h[4]
        SMLAL   v25.4s, v5.4h, v1.h[4]
        SMLAL2  v29.4s, v5.8h, v1.h[4]
        SMLAL   v18.4s, v4.4h, v2.h[4]
        SMLAL2  v22.4s, v4.8h, v2.h[4]
        SMLAL   v26.4s, v5.4h, v2.h[4]
        SMLAL2  v30.4s, v5.8h, v2.h[4]
        SMLAL   v19.4s, v4.4h, v3.h[4]
        SMLAL2  v23.4s, v4.8h, v3.h[4]
        SMLAL   v27.4s, v5.4h, v3.h[4]
        SMLAL2  v31.4s, v5.8h, v3.h[4]
        CMP     x0, 6
        B.LO    4b                      // remainder == 5

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[5]
        SMLAL2  v20.4s, v4.8h, v0.h[5]
        SMLAL   v24.4s, v5.4h, v0.h[5]
        SMLAL2  v28.4s, v5.8h, v0.h[5]
        SMLAL   v17.4s, v4.4h, v1.h[5]
        SMLAL2  v21.4s, v4.8h, v1.h[5]
        SMLAL   v25.4s, v5.4h, v1.h[5]
        SMLAL2  v29.4s, v5.8h, v1.h[5]
        SMLAL   v18.4s, v4.4h, v2.h[5]
        SMLAL2  v22.4s, v4.8h, v2.h[5]
        SMLAL   v26.4s, v5.4h, v2.h[5]
        SMLAL2  v30.4s, v5.8h, v2.h[5]
        SMLAL   v19.4s, v4.4h, v3.h[5]
        SMLAL2  v23.4s, v4.8h, v3.h[5]
        SMLAL   v27.4s, v5.4h, v3.h[5]
        SMLAL2  v31.4s, v5.8h, v3.h[5]
        B.EQ    4b                      // remainder == 6 (flags from CMP x0, 6)

        LDP     d4, d5, [x5], 16
        SXTL    v4.8h, v4.8b
        SXTL    v5.8h, v5.8b
        SMLAL   v16.4s, v4.4h, v0.h[6]
        SMLAL2  v20.4s, v4.8h, v0.h[6]
        SMLAL   v24.4s, v5.4h, v0.h[6]
        SMLAL2  v28.4s, v5.8h, v0.h[6]
        SMLAL   v17.4s, v4.4h, v1.h[6]
        SMLAL2  v21.4s, v4.8h, v1.h[6]
        SMLAL   v25.4s, v5.4h, v1.h[6]
        SMLAL2  v29.4s, v5.8h, v1.h[6]
        SMLAL   v18.4s, v4.4h, v2.h[6]
        SMLAL2  v22.4s, v4.8h, v2.h[6]
        SMLAL   v26.4s, v5.4h, v2.h[6]
        SMLAL2  v30.4s, v5.8h, v2.h[6]
        SMLAL   v19.4s, v4.4h, v3.h[6]
        SMLAL2  v23.4s, v4.8h, v3.h[6]
        SMLAL   v27.4s, v5.4h, v3.h[6]
        SMLAL2  v31.4s, v5.8h, v3.h[6]
        B       4b                      // remainder == 7

        # Store odd width
        // x1 holds the remaining column count minus 16, so its low four bits
        // equal those of the remaining count (1 to 15). Each set bit stores
        // 8, 4, 2 or 1 bytes per row; after every partial store the DUP moves
        // the next unwritten bytes down to the bottom of the register.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
10:
        # Restore x20-x21 from stack
        LDP     x20, x21, [sp], 16
        RET

END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif