// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# Overview
#   Each pass of the nc loop (label 0) produces one tile of 4 rows x 8 floats:
#   the accumulators start from 8 bias floats read from w, every ks step
#   (label 1) fetches 4 row pointers from a and accumulates A * B over kc
#   bytes, and the result is clamped to [min, max] and stored to c0..c3.
#   kc and ks are consumed as byte counts (32 bytes = 8 floats of A per main
#   loop pass, 32 bytes = 4 pointers per ks step).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Stack frame (80 bytes, allocated by the pre-indexed STR of x20)
#   [sp +  0]  x20
#   [sp +  8]  unused padding
#   [sp + 16]  d8,  d9
#   [sp + 32]  d10, d11
#   [sp + 48]  d12, d13
#   [sp + 64]  d14, d15
# The stack arguments are read before the frame is allocated, so their
# offsets in the signature above are relative to the incoming sp.

# Scalar register reuse
#   x0 holds mr only until the C pointers are clamped; afterwards it is the
#   remaining-k byte counter.
#   x7 holds cm_stride only until c3 is computed; afterwards it is c3.

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# B   v20 v21 v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v4 v5
#   v4/v5 double as A registers inside the main loop and epilogue, so the
#   min/max values are reloaded from params at the end of the epilogue.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        # v4 = first float at params (lower bound, used by FMAX),
        # v5 = second float at params (upper bound, used by FMIN).
        LD2R {v4.4s, v5.4s}, [x8]

        # Save x20 on stack
        STR x20, [sp, -80]!

        # Save d8-d15 on stack
        STP d8, d9, [sp, 16]
        STP d10, d11, [sp, 32]
        STP d12, d13, [sp, 48]
        STP d14, d15, [sp, 64]

        # Clamp C pointers
        # Rows at index >= mr alias the previous row pointer.
        CMP x0, 2                       // if mr < 2
        ADD x16, x6, x7                 // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO           //   c1 = c0

        ADD x17, x16, x7                // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL x17, x16, x17, LS          //   c2 = c1

        CMP x0, 4                       // if mr < 4
        ADD x7, x17, x7                 // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO            //   c3 = c2

0:
        # Load initial bias from w into accumulators
        # All four rows start from the same 8 bias floats.
        LDP q24, q25, [x5], 32
        MOV v26.16b, v24.16b
        MOV v27.16b, v25.16b
        MOV v28.16b, v24.16b
        MOV v29.16b, v25.16b
        MOV v30.16b, v24.16b
        MOV v31.16b, v25.16b

        MOV x9, x3                      // p = ks

1:
        # Load next 4 A pointers
        LDP x20, x13, [x4], 16
        LDP x14, x15, [x4], 16

        # A pointer equal to zero is used as is; any other pointer gets
        # a_offset added.
        CMP x20, x12                    // if a0 == zero
        ADD x20, x20, x11               // a0 += a_offset
        CSEL x20, x12, x20, EQ          //   a0 = zero, else a0 + a_offset
        CMP x13, x12                    // if a1 == zero
        ADD x13, x13, x11               // a1 += a_offset
        CSEL x13, x12, x13, EQ          //   a1 = zero, else a1 + a_offset
        CMP x14, x12                    // if a2 == zero
        ADD x14, x14, x11               // a2 += a_offset
        CSEL x14, x12, x14, EQ          //   a2 = zero, else a2 + a_offset
        CMP x15, x12                    // if a3 == zero
        ADD x15, x15, x11               // a3 += a_offset
        CSEL x15, x12, x15, EQ          //   a3 = zero, else a3 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32                 // k = kc - 32
        B.LO 4f

        # Prologue - 4 floats of A (16 bytes per row)
        # Read first block of 4 A and B.
        LDR q0, [x20], 16
        LDP q16, q17, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        LDP q18, q19, [x5], 32
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32

        # Are there at least 8 more floats (32 bytes)? If so, run the main loop.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v24.4s, v16.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v25.4s, v17.4s, v0.s[0]
        FMLA v26.4s, v16.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        LDR q4, [x20], 16               // overwrites min; reloaded in epilogue
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        LDR q5, [x13], 16               // overwrites max; reloaded in epilogue
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v19.4s, v3.s[1]
        FMLA v24.4s, v20.4s, v0.s[2]
        PRFM PLDL1KEEP, [x5, 128]       // prefetch upcoming weights
        FMLA v25.4s, v21.4s, v0.s[2]
        FMLA v26.4s, v20.4s, v1.s[2]
        PRFM PLDL1KEEP, [x5, 192]
        FMLA v27.4s, v21.4s, v1.s[2]
        FMLA v28.4s, v20.4s, v2.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v21.4s, v2.s[2]
        FMLA v30.4s, v20.4s, v3.s[2]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v21.4s, v3.s[2]
        FMLA v24.4s, v22.4s, v0.s[3]
        FMLA v25.4s, v23.4s, v0.s[3]
        FMLA v26.4s, v22.4s, v1.s[3]
        FMLA v27.4s, v23.4s, v1.s[3]
        FMLA v28.4s, v22.4s, v2.s[3]
        FMLA v29.4s, v23.4s, v2.s[3]
        FMLA v30.4s, v22.4s, v3.s[3]
        FMLA v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v24.4s, v8.4s, v4.s[0]
        LDP q16, q17, [x5], 32
        FMLA v25.4s, v9.4s, v4.s[0]
        FMLA v26.4s, v8.4s, v5.s[0]
        LDP q18, q19, [x5], 32
        FMLA v27.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q20, q21, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q22, q23, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v24.4s, v10.4s, v4.s[1]
        LDR q0, [x20], 16
        FMLA v25.4s, v11.4s, v4.s[1]
        FMLA v26.4s, v10.4s, v5.s[1]
        LDR q1, [x13], 16
        FMLA v27.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x14], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x15], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v24.4s, v12.4s, v4.s[2]
        FMLA v25.4s, v13.4s, v4.s[2]
        FMLA v26.4s, v12.4s, v5.s[2]
        FMLA v27.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v24.4s, v14.4s, v4.s[3]
        FMLA v25.4s, v15.4s, v4.s[3]
        FMLA v26.4s, v14.4s, v5.s[3]
        FMLA v27.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32                 // k -= 32; flags consumed by B.HS below
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        B.HS 2b

3:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v24.4s, v16.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v25.4s, v17.4s, v0.s[0]
        FMLA v26.4s, v16.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        LDR q4, [x20], 16
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        LDR q5, [x13], 16
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v19.4s, v3.s[1]
        FMLA v24.4s, v20.4s, v0.s[2]
        FMLA v25.4s, v21.4s, v0.s[2]
        FMLA v26.4s, v20.4s, v1.s[2]
        FMLA v27.4s, v21.4s, v1.s[2]
        FMLA v28.4s, v20.4s, v2.s[2]
        FMLA v29.4s, v21.4s, v2.s[2]
        FMLA v30.4s, v20.4s, v3.s[2]
        FMLA v31.4s, v21.4s, v3.s[2]
        FMLA v24.4s, v22.4s, v0.s[3]
        FMLA v25.4s, v23.4s, v0.s[3]
        FMLA v26.4s, v22.4s, v1.s[3]
        FMLA v27.4s, v23.4s, v1.s[3]
        FMLA v28.4s, v22.4s, v2.s[3]
        FMLA v29.4s, v23.4s, v2.s[3]
        FMLA v30.4s, v22.4s, v3.s[3]
        FMLA v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, noloads
        FMLA v24.4s, v8.4s, v4.s[0]
        FMLA v25.4s, v9.4s, v4.s[0]
        FMLA v26.4s, v8.4s, v5.s[0]
        FMLA v27.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v24.4s, v10.4s, v4.s[1]
        FMLA v25.4s, v11.4s, v4.s[1]
        FMLA v26.4s, v10.4s, v5.s[1]
        FMLA v27.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v24.4s, v12.4s, v4.s[2]
        FMLA v25.4s, v13.4s, v4.s[2]
        FMLA v26.4s, v12.4s, v5.s[2]
        FMLA v27.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v24.4s, v14.4s, v4.s[3]
        FMLA v25.4s, v15.4s, v4.s[3]
        FMLA v26.4s, v14.4s, v5.s[3]
        FMLA v27.4s, v15.4s, v5.s[3]

        # Load min/max values
        # v4/v5 held A data above and are not read as A again on this path,
        # so the clamp bounds are restored here.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

4:
        # Remainder- 4 floats of A
        # x0 is kc minus a multiple of 32, so its low 5 bits equal those of
        # kc; bit 4 set means 16 more bytes (4 floats) per row.
        TBZ x0, 4, 5f

        LDR q0, [x20], 16
        LDP q16, q17, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        FMLA v24.4s, v16.4s, v0.s[0]
        FMLA v25.4s, v17.4s, v0.s[0]
        LDP q18, q19, [x5], 32
        FMLA v26.4s, v16.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v29.4s, v17.4s, v2.s[0]
        LDP q22, q23, [x5], 32
        FMLA v30.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        FMLA v31.4s, v19.4s, v3.s[1]
        FMLA v24.4s, v20.4s, v0.s[2]
        FMLA v25.4s, v21.4s, v0.s[2]
        FMLA v26.4s, v20.4s, v1.s[2]
        FMLA v27.4s, v21.4s, v1.s[2]
        FMLA v28.4s, v20.4s, v2.s[2]
        FMLA v29.4s, v21.4s, v2.s[2]
        FMLA v30.4s, v20.4s, v3.s[2]
        FMLA v31.4s, v21.4s, v3.s[2]
        FMLA v24.4s, v22.4s, v0.s[3]
        FMLA v25.4s, v23.4s, v0.s[3]
        FMLA v26.4s, v22.4s, v1.s[3]
        FMLA v27.4s, v23.4s, v1.s[3]
        FMLA v28.4s, v22.4s, v2.s[3]
        FMLA v29.4s, v23.4s, v2.s[3]
        FMLA v30.4s, v22.4s, v3.s[3]
        FMLA v31.4s, v23.4s, v3.s[3]

5:
        # Remainder- 2 floats of A
        TBZ x0, 3, 6f

        LDR d0, [x20], 8
        LDP q16, q17, [x5], 32
        LDR d1, [x13], 8
        LDR d2, [x14], 8
        LDR d3, [x15], 8
        FMLA v24.4s, v16.4s, v0.s[0]
        FMLA v25.4s, v17.4s, v0.s[0]
        LDP q18, q19, [x5], 32
        FMLA v26.4s, v16.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        FMLA v31.4s, v19.4s, v3.s[1]

6:
        # Remainder- 1 float of A
        TBZ x0, 2, 7f

        LDR s0, [x20], 4
        LDP q16, q17, [x5], 32
        LDR s1, [x13], 4
        LDR s2, [x14], 4
        LDR s3, [x15], 4
        FMLA v24.4s, v16.4s, v0.s[0]
        FMLA v25.4s, v17.4s, v0.s[0]
        FMLA v26.4s, v16.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v17.4s, v3.s[0]

7:
        # ks loop
        SUBS x9, x9, 32                 // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v24.4s, v24.4s, v4.4s
        FMAX v25.4s, v25.4s, v4.4s
        FMAX v26.4s, v26.4s, v4.4s
        FMAX v27.4s, v27.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v24.4s, v24.4s, v5.4s
        FMIN v25.4s, v25.4s, v5.4s
        FMIN v26.4s, v26.4s, v5.4s
        FMIN v27.4s, v27.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        SUBS x1, x1, 8
        B.LO 8f

        # Rows are written from c3 down to c0, then each C pointer advances
        # by cn_stride for the next tile.
        STP q30, q31, [x7]
        ADD x7, x7, x10
        STP q28, q29, [x17]
        ADD x17, x17, x10
        STP q26, q27, [x16]
        ADD x16, x16, x10
        STP q24, q25, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3                  // a -= ks

        # nc loop
        # Flags are still those of the SUBS on x1 above; none of the
        # STP/ADD/SUB in between writes the flags.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

        # Store odd width
        # Bits 2, 1 and 0 of x1 select a 4, 2 and 1 float partial store; after
        # each partial store the not yet written lanes are moved down.
8:
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x17], 16
        MOV v28.16b, v29.16b
        STR q26, [x16], 16
        MOV v26.16b, v27.16b
        STR q24, [x6], 16
        MOV v24.16b, v25.16b

9:
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        STR d28, [x17], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x16], 8
        STR d24, [x6], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]

10:
        TBZ x1, 0, 11f
        STR s30, [x7]
        STR s28, [x17]
        STR s26, [x16]
        STR s24, [x6]
11:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif