// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# B   v20 v21 v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v4 v5

// Overview (derived from the code below):
//   * A 4-row x 8-column f32 tile is accumulated in v24-v31. For every column
//     block the accumulators are seeded from the first 32 bytes read through
//     w (x5); all four rows start from the same 8 values.
//   * kc is consumed in bytes. The software-pipelined main loop handles
//     32 bytes (8 floats) of each A row per iteration; remainders of 16, 8 and
//     4 bytes are selected by bits 4, 3 and 2 of the byte counter in x0.
//   * Results are clamped with FMAX against v4 and FMIN against v5, the two
//     floats replicated from the params pointer (x8).
//   * While at least 8 columns remain a full tile is stored, the C pointers
//     advance by cn_stride and the A pointers rewind by kc. A final partial
//     tile stores 4, 2 and 1 columns according to bits 2, 1 and 0 of nc.
//   * Rows beyond mr reuse the pointers of the previous row, so they are
//     computed and stored redundantly instead of being skipped.

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75

        # Load cn_stride, params pointer
        # (read before sp is moved by the register save below)
        LDP     x14, x8, [sp]

        # Load min/max values
        LD2R    {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack
        STP     d8, d9, [sp, -64]!
        STP     d10, d11, [sp, 16]
        STP     d12, d13, [sp, 32]
        STP     d14, d15, [sp, 48]

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x11, x3, x4             // a1 = a0 + a_stride
        ADD     x9, x6, x7              // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO        //   a1 = a0
        CSEL    x9, x6, x9, LO          //   c1 = c0

        ADD     x12, x11, x4            // a2 = a1 + a_stride
        ADD     x10, x9, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
                                        // (flags of CMP x0, 2 are still live,
                                        //  ADD and CSEL do not modify them)
        CSEL    x12, x11, x12, LS       //   a2 = a1
        CSEL    x10, x9, x10, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x12, x4             // a3 = a2 + a_stride
        ADD     x7, x10, x7             // c3 = c2 + cm_stride
        CSEL    x4, x12, x4, LO         //   a3 = a2
        CSEL    x7, x10, x7, LO         //   c3 = c2

        // From here on x0 no longer holds mr; it is reused as the k counter.
0:
        # Load initial bias from w into accumulators
        LDP     q24, q25, [x5], 32
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v25.16b
        MOV     v28.16b, v24.16b
        MOV     v29.16b, v25.16b
        MOV     v30.16b, v24.16b
        MOV     v31.16b, v25.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    3f

        # Prologue - first 4 floats (16 bytes) of every A row
        # Read first block of 4 A and B.
        LDR     q0, [x3], 16
        LDP     q16, q17, [x5], 32
        LDR     q1, [x11], 16
        LDR     q2, [x12], 16
        LDR     q3, [x4], 16
        LDP     q18, q19, [x5], 32
        LDP     q20, q21, [x5], 32
        LDP     q22, q23, [x5], 32

        # Is there at least another 32 bytes? If so, run the main loop
        SUBS    x0, x0, 32
        B.LO    2f

        # Main loop - 8 floats of A (32 bytes)
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v24.4s, v16.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        LDR     q4, [x3], 16            // overwrites the clamp value in v4
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        LDR     q5, [x11], 16           // overwrites the clamp value in v5
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        LDR     q6, [x12], 16
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        LDR     q7, [x4], 16
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        PRFM    PLDL1KEEP, [x5, 128]    // prefetch upcoming weights
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        PRFM    PLDL1KEEP, [x5, 192]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        PRFM    PLDL1KEEP, [x5, 320]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA    v24.4s, v8.4s, v4.s[0]
        LDP     q16, q17, [x5], 32
        FMLA    v25.4s, v9.4s, v4.s[0]
        FMLA    v26.4s, v8.4s, v5.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v27.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v31.4s, v9.4s, v7.s[0]
        FMLA    v24.4s, v10.4s, v4.s[1]
        LDR     q0, [x3], 16
        FMLA    v25.4s, v11.4s, v4.s[1]
        FMLA    v26.4s, v10.4s, v5.s[1]
        LDR     q1, [x11], 16
        FMLA    v27.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        LDR     q2, [x12], 16
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        LDR     q3, [x4], 16
        FMLA    v31.4s, v11.4s, v7.s[1]
        FMLA    v24.4s, v12.4s, v4.s[2]
        FMLA    v25.4s, v13.4s, v4.s[2]
        FMLA    v26.4s, v12.4s, v5.s[2]
        FMLA    v27.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]
        FMLA    v24.4s, v14.4s, v4.s[3]
        FMLA    v25.4s, v15.4s, v4.s[3]
        FMLA    v26.4s, v14.4s, v5.s[3]
        FMLA    v27.4s, v15.4s, v5.s[3]
        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        SUBS    x0, x0, 32              // k -= 32, sets the flags for B.HS
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]
        B.HS    1b

2:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v24.4s, v16.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        LDR     q4, [x3], 16
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        LDR     q5, [x11], 16
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        LDR     q6, [x12], 16
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        LDR     q7, [x4], 16
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, noloads
        FMLA    v24.4s, v8.4s, v4.s[0]
        FMLA    v25.4s, v9.4s, v4.s[0]
        FMLA    v26.4s, v8.4s, v5.s[0]
        FMLA    v27.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        FMLA    v31.4s, v9.4s, v7.s[0]

        FMLA    v24.4s, v10.4s, v4.s[1]
        FMLA    v25.4s, v11.4s, v4.s[1]
        FMLA    v26.4s, v10.4s, v5.s[1]
        FMLA    v27.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        FMLA    v31.4s, v11.4s, v7.s[1]

        FMLA    v24.4s, v12.4s, v4.s[2]
        FMLA    v25.4s, v13.4s, v4.s[2]
        FMLA    v26.4s, v12.4s, v5.s[2]
        FMLA    v27.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]

        FMLA    v24.4s, v14.4s, v4.s[3]
        FMLA    v25.4s, v15.4s, v4.s[3]
        FMLA    v26.4s, v14.4s, v5.s[3]
        FMLA    v27.4s, v15.4s, v5.s[3]

        # Load min/max values
        # v4/v5 held A data above; the remaining FMLAs only read v6/v7,
        # so the clamp values can be restored here.
        LD2R    {v4.4s, v5.4s}, [x8]

        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]

3:
        # Remainder - 4 floats of A (16 bytes)
        # Only multiples of 32 were subtracted from x0, so bits 4, 3 and 2
        # still match those of kc.
        TBZ     x0, 4, 4f

        LDR     q0, [x3], 16
        LDP     q16, q17, [x5], 32
        LDR     q1, [x11], 16
        LDR     q2, [x12], 16
        LDR     q3, [x4], 16
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        FMLA    v31.4s, v19.4s, v3.s[1]
        FMLA    v24.4s, v20.4s, v0.s[2]
        FMLA    v25.4s, v21.4s, v0.s[2]
        FMLA    v26.4s, v20.4s, v1.s[2]
        FMLA    v27.4s, v21.4s, v1.s[2]
        FMLA    v28.4s, v20.4s, v2.s[2]
        FMLA    v29.4s, v21.4s, v2.s[2]
        FMLA    v30.4s, v20.4s, v3.s[2]
        FMLA    v31.4s, v21.4s, v3.s[2]
        FMLA    v24.4s, v22.4s, v0.s[3]
        FMLA    v25.4s, v23.4s, v0.s[3]
        FMLA    v26.4s, v22.4s, v1.s[3]
        FMLA    v27.4s, v23.4s, v1.s[3]
        FMLA    v28.4s, v22.4s, v2.s[3]
        FMLA    v29.4s, v23.4s, v2.s[3]
        FMLA    v30.4s, v22.4s, v3.s[3]
        FMLA    v31.4s, v23.4s, v3.s[3]

4:
        # Remainder - 2 floats of A (8 bytes)
        TBZ     x0, 3, 5f

        LDR     d0, [x3], 8
        LDP     q16, q17, [x5], 32
        LDR     d1, [x11], 8
        LDR     d2, [x12], 8
        LDR     d3, [x4], 8
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]
        FMLA    v24.4s, v18.4s, v0.s[1]
        FMLA    v25.4s, v19.4s, v0.s[1]
        FMLA    v26.4s, v18.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[1]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v3.s[1]
        FMLA    v31.4s, v19.4s, v3.s[1]

5:
        # Remainder - 1 float of A (4 bytes)
        TBZ     x0, 2, 6f

        LDR     s0, [x3], 4
        LDP     q16, q17, [x5], 32
        LDR     s1, [x11], 4
        LDR     s2, [x12], 4
        LDR     s3, [x4], 4
        FMLA    v24.4s, v16.4s, v0.s[0]
        FMLA    v25.4s, v17.4s, v0.s[0]
        FMLA    v26.4s, v16.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[0]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v3.s[0]
        FMLA    v31.4s, v17.4s, v3.s[0]

6:
        # Clamp
        FMAX    v24.4s, v24.4s, v4.4s
        SUBS    x1, x1, 8               // nc -= 8, flags used by B.LO / B.HI
        FMAX    v25.4s, v25.4s, v4.4s
        FMAX    v26.4s, v26.4s, v4.4s
        FMAX    v27.4s, v27.4s, v4.4s
        FMAX    v28.4s, v28.4s, v4.4s
        FMAX    v29.4s, v29.4s, v4.4s
        FMAX    v30.4s, v30.4s, v4.4s
        FMAX    v31.4s, v31.4s, v4.4s
        FMIN    v24.4s, v24.4s, v5.4s
        FMIN    v25.4s, v25.4s, v5.4s
        FMIN    v26.4s, v26.4s, v5.4s
        FMIN    v27.4s, v27.4s, v5.4s
        FMIN    v28.4s, v28.4s, v5.4s
        FMIN    v29.4s, v29.4s, v5.4s
        FMIN    v30.4s, v30.4s, v5.4s
        FMIN    v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        B.LO    7f                      // fewer than 8 columns were left

        STP     q24, q25, [x6]
        SUB     x3, x3, x2              // a0 -= kc
        ADD     x6, x6, x14             // c0 += cn_stride
        STP     q26, q27, [x9]
        SUB     x11, x11, x2            // a1 -= kc
        ADD     x9, x9, x14             // c1 += cn_stride
        STP     q28, q29, [x10]
        SUB     x12, x12, x2            // a2 -= kc
        ADD     x10, x10, x14           // c2 += cn_stride
        STP     q30, q31, [x7]
        SUB     x4, x4, x2              // a3 -= kc
        ADD     x7, x7, x14             // c3 += cn_stride

        // Flags still come from SUBS x1, x1, 8: loop while columns remain.
        B.HI    0b

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Store odd width
        # x1 holds nc - 8 here; its bits 2, 1 and 0 equal those of nc.
7:
        TBZ     x1, 2, 8f
        STR     q24, [x6], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x9], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x10], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b

8:
        TBZ     x1, 1, 9f
        STR     d24, [x6], 8
        STR     d26, [x9], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x10], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s24, [x6]
        STR     s26, [x9]
        STR     s28, [x10]
        STR     s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif