// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# LINT.IfChange
# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t* restrict a, x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     uint8_t* restrict c,       x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x14
$if INC:
  #     const float* restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Overview (as implemented below):
#   - Rows beyond mr reuse the previous row A and C pointers (CSEL clamping).
#   - Label 0 is the loop over nc in steps of 8 columns.
#   - Accumulators v24-v31 start from the bias read from w, or from acc in
#     the INC variant.
#   - kc is a byte count.  The main loop consumes 32 bytes of A per row per
#     iteration, followed by 16, 8 and 4 byte remainders.
#   - Results are clamped with FMAX against v4 and FMIN against v5, the two
#     values loaded from params with LD2R.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0  v0  v4
# A1  v1  v5
# A2  v2  v6
# A3  v3  v7
# B   v8  v9 v10 v11
# B  v12 v13 v14 v15
# B  v16 v17 v18 v19
# B  v20 v21 v22 v23
# C  v24 v25
# C  v26 v27
# C  v28 v29
# C  v30 v31
# Clamp v4 v5
# Note: v4 v5 are shared between A data and the clamp values.  They are
# reloaded from params near the end of the epilogue.

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values
        # LD2R replicates the first float at x8 into v4 and the second into v5.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack
        # Stack arguments were read above, before sp is moved down by 64 bytes.
        STP d8, d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row so loads and stores stay
        # within the rows that exist.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Loop over nc, 8 columns per pass.
0:
        $if INC:
          # Load initial accumulators
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          # The same 8 bias floats seed all 4 rows.
          LDP q24, q25, [x5], 32
          MOV v26.16b, v24.16b
          MOV v27.16b, v25.16b
          MOV v28.16b, v24.16b
          MOV v29.16b, v25.16b
          MOV v30.16b, v24.16b
          MOV v31.16b, v25.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 3f

        # Prologue - loads 4 floats (16 bytes) of A per row and the first
        # 4 x 8 block of B (q16-q23).
        # Read first block of 4 A and B.
        LDR q0, [x3], 16
        LDP q16, q17, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        LDP q18, q19, [x5], 32
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32

        # Is there at least another 32 bytes of A?  If yes, do the main loop.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v24.4s, v16.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v25.4s, v17.4s, v0.s[0]
        FMLA v26.4s, v16.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        LDR q4, [x3], 16         // v4-v7 now carry A; clamp values are overwritten
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v19.4s, v3.s[1]
        FMLA v24.4s, v20.4s, v0.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        FMLA v25.4s, v21.4s, v0.s[2]
        FMLA v26.4s, v20.4s, v1.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]
        FMLA v27.4s, v21.4s, v1.s[2]
        FMLA v28.4s, v20.4s, v2.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v21.4s, v2.s[2]
        FMLA v30.4s, v20.4s, v3.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v21.4s, v3.s[2]
        FMLA v24.4s, v22.4s, v0.s[3]
        FMLA v25.4s, v23.4s, v0.s[3]
        FMLA v26.4s, v22.4s, v1.s[3]
        FMLA v27.4s, v23.4s, v1.s[3]
        FMLA v28.4s, v22.4s, v2.s[3]
        FMLA v29.4s, v23.4s, v2.s[3]
        FMLA v30.4s, v22.4s, v3.s[3]
        FMLA v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v24.4s, v8.4s, v4.s[0]
        LDP q16, q17, [x5], 32
        FMLA v25.4s, v9.4s, v4.s[0]
        FMLA v26.4s, v8.4s, v5.s[0]
        LDP q18, q19, [x5], 32
        FMLA v27.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q20, q21, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q22, q23, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v24.4s, v10.4s, v4.s[1]
        LDR q0, [x3], 16
        FMLA v25.4s, v11.4s, v4.s[1]
        FMLA v26.4s, v10.4s, v5.s[1]
        LDR q1, [x11], 16
        FMLA v27.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x12], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x4], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v24.4s, v12.4s, v4.s[2]
        FMLA v25.4s, v13.4s, v4.s[2]
        FMLA v26.4s, v12.4s, v5.s[2]
        FMLA v27.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v24.4s, v14.4s, v4.s[3]
        FMLA v25.4s, v15.4s, v4.s[3]
        FMLA v26.4s, v14.4s, v5.s[3]
        FMLA v27.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32          // k -= 32; flags are consumed by B.HS below
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]
        B.HS 1b

2:
        # Epilogue
        # Consumes the first block already held in v0-v3 / v16-v23, then loads
        # and consumes one more block without reading ahead any further.
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v24.4s, v16.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v25.4s, v17.4s, v0.s[0]
        FMLA v26.4s, v16.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v19.4s, v3.s[1]
        FMLA v24.4s, v20.4s, v0.s[2]
        FMLA v25.4s, v21.4s, v0.s[2]
        FMLA v26.4s, v20.4s, v1.s[2]
        FMLA v27.4s, v21.4s, v1.s[2]
        FMLA v28.4s, v20.4s, v2.s[2]
        FMLA v29.4s, v21.4s, v2.s[2]
        FMLA v30.4s, v20.4s, v3.s[2]
        FMLA v31.4s, v21.4s, v3.s[2]
        FMLA v24.4s, v22.4s, v0.s[3]
        FMLA v25.4s, v23.4s, v0.s[3]
        FMLA v26.4s, v22.4s, v1.s[3]
        FMLA v27.4s, v23.4s, v1.s[3]
        FMLA v28.4s, v22.4s, v2.s[3]
        FMLA v29.4s, v23.4s, v2.s[3]
        FMLA v30.4s, v22.4s, v3.s[3]
        FMLA v31.4s, v23.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, noloads
        FMLA v24.4s, v8.4s, v4.s[0]
        FMLA v25.4s, v9.4s, v4.s[0]
        FMLA v26.4s, v8.4s, v5.s[0]
        FMLA v27.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]

        FMLA v24.4s, v10.4s, v4.s[1]
        FMLA v25.4s, v11.4s, v4.s[1]
        FMLA v26.4s, v10.4s, v5.s[1]
        FMLA v27.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]

        FMLA v24.4s, v12.4s, v4.s[2]
        FMLA v25.4s, v13.4s, v4.s[2]
        FMLA v26.4s, v12.4s, v5.s[2]
        FMLA v27.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v24.4s, v14.4s, v4.s[3]
        FMLA v25.4s, v15.4s, v4.s[3]
        FMLA v26.4s, v14.4s, v5.s[3]
        FMLA v27.4s, v15.4s, v5.s[3]

        # Load min/max values
        # The FMLAs above were the last readers of v4 v5 as A data, so the
        # clamp values can be restored here.  The remaining FMLAs only read
        # v6 v7.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

3:
        # Remainder- 4 floats of A (16 bytes)
        # Only multiples of 32 were subtracted from kc to form x0, so bits 4..2
        # of x0 still equal those bits of kc.
        TBZ x0, 4, 4f

        LDR q0, [x3], 16
        LDP q16, q17, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v24.4s, v16.4s, v0.s[0]
        FMLA v25.4s, v17.4s, v0.s[0]
        LDP q18, q19, [x5], 32
        FMLA v26.4s, v16.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v29.4s, v17.4s, v2.s[0]
        LDP q22, q23, [x5], 32
        FMLA v30.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        FMLA v31.4s, v19.4s, v3.s[1]
        FMLA v24.4s, v20.4s, v0.s[2]
        FMLA v25.4s, v21.4s, v0.s[2]
        FMLA v26.4s, v20.4s, v1.s[2]
        FMLA v27.4s, v21.4s, v1.s[2]
        FMLA v28.4s, v20.4s, v2.s[2]
        FMLA v29.4s, v21.4s, v2.s[2]
        FMLA v30.4s, v20.4s, v3.s[2]
        FMLA v31.4s, v21.4s, v3.s[2]
        FMLA v24.4s, v22.4s, v0.s[3]
        FMLA v25.4s, v23.4s, v0.s[3]
        FMLA v26.4s, v22.4s, v1.s[3]
        FMLA v27.4s, v23.4s, v1.s[3]
        FMLA v28.4s, v22.4s, v2.s[3]
        FMLA v29.4s, v23.4s, v2.s[3]
        FMLA v30.4s, v22.4s, v3.s[3]
        FMLA v31.4s, v23.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        LDR d0, [x3], 8
        LDP q16, q17, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v24.4s, v16.4s, v0.s[0]
        FMLA v25.4s, v17.4s, v0.s[0]
        LDP q18, q19, [x5], 32
        FMLA v26.4s, v16.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v17.4s, v3.s[0]
        FMLA v24.4s, v18.4s, v0.s[1]
        FMLA v25.4s, v19.4s, v0.s[1]
        FMLA v26.4s, v18.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[1]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v3.s[1]
        FMLA v31.4s, v19.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ x0, 2, 6f

        LDR s0, [x3], 4
        LDP q16, q17, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v24.4s, v16.4s, v0.s[0]
        FMLA v25.4s, v17.4s, v0.s[0]
        FMLA v26.4s, v16.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[0]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v17.4s, v3.s[0]

6:
        # Clamp
        FMAX v24.4s, v24.4s, v4.4s
        SUBS x1, x1, 8           // nc -= 8; flags feed B.LO 7f and B.HI 0b below
        FMAX v25.4s, v25.4s, v4.4s
        FMAX v26.4s, v26.4s, v4.4s
        FMAX v27.4s, v27.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v24.4s, v24.4s, v5.4s
        FMIN v25.4s, v25.4s, v5.4s
        FMIN v26.4s, v26.4s, v5.4s
        FMIN v27.4s, v27.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # Fewer than 8 columns were left: take the partial store path.
        B.LO 7f

        # Each row is stored, its A pointer rewound by kc, and its C pointer
        # advanced by cn_stride.  None of these instructions modify the flags.
        $if INC:
          STP q30, q31, [x7]
          SUB x3, x3, x2          // a0 -= kc
          ADD x7, x7, x14
          STP q28, q29, [x10]
          SUB x11, x11, x2        // a1 -= kc
          ADD x10, x10, x14
          STP q26, q27, [x9]
          SUB x12, x12, x2        // a2 -= kc
          ADD x9, x9, x14
          STP q24, q25, [x6]
          SUB x4, x4, x2          // a3 -= kc
          ADD x6, x6, x14
        $else:
          STP q24, q25, [x6]
          SUB x3, x3, x2          // a0 -= kc
          ADD x6, x6, x14
          STP q26, q27, [x9]
          SUB x11, x11, x2        // a1 -= kc
          ADD x9, x9, x14
          STP q28, q29, [x10]
          SUB x12, x12, x2        // a2 -= kc
          ADD x10, x10, x14
          STP q30, q31, [x7]
          SUB x4, x4, x2          // a3 -= kc
          ADD x7, x7, x14

        # Columns remain (nc > 0 after the subtraction): next 8 columns.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        # Store odd width
        # x1 holds nc - 8 here, so its low 3 bits equal the number of columns
        # left (1..7).  Bits 2, 1 and 0 select stores of 4, 2 and 1 floats.
7:
        TBZ x1, 2, 8f
        # Store 4 floats per row, then move the upper 4 results down.
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q26, [x9], 16
          MOV v26.16b, v27.16b
          STR q24, [x6], 16
          MOV v24.16b, v25.16b
        $else:
          STR q24, [x6], 16
          MOV v24.16b, v25.16b
          STR q26, [x9], 16
          MOV v26.16b, v27.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        # Store 2 floats per row, then move the upper 64 bits down.
        $if INC:
          STR d30, [x7], 8
          STR d28, [x10], 8
          DUP d30, v30.d[1]
          DUP d28, v28.d[1]
          STR d26, [x9], 8
          STR d24, [x6], 8
          DUP d26, v26.d[1]
          DUP d24, v24.d[1]
        $else:
          STR d24, [x6], 8
          STR d26, [x9], 8
          DUP d24, v24.d[1]
          DUP d26, v26.d[1]
          STR d28, [x10], 8
          STR d30, [x7], 8
          DUP d28, v28.d[1]
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        # Store the last single float per row.
        $if INC:
          STR s30, [x7]
          STR s28, [x10]
          STR s26, [x9]
          STR s24, [x6]
        $else:
          STR s24, [x6]
          STR s26, [x9]
          STR s28, [x10]
          STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma${"_prfm" if PREFETCH else ""}_cortex_a75
# LINT.ThenChange(4x8-aarch64-neonfma-cortex-a75.cc)

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif