// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# 6x8 f32 indirect GEMM (IGEMM) microkernel with min/max clamping for
# AArch64 NEON+FMA, loading A 64 bits (2 floats) at a time.
#
# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64(
#     size_t mr,                x0      (also reused as k counter, then cn_stride)
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     size_t ks,                x3 / x9
#     const float**restrict a,  x4      (array of ks/8 * mr indirection pointers)
#     const void*restrict w,    x5      (packed bias + weights)
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     size_t a_offset,          [sp + 8] -> x11
#     const float* zero,        [sp + 16] -> x12  (row-padding buffer; see CSEL…EQ below)
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.
# Only x20-x23 are clobbered here, so only they are saved/restored.

# A pointers (one per output row)
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers (one per output row; clamped together for mr < 6)
# x6  c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
# x7  c5   (cm_stride is dead after address setup, so x7 is reused)

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# A4  v4
# A5  v5
# B   v16 v17 v18 v19
# C   v20 v21   (row 0)
# C   v22 v23   (row 1)
# C   v24 v25   (row 2)
# C   v26 v27   (row 3)
# C   v28 v29   (row 4)
# C   v30 v31   (row 5)
# Clamp v6 v7   (v6 = min via FMAX, v7 = max via FMIN)
# unused A v8 v9 v10 v11
# unused B v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64

        # Load zero, params pointer (offsets relative to entry sp,
        # i.e. before the callee-saved push below)
        LDP x12, x8, [sp, 16]

        # Clamp C pointers: rows beyond mr alias the previous row's pointer,
        # so out-of-range rows harmlessly overwrite duplicate data.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load min/max values: LD2R de-interleaves two consecutive floats from
        # params and broadcasts them — v6 = min (all lanes), v7 = max (all lanes)
        LD2R {v6.4s, v7.4s}, [x8]

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        # Save x20,x21,x22,x23 on stack (pre-decrement shifts the caller's
        # stack arguments by +32 for all later sp-relative loads)
        STP x20, x21, [sp, -32]!

        CMP x0, 4                // if mr < 4
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2

        STP x22, x23, [sp, 16]

        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x13, x10, x13, LS   //   c4 = c3

        # Load a_offset (was [sp + 8] at entry; +32 after the push)
        LDR x11, [sp, 40]

        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x7, x13, x7, LO     //   c5 = c4

0:
        # Load initial bias from w into accumulators; all 6 rows start from
        # the same 8 bias values (2 quads)
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3  // p = ks

1:
        # Load next 6 A pointers from the indirection buffer
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        # For each row: if the indirection pointer equals `zero`, use the zero
        # buffer as-is (padding row, contributes bias only through zero A);
        # otherwise apply a_offset.  ADD before CSEL keeps the flags from CMP.
        CMP x14, x12             // if a0 == zero
        ADD x14, x14, x11        // a0 += a_offset
        CSEL x14, x12, x14, EQ   //   a0 = zero, else a0 += a_offset
        CMP x15, x12             // if a1 == zero
        ADD x15, x15, x11        // a1 += a_offset
        CSEL x15, x12, x15, EQ   //   a1 = zero, else a1 += a_offset
        CMP x20, x12             // if a2 == zero
        ADD x20, x20, x11        // a2 += a_offset
        CSEL x20, x12, x20, EQ   //   a2 = zero, else a2 += a_offset
        CMP x21, x12             // if a3 == zero
        ADD x21, x21, x11        // a3 += a_offset
        CSEL x21, x12, x21, EQ   //   a3 = zero, else a3 += a_offset
        CMP x22, x12             // if a4 == zero
        ADD x22, x22, x11        // a4 += a_offset
        CSEL x22, x12, x22, EQ   //   a4 = zero, else a4 += a_offset
        CMP x23, x12             // if a5 == zero
        ADD x23, x23, x11        // a5 += a_offset
        CSEL x23, x12, x23, EQ   //   a5 = zero, else a5 += a_offset

        # Is there at least 2 floats (8 bytes) for main loop?
        SUBS x0, x2, 8  // k = kc - 8
        B.LO 4f

        # Main loop - 2 floats of A (8 bytes) per iteration
        # 24 FMA + 6 LD64 A + 2 LDP B; B loads and the SUBS are interleaved
        # with the FMAs to hide latency
2:
        LDR d0, [x14], 8
        LDP q16, q17, [x5], 32
        LDR d1, [x15], 8
        LDR d2, [x20], 8
        LDR d3, [x21], 8
        LDR d4, [x22], 8
        LDR d5, [x23], 8
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v1.s[1]
        FMLA v24.4s, v18.4s, v2.s[1]
        FMLA v26.4s, v18.4s, v3.s[1]
        FMLA v28.4s, v18.4s, v4.s[1]
        FMLA v30.4s, v18.4s, v5.s[1]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v1.s[1]
        FMLA v25.4s, v19.4s, v2.s[1]
        FMLA v27.4s, v19.4s, v3.s[1]
        SUBS x0, x0, 8           // flags survive the two FMLAs below
        FMLA v29.4s, v19.4s, v4.s[1]
        FMLA v31.4s, v19.4s, v5.s[1]
        B.HS 2b

        # Is there a remainder? - 1 float of A (4 bytes).  x0 has underflowed
        # past zero; bit 2 of the residual is set iff 4 bytes of kc remain.
        TBNZ x0, 2, 4f

3:
        # ks loop: advance to the next group of 6 indirection pointers
        SUBS x9, x9, 48  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp (FMAX applies the lower bound v6, FMIN the upper bound v7);
        # cn_stride load and nc decrement are interleaved with the clamps
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride (was [sp] at entry; +32 after the push)
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags consumed by B.LO / B.HI below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8 (nc went negative -> partial-width path)
        B.LO 5f

        STP q30, q31, [x7]
        ADD x7, x7, x0
        STP q28, q29, [x13]
        ADD x13, x13, x0
        STP q26, q27, [x10]
        ADD x10, x10, x0
        STP q24, q25, [x17]
        ADD x17, x17, x0
        STP q22, q23, [x16]
        ADD x16, x16, x0
        STP q20, q21, [x6]
        ADD x6, x6, x0

        SUB x4, x4, x3  // a -= ks  (rewind indirection buffer for next nc tile)

        # nc loop
        B.HI 0b

        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 16]
        LDP x20, x21, [sp], 32
        RET

        # Remainder - 1 float of A (4 bytes)
4:
        LDR s0, [x14], 4
        LDP q16, q17, [x5], 32
        LDR s1, [x15], 4
        LDR s2, [x20], 4
        LDR s3, [x21], 4
        LDR s4, [x22], 4
        LDR s5, [x23], 4
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]
        B 3b

        # Store odd width: test nc remainder bits 2/1/0, storing 4, then 2,
        # then 1 floats per row.  After each partial store the surviving data
        # is shuffled down (MOV / DUP …d[1]) so the next stage stores from the
        # low lanes.
5:
        TBZ x1, 2, 6f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
6:
        TBZ x1, 1, 7f
        STR d30, [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

7:
        TBZ x1, 0, 8f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x10]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
8:
        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 16]
        LDP x20, x21, [sp], 32
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif