// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

# Overview (derived from the code below):
#   - kc and ks are consumed as byte counts: kc in steps of 16 bytes
#     (4 floats) and ks in steps of 32 bytes (4 A pointers).
#   - For each group of up to 8 output columns the accumulators are seeded
#     from w, then every group of 4 A pointers contributes kc bytes of
#     multiply-accumulate, and the result is clamped and stored.
#   - NOTE(review): the remainder paths assume kc is a nonzero multiple of
#     4 bytes - confirm against the caller.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches only x0-x17, v0-v5 and v16-v31, so nothing is saved
# and no stack frame is created.

# A pointers
# x8  a0 (holds the params pointer until min/max have been loaded)
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Counters
# x0  k, remaining kc bytes (mr is dead once the C pointers are clamped)
# x9  p, remaining ks bytes

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# B   v20 v21 v22 v23
# B   v16 v17 v18 v19
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v4 v5

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Clamp C pointers
        # Rows beyond mr alias the previous row, so their stores stay inside
        # rows that exist.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load min/max values
        # v4 is the lower bound (used with FMAX), v5 the upper bound (FMIN).
        LD2R {v4.4s, v5.4s}, [x8]

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2

0:
        # Load initial bias from w into accumulators
        # All four rows start from the same 8 bias values.
        LDP q24, q25, [x5], 32
        MOV v26.16b, v24.16b
        MOV v27.16b, v25.16b
        MOV v28.16b, v24.16b
        MOV v29.16b, v25.16b
        MOV v30.16b, v24.16b
        MOV v31.16b, v25.16b

        MOV x9, x3  // p = ks

1:
        # Load next 4 A pointers
        LDP x8, x13, [x4], 16
        LDP x14, x15, [x4], 16

        # A pointer equal to zero is used as is; any other pointer gets
        # a_offset added.
        CMP x8, x12                // if a0 == zero
        ADD x8, x8, x11            // a0 += a_offset
        CSEL x8, x12, x8, EQ       //   a0 = zero, else a0 + a_offset
        CMP x13, x12               // if a1 == zero
        ADD x13, x13, x11          // a1 += a_offset
        CSEL x13, x12, x13, EQ     //   a1 = zero, else a1 + a_offset
        CMP x14, x12               // if a2 == zero
        ADD x14, x14, x11          // a2 += a_offset
        CSEL x14, x12, x14, EQ     //   a2 = zero, else a2 + a_offset
        CMP x15, x12               // if a3 == zero
        ADD x15, x15, x11          // a3 += a_offset
        CSEL x15, x12, x15, EQ     //   a3 = zero, else a3 + a_offset

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 4f

        # Main loop - 4 floats of A (16 bytes)
        # Each pass reads 16 bytes from every A row and 128 bytes of w.
2:
        LDR q0, [x8], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        FMLA v24.4s, v20.4s, v0.s[0]
        FMLA v25.4s, v21.4s, v0.s[0]
        FMLA v26.4s, v20.4s, v1.s[0]
        FMLA v27.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v22.4s, v0.s[1]
        FMLA v25.4s, v23.4s, v0.s[1]
        FMLA v26.4s, v22.4s, v1.s[1]
        FMLA v27.4s, v23.4s, v1.s[1]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v24.4s, v16.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v0.s[2]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[2]
        FMLA v29.4s, v17.4s, v2.s[2]
        FMLA v30.4s, v16.4s, v3.s[2]
        FMLA v31.4s, v17.4s, v3.s[2]
        FMLA v24.4s, v18.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v0.s[3]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v28.4s, v18.4s, v2.s[3]
        FMLA v29.4s, v19.4s, v2.s[3]
        SUBS x0, x0, 16
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 2b

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # x0 differs from kc by a multiple of 16, so its low 4 bits are the
        # leftover byte count.
        TST x0, 15
        B.NE 4f
3:
        # ks loop
        SUBS x9, x9, 32  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v24.4s, v24.4s, v4.4s
        FMAX v25.4s, v25.4s, v4.4s
        FMAX v26.4s, v26.4s, v4.4s
        FMAX v27.4s, v27.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v24.4s, v24.4s, v5.4s
        FMIN v25.4s, v25.4s, v5.4s
        FMIN v26.4s, v26.4s, v5.4s
        FMIN v27.4s, v27.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        SUBS x1, x1, 8
        B.LO 6f

        # Rows are written from c3 down to c0; each C pointer then advances
        # by cn_stride.
        STP q30, q31,  [x7]
        ADD x7, x7, x10
        STP q28, q29, [x17]
        ADD x17, x17, x10
        STP q26, q27, [x16]
        ADD x16, x16, x10
        STP q24, q25,  [x6]
        ADD x6,  x6, x10

        SUB x4, x4, x3  // a -= ks

        # nc loop
        # The flags tested here are still those of SUBS x1, x1, 8; none of
        # the STP/ADD/SUB instructions in between set flags.
        B.HI 0b
        RET

        # Remainder- 2 floats of A (8 bytes)
4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0,  [x8], 8
        LDR d1, [x13], 8
        LDR d2, [x14], 8
        LDR d3, [x15], 8
        FMLA v24.4s, v20.4s, v0.s[0]
        FMLA v25.4s, v21.4s, v0.s[0]
        FMLA v26.4s, v20.4s, v1.s[0]
        FMLA v27.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v24.4s, v22.4s, v0.s[1]
        FMLA v25.4s, v23.4s, v0.s[1]
        FMLA v26.4s, v22.4s, v1.s[1]
        FMLA v27.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 3b

        # Remainder- 1 float of A
5:
        LDR s0,  [x8], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x13], 4
        LDR s2, [x14], 4
        LDR s3, [x15], 4
        FMLA v24.4s, v20.4s, v0.s[0]
        FMLA v25.4s, v21.4s, v0.s[0]
        FMLA v26.4s, v20.4s, v1.s[0]
        FMLA v27.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        B 3b

        # Store odd width
        # x1 is now nc - 8; bits 2, 1 and 0 select a 4, 2 and 1 column store.
        # After each partial store the not yet written columns are moved
        # down into the low lanes.
6:
        TBZ x1, 2, 7f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x17], 16
        MOV v28.16b, v29.16b
        STR q26, [x16], 16
        MOV v26.16b, v27.16b
        STR q24,  [x6], 16
        MOV v24.16b, v25.16b

7:
        TBZ x1, 1, 8f
        STR d30,  [x7], 8
        STR d28, [x17], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x16], 8
        STR d24,  [x6], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]

8:
        TBZ x1, 0, 9f
        STR s30,  [x7]
        STR s28, [x17]
        STR s26, [x16]
        STR s24,  [x6]
9:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif