// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x8
#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Purpose: computes a tile of up to 6 rows by 16 columns of half-precision
# C = clamp(bias + A * B), looping over nc in steps of 16 columns.
#
# Notes on argument registers, as used by the code below:
#   x0  holds mr only during pointer setup; from label 0 onward it is the
#       remaining-k counter in bytes (k = kc - 4 at the top of each column tile).
#   x4  a_stride is consumed while deriving the A row pointers, then becomes A5.
#   x7  cm_stride is consumed while deriving the C row pointers, then becomes C5.
#   x8  first holds the params pointer (only until s6 is loaded), then cn_stride.
#   kc  is counted in bytes, 2 bytes per halffloat.
#   nc  is counted in output columns, 16 per pass through label 0.
#   w   is read sequentially: 32 bytes of bias, then 32 bytes of B per halffloat of k.
# NOTE(review): the k handling assumes kc is a nonzero multiple of 2 bytes - confirm against callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers are touched here, so nothing is saved or restored.

# Register usage
# A0   x3  v0
# A1   x9  v1
# A2  x10  v2
# A3  x11  v3
# A4  x12  v4
# A5   x4  v5

# B    x5  v16 v17 v18 v19

# C0   x6  v20 v21
# C1  x16  v22 v23
# C2  x17  v24 v25
# C3  x14  v26 v27
# C4  x13  v28 v29
# C5   x7  v30 v31

# Clamp v6, (v4), (v5)
#   v6.h[0] is the lower bound (applied with FMAX), v6.h[1] the upper bound
#   (applied with FMIN). v4 and v5 are reused as the broadcast bounds once the
#   A4 and A5 values they carried are no longer needed.
#   NOTE(review): presumably params is laid out as {min, max} fp16 - confirm.
# unused v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55

        # Load params pointer
        LDR     x8, [sp, 8]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row, so they load and store
        # the same addresses as that row. Each CMP feeds two pairs of CSEL
        # (LO, then LS); the ADDs in between do not modify the flags.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load params
        # One 32-bit load brings in both fp16 clamp bounds (v6.h[0], v6.h[1]).
        LDR     s6, [x8]

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride (x4 stops being a_stride)
        ADD     x7, x13, x7             // c5 = c4 + cm_stride (x7 stops being cm_stride)
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        LDR     x8, [sp]                // load cn_stride (x8 no longer points at params)

        # Top of the column loop: one pass produces up to 16 output columns.
0:
        # Load initial bias from w into accumulators
        # The same 16 bias values seed all six rows.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        # Are there at least 2 halffloats (4 bytes)?
        # If not, only the single-halffloat remainder at label 4 runs.
        SUBS    x0, x2, 4               // k = kc - 4
        B.LO    4f

        # Prologue - load 4 A and 2 B
        # A4 and A5 are loaded inside the loop body instead.

        LDR     s0, [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     s1, [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4

        # Are there at least 2 more halffloats for the main loop?
        # If not, the values loaded above are consumed by the epilogue.
        SUBS    x0, x0, 4
        B.LO    2f

        .p2align 3
        # Main loop - 2 halffloats of A (4 bytes)
        # Per iteration: 24 FMA + 6 ld32 A + 4 B vectors, each B vector filled
        # in two 64-bit halves (LDR d for the low half, LD1 {v.d}[1] for the high half).
        # The loads fetch operands for a later FMLA group, not the adjacent one.
        # NOTE(review): the interleaving looks hand-scheduled for the target
        # core - keep the instruction order when editing.
1:
        FMLA    v20.8h, v16.8h, v0.h[0]
        LDR     s4, [x12], 4            // A4
        FMLA    v21.8h, v17.8h, v0.h[0]
        LDR     s5, [x4], 4             // A5
        FMLA    v22.8h, v16.8h, v1.h[0]
        LDR     d18, [x5], 8            // B0
        FMLA    v23.8h, v17.8h, v1.h[0]
        LD1     {v18.d}[1], [x5], 8     // B1
        FMLA    v24.8h, v16.8h, v2.h[0]
        LDR     d19, [x5], 8            // B2
        FMLA    v25.8h, v17.8h, v2.h[0]
        LD1     {v19.d}[1], [x5], 8     // B3
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v27.8h, v17.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v29.8h, v17.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]
        FMLA    v31.8h, v17.8h, v5.h[0]
        SUBS    x0, x0, 4               // flags are consumed by B.HS at the loop bottom

        FMLA    v20.8h, v18.8h, v0.h[1]
        LDR     d16, [x5], 8            // B0
        FMLA    v21.8h, v19.8h, v0.h[1]
        LD1     {v16.d}[1], [x5], 8     // B1
        FMLA    v22.8h, v18.8h, v1.h[1]
        LDR     d17, [x5], 8            // B2
        FMLA    v23.8h, v19.8h, v1.h[1]
        LD1     {v17.d}[1], [x5], 8     // B3
        FMLA    v24.8h, v18.8h, v2.h[1]
        FMLA    v25.8h, v19.8h, v2.h[1]
        FMLA    v26.8h, v18.8h, v3.h[1]
        FMLA    v27.8h, v19.8h, v3.h[1]
        LDR     s0, [x3], 4             // A0
        FMLA    v28.8h, v18.8h, v4.h[1]
        LDR     s1, [x9], 4             // A1
        FMLA    v29.8h, v19.8h, v4.h[1]
        LDR     s2, [x10], 4            // A2
        FMLA    v30.8h, v18.8h, v5.h[1]
        LDR     s3, [x11], 4            // A3
        FMLA    v31.8h, v19.8h, v5.h[1]
        B.HS    1b

        # Epilogue - same as main loop but no loads for next loop
        # It still loads A4, A5 and the second B pair (v18, v19) that belong to
        # the 2 halffloats being processed here.
2:
        FMLA    v20.8h, v16.8h, v0.h[0]
        LDR     s4, [x12], 4            // A4
        FMLA    v21.8h, v17.8h, v0.h[0]
        LDR     s5, [x4], 4             // A5
        FMLA    v22.8h, v16.8h, v1.h[0]
        LDR     d18, [x5], 8            // B0
        FMLA    v23.8h, v17.8h, v1.h[0]
        LD1     {v18.d}[1], [x5], 8     // B1
        FMLA    v24.8h, v16.8h, v2.h[0]
        LDR     d19, [x5], 8            // B2
        FMLA    v25.8h, v17.8h, v2.h[0]
        LD1     {v19.d}[1], [x5], 8     // B3
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v27.8h, v17.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v29.8h, v17.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]
        FMLA    v31.8h, v17.8h, v5.h[0]

        FMLA    v20.8h, v18.8h, v0.h[1]
        FMLA    v21.8h, v19.8h, v0.h[1]
        FMLA    v22.8h, v18.8h, v1.h[1]
        FMLA    v23.8h, v19.8h, v1.h[1]
        FMLA    v24.8h, v18.8h, v2.h[1]
        FMLA    v25.8h, v19.8h, v2.h[1]
        FMLA    v26.8h, v18.8h, v3.h[1]
        FMLA    v27.8h, v19.8h, v3.h[1]
        FMLA    v28.8h, v18.8h, v4.h[1]
        FMLA    v29.8h, v19.8h, v4.h[1]
        FMLA    v30.8h, v18.8h, v5.h[1]
        FMLA    v31.8h, v19.8h, v5.h[1]

        # Is there a remainder? - 1 halffloat of A (2 bytes)
        # x0 is now (bytes of k left) - 4, so bit 1 is set exactly when 2 bytes
        # are left (given kc is a multiple of 2).
        TBNZ    x0, 1, 4f

        # Reached by fall-through from the epilogue or by the branch back from
        # the remainder block at label 4.
3:
        # Clamp
        # v4 and v5 no longer carry A4 and A5, so they hold the broadcast bounds.
        DUP     v4.8h, v6.h[0]
        DUP     v5.8h, v6.h[1]
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO and B.HI below
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        # Fewer than 16 columns were left: take the partial store at label 5.
        B.LO    5f

        # Each C pointer advances by cn_stride; each A pointer is rewound by kc
        # so the next column tile re-reads the same rows of A. None of the
        # instructions below modify the flags set by SUBS x1 above.
        ST1     {v20.16b, v21.16b}, [x6], x8
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x8
        SUB     x9, x9, x2              // a1 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x8
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x8
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x8
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v30.16b, v31.16b}, [x7], x8
        SUB     x4, x4, x2              // a5 -= kc

        B.HI    0b                      // more columns remain (nc > 0)
        RET

4:
        # Remainder - 1 halffloat of A (2 bytes)
        # Uses only lane h[0] of each A register and one 32-byte row of B,
        # then rejoins the clamp at label 3.
        LDR     h0, [x3], 2             // A0
        LDR     q16, [x5], 16           // B
        LDR     q17, [x5], 16           // B
        FMLA    v20.8h, v16.8h, v0.h[0]
        LDR     h1, [x9], 2             // A1
        FMLA    v22.8h, v16.8h, v1.h[0]
        LDR     h2, [x10], 2            // A2
        FMLA    v24.8h, v16.8h, v2.h[0]
        LDR     h3, [x11], 2            // A3
        FMLA    v26.8h, v16.8h, v3.h[0]
        LDR     h4, [x12], 2            // A4
        FMLA    v28.8h, v16.8h, v4.h[0]
        LDR     h5, [x4], 2             // A5
        FMLA    v30.8h, v16.8h, v5.h[0]
        FMLA    v21.8h, v17.8h, v0.h[0]
        FMLA    v23.8h, v17.8h, v1.h[0]
        FMLA    v25.8h, v17.8h, v2.h[0]
        FMLA    v27.8h, v17.8h, v3.h[0]
        FMLA    v29.8h, v17.8h, v4.h[0]
        FMLA    v31.8h, v17.8h, v5.h[0]
        B       3b

        # Store odd width
        # x1 is (columns left) - 16 here, so its low 4 bits equal the number of
        # columns left. Each set bit stores 8, 4, 2 or 1 columns per row and
        # then moves the not yet stored columns down into the low lanes.
5:
        TBZ     x1, 3, 6f               // 8 columns (16 bytes) per row?
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b

6:
        TBZ     x1, 2, 7f               // 4 columns (8 bytes) per row?
        STR     d20, [x6], 8
        STR     d22, [x16], 8
        DUP     d20, v20.d[1]
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        STR     d26, [x14], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x13], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

7:
        TBZ     x1, 1, 8f               // 2 columns (4 bytes) per row?
        STR     s20, [x6], 4
        STR     s22, [x16], 4
        DUP     s20, v20.s[1]
        DUP     s22, v22.s[1]
        STR     s24, [x17], 4
        STR     s26, [x14], 4
        DUP     s24, v24.s[1]
        DUP     s26, v26.s[1]
        STR     s28, [x13], 4
        STR     s30, [x7], 4
        DUP     s28, v28.s[1]
        DUP     s30, v30.s[1]

8:
        TBZ     x1, 0, 9f               // 1 column (2 bytes) per row?
        // Final stores: no post-increment, the C pointers are not used again.
        STR     h20, [x6]
        STR     h22, [x16]
        STR     h24, [x17]
        STR     h26, [x14]
        STR     h28, [x13]
        STR     h30, [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif