// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                  x0
#     size_t nc,                  x1
#     size_t kc,                  x2 / x0
#     const void*restrict a,      x3
#     size_t a_stride,            x4
#     const void*restrict w,      x5
#     void*restrict c,            x6
#     size_t cm_stride,           x7
#     size_t cn_stride,           [sp] -> (x8)
#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.
# NOTE: this kernel uses none of the callee-saved registers, so it needs no
# stack frame of its own.

# A pointers (one per row of the 6-row tile)
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5   (a_stride is dead after pointer setup, so x4 is reused for a5)

# C pointers (one per row of the 6-row tile)
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5   (cm_stride is dead after pointer setup, so x7 is reused for c5)

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# A4  v4
# A5  v5
# B   v16 v17 v18 v19
# C   v20
# C   v22
# C   v24
# C   v26
# C   v28
# C   v30
# Clamp v6, (v4), (v5)   -- v4/v5 are reused for min/max only after the last
#                           A loads of a tile, when the A values are dead
# unused A  v8 v9 v10 v11
# unused B  v12 v13 v14 v15


BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        # Load params pointer
        LDR x8, [sp, 8]

        # Clamp A and C pointers: rows beyond mr alias the previous row, so
        # out-of-range rows compute (and later store) duplicate data harmlessly.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load params: fp16 min in v6.h[0], fp16 max in v6.h[1]
        LDR s6, [x8]

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still live from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still live from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride (x4 repurposed)
        ADD x7, x13, x7          // c5 = c4 + cm_stride (x7 repurposed)
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        LDR x8, [sp]             // load cn_stride (params pointer no longer needed)

# Outer loop over nc: one iteration per 8-wide column tile of C.
0:
        # Load initial bias from w into accumulators
        LDR q20, [x5], 16
        MOV v22.16b, v20.16b
        MOV v24.16b, v20.16b
        MOV v26.16b, v20.16b
        MOV v28.16b, v20.16b
        MOV v30.16b, v20.16b

        # Is there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 3f

        # Main loop - 4 halffloats of A (8 bytes)
        # 24 FMA + 6 ld64 A + 4 LDR B
1:
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        SUBS x0, x0, 8
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        FMLA v20.8h, v18.8h, v0.h[2]
        FMLA v22.8h, v18.8h, v1.h[2]
        FMLA v24.8h, v18.8h, v2.h[2]
        FMLA v26.8h, v18.8h, v3.h[2]
        FMLA v28.8h, v18.8h, v4.h[2]
        FMLA v30.8h, v18.8h, v5.h[2]
        FMLA v20.8h, v19.8h, v0.h[3]
        FMLA v22.8h, v19.8h, v1.h[3]
        FMLA v24.8h, v19.8h, v2.h[3]
        FMLA v26.8h, v19.8h, v3.h[3]
        FMLA v28.8h, v19.8h, v4.h[3]
        FMLA v30.8h, v19.8h, v5.h[3]
        B.HS 1b

        # At loop exit x0 = (kc mod 8) - 8, i.e. negative, but bits 1-2 of the
        # two's-complement value still encode the 4-byte / 2-byte remainders.
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 5f
2:
        # Clamp (v4/v5 reused as broadcast min/max; the A values are dead here)
        DUP v4.8h, v6.h[0]
        DUP v5.8h, v6.h[1]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        SUBS x1, x1, 8           // nc -= 8; also sets flags for the store path
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h

        # Store full 6 x 8
        B.LO 6f                  // nc < 8: partial-width store

        ST1 {v20.16b}, [x6], x8
        SUB x3, x3, x2           // a0 -= kc (rewind A for the next column tile)
        ST1 {v22.16b}, [x16], x8
        SUB x9, x9, x2           // a1 -= kc
        ST1 {v24.16b}, [x17], x8
        SUB x10, x10, x2         // a2 -= kc
        ST1 {v26.16b}, [x14], x8
        SUB x11, x11, x2         // a3 -= kc
        ST1 {v28.16b}, [x13], x8
        SUB x12, x12, x2         // a4 -= kc
        ST1 {v30.16b}, [x7], x8
        SUB x4, x4, x2           // a5 -= kc

        B.HI 0b                  // more columns remain (nc was > 8)
        RET

# kc < 8: dispatch directly on the remainder bits (x0 = kc - 8, negative).
3:
        TBZ x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4

        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]
        TBZ x0, 1, 2b            // no 2-byte remainder left -> clamp/store

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        B 2b

        # Store odd width (nc < 8): store 4, then 2, then 1 halffloat(s),
        # shifting the remaining lanes down after each partial store.
6:
        TBZ x1, 2, 7f
        STR d20, [x6], 8
        STR d22, [x16], 8
        DUP d20, v20.d[1]
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        STR d26, [x14], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        STR d30, [x7], 8
        DUP d28, v28.d[1]
        DUP d30, v30.d[1]

7:
        TBZ x1, 1, 8f
        STR s20, [x6], 4
        STR s22, [x16], 4
        DUP s20, v20.s[1]
        DUP s22, v22.s[1]
        STR s24, [x17], 4
        STR s26, [x14], 4
        DUP s24, v24.s[1]
        DUP s26, v26.s[1]
        STR s28, [x13], 4
        STR s30, [x7], 4
        DUP s28, v28.s[1]
        DUP s30, v30.s[1]

8:
        TBZ x1, 0, 9f
        STR h20, [x6]
        STR h22, [x16]
        STR h24, [x17]
        STR h26, [x14]
        STR h28, [x13]
        STR h30, [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif