// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/4x16-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Computes a tile of up to 4 rows by 16 columns of half-precision outputs per
# pass of the outer loop: accumulators start from 16 bias values read from w,
# accumulate A[row][k] * W[k][0..15] with FMLA, are clamped to the two values
# read from params, and are stored to c.
# kc is a byte count: the k counter is decremented by the same number of bytes
# that each A pointer advances, and the A pointers are rewound by kc.
# x0 holds mr only until the row pointers are set up; it is then reused as the
# k counter.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches only x0-x12, x14, v0-v5 and v16-v31 and never writes to
# the stack, so it has no prologue or epilogue.

# Register usage
# A0  x3  v0
# A1 x11  v1
# A2 x12  v2
# A3  x4  v3

# B   x5 v20 v21 v22 v23 v16 v17 v18 v19

# C0  x6 v24 v25
# C1  x9 v26 v27
# C2 x10 v28 v29
# C3  x7 v30 v31

# Clamp v4 (lower bound, used by FMAX), v5 (upper bound, used by FMIN)

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64

        # Load cn_stride, params pointer
        # (the two stack-passed arguments; sp itself is not modified)
        LDP     x14, x8, [sp]

        # Load params values
        # LD2R reads two consecutive halfwords at x8 and replicates the first
        # across all lanes of v4 and the second across all lanes of v5.
        LD2R    {v4.8h, v5.8h}, [x8]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row, so every load and store
        # below can be issued for all four rows unconditionally.
        CMP     x0, 2                   // if mr < 2
        ADD     x11, x3, x4             // a1 = a0 + a_stride
        ADD     x9, x6, x7              // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO        //   a1 = a0
        CSEL    x9, x6, x9, LO          //   c1 = c0

        ADD     x12, x11, x4            // a2 = a1 + a_stride
        ADD     x10, x9, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
                                        // (flags still come from CMP x0, 2
                                        //  above; ADD and CSEL leave them alone)
        CSEL    x12, x11, x12, LS       //   a2 = a1
        CSEL    x10, x9, x10, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x12, x4             // a3 = a2 + a_stride (x4 no longer holds a_stride)
        ADD     x7, x10, x7             // c3 = c2 + cm_stride (x7 no longer holds cm_stride)
        CSEL    x4, x12, x4, LO         //   a3 = a2
        CSEL    x7, x10, x7, LO         //   c3 = c2

        # Outer loop: one pass per block of 16 output columns.
        # x5 only ever moves forward through w: 32 bytes of bias here, then
        # 32 bytes (16 halffloats) per k step.
0:
        # Load initial bias from w into accumulators
        # (all four rows start from the same 16 bias values)
        LDR     q24, [x5], 16
        LDR     q25, [x5], 16
        MOV     v26.16b, v24.16b
        MOV     v28.16b, v24.16b
        MOV     v30.16b, v24.16b
        MOV     v27.16b, v25.16b
        MOV     v29.16b, v25.16b
        MOV     v31.16b, v25.16b

        # Is there at least 4 halffloats (8 bytes)?
        // NOTE(review): with kc == 0 this branch is taken and the remainder
        // code still executes one k step at label 4; presumably callers
        // guarantee kc is a nonzero multiple of 2 bytes - confirm.
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    3f                      // kc < 8: only the remainder path runs

        .p2align 3
        # Main loop - 4 halffloats of A (8 bytes)
        # Each iteration also consumes 128 bytes of w (4 k steps x 16 columns).
1:
        LDR     d0,  [x3], 8            // 4 halffloats of A0
        LDR     q20, [x5], 16           // w, k step 0, columns 0-7
        LDR     q21, [x5], 16           // w, k step 0, columns 8-15
        LDR     d1, [x11], 8            // 4 halffloats of A1
        LDR     d2, [x12], 8            // 4 halffloats of A2
        LDR     d3,  [x4], 8            // 4 halffloats of A3
        LDR     q22, [x5], 16           // w, k step 1, columns 0-7
        LDR     q23, [x5], 16           // w, k step 1, columns 8-15
        LDR     q16, [x5], 16           // w, k step 2, columns 0-7
        LDR     q17, [x5], 16           // w, k step 2, columns 8-15
        LDR     q18, [x5], 16           // w, k step 3, columns 0-7
        LDR     q19, [x5], 16           // w, k step 3, columns 8-15
        SUBS    x0, x0, 8               // k -= 8; flags are consumed by B.HS below
        // k step 0: lane 0 of each A register
        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v25.8h, v21.8h, v0.h[0]
        FMLA    v26.8h, v20.8h, v1.h[0]
        FMLA    v27.8h, v21.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v29.8h, v21.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        FMLA    v31.8h, v21.8h, v3.h[0]
        // k step 1: lane 1
        FMLA    v24.8h, v22.8h, v0.h[1]
        FMLA    v25.8h, v23.8h, v0.h[1]
        FMLA    v26.8h, v22.8h, v1.h[1]
        FMLA    v27.8h, v23.8h, v1.h[1]
        FMLA    v28.8h, v22.8h, v2.h[1]
        FMLA    v29.8h, v23.8h, v2.h[1]
        FMLA    v30.8h, v22.8h, v3.h[1]
        FMLA    v31.8h, v23.8h, v3.h[1]

        // k step 2: lane 2
        FMLA    v24.8h, v16.8h, v0.h[2]
        FMLA    v25.8h, v17.8h, v0.h[2]
        FMLA    v26.8h, v16.8h, v1.h[2]
        FMLA    v27.8h, v17.8h, v1.h[2]
        FMLA    v28.8h, v16.8h, v2.h[2]
        FMLA    v29.8h, v17.8h, v2.h[2]
        FMLA    v30.8h, v16.8h, v3.h[2]
        FMLA    v31.8h, v17.8h, v3.h[2]
        // k step 3: lane 3
        FMLA    v24.8h, v18.8h, v0.h[3]
        FMLA    v25.8h, v19.8h, v0.h[3]
        FMLA    v26.8h, v18.8h, v1.h[3]
        FMLA    v27.8h, v19.8h, v1.h[3]
        FMLA    v28.8h, v18.8h, v2.h[3]
        FMLA    v29.8h, v19.8h, v2.h[3]
        FMLA    v30.8h, v18.8h, v3.h[3]
        FMLA    v31.8h, v19.8h, v3.h[3]
        B.HS    1b                      // loop while the SUBS above did not borrow

        # Is there a remainder- 1 to 3 halffloats of A (2 to 6 bytes)
        ANDS    x0, x0, 7               // x0 = leftover bytes (kc mod 8)
        B.NE    3f

2:
        # Clamp
        # FMAX against v4 first, then FMIN against v5. The SUBS on nc is
        # interleaved here; its flags are consumed by B.LO and B.HI below and
        # are not disturbed by the vector instructions, ST1 or SUB.
        FMAX    v24.8h, v24.8h, v4.8h
        SUBS    x1, x1, 16              // nc -= 16
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 4 x 16, unless fewer than 16 columns were left
        B.LO    5f                      // nc was < 16: partial-width store

        // Each C pointer is post-incremented by cn_stride (x14) and each A
        // pointer is rewound by kc (x2) for the next 16-column block.
        ST1     {v24.16b, v25.16b},  [x6], x14
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v26.16b, v27.16b},  [x9], x14
        SUB     x11, x11, x2            // a1 -= kc
        ST1     {v28.16b, v29.16b}, [x10], x14
        SUB     x12, x12, x2            // a2 -= kc
        ST1     {v30.16b, v31.16b},  [x7], x14
        SUB     x4,  x4, x2             // a3 -= kc

        B.HI    0b                      // more columns remain (nc > 0 after the SUBS)

        RET

        # Remainder- 1 to 3 halffloats of A (2 to 6 bytes)
        # The low three bits of x0 equal kc mod 8 on both entry paths: after
        # the ANDS above, or directly from k = kc - 8 when kc < 8.
3:
        TBZ     x0, 2, 4f               // bit 2 clear: no 4-byte (2 halffloat) piece
        LDR     s0,  [x3], 4            // 2 halffloats of A0
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     s1, [x11], 4            // 2 halffloats of A1
        LDR     s2, [x12], 4            // 2 halffloats of A2
        LDR     s3,  [x4], 4            // 2 halffloats of A3
        LDR     q22, [x5], 16
        LDR     q23, [x5], 16
        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v25.8h, v21.8h, v0.h[0]
        FMLA    v26.8h, v20.8h, v1.h[0]
        FMLA    v27.8h, v21.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v29.8h, v21.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        FMLA    v31.8h, v21.8h, v3.h[0]
        FMLA    v24.8h, v22.8h, v0.h[1]
        FMLA    v25.8h, v23.8h, v0.h[1]
        FMLA    v26.8h, v22.8h, v1.h[1]
        FMLA    v27.8h, v23.8h, v1.h[1]
        FMLA    v28.8h, v22.8h, v2.h[1]
        FMLA    v29.8h, v23.8h, v2.h[1]
        FMLA    v30.8h, v22.8h, v3.h[1]
        FMLA    v31.8h, v23.8h, v3.h[1]
        TBZ     x0, 1, 2b               // bit 1 clear: no final halffloat, go clamp

        // Final single halffloat of A (2 bytes) per row
4:
        LDR     h0,  [x3], 2
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     h1, [x11], 2
        LDR     h2, [x12], 2
        LDR     h3,  [x4], 2
        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v25.8h, v21.8h, v0.h[0]
        FMLA    v26.8h, v20.8h, v1.h[0]
        FMLA    v27.8h, v21.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v29.8h, v21.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        FMLA    v31.8h, v21.8h, v3.h[0]
        B       2b


        # Store odd width
        # x1 holds nc - 16 here; its low four bits equal those of the
        # remaining column count, so each bit selects a power-of-two piece.
        # After each piece is stored, the not-yet-stored values are moved
        # down into the low end of v24/v26/v28/v30.
5:
        TBZ     x1, 3, 6f               // 8 columns (16 bytes)
        STR     q24, [x6], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x9], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x10], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b

6:
        TBZ     x1, 2, 7f               // 4 columns (8 bytes)
        STR     d24, [x6], 8
        STR     d26, [x9], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x10], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

7:
        TBZ     x1, 1, 8f               // 2 columns (4 bytes)
        STR     s24, [x6], 4
        STR     s26, [x9], 4
        DUP     s24, v24.s[1]
        DUP     s26, v26.s[1]
        STR     s28, [x10], 4
        STR     s30, [x7], 4
        DUP     s28, v28.s[1]
        DUP     s30, v30.s[1]

8:
        TBZ     x1, 0, 9f               // 1 column (2 bytes); last piece, no pointer update
        STR     h24, [x6]
        STR     h26, [x9]
        STR     h28, [x10]
        STR     h30, [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif