// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const void**restrict a,            x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x8
#     size_t a_offset,                   [sp + 8] -> x11
#     const void* zero,                  [sp + 16] -> x12
#     const xnn_f16_minmax_params params [sp + 24] -> (x8)

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.

# Register usage
# A0  x14 v0
# A1  x15 v1
# A2  x20 v2
# A3  x21 v3
# A4  x22 v4
# A5  x23 v5

# B   x5 v16 v17 v18 v19

# C0  x6 v20 v21
# C1 x16 v22 v23
# C2 x17 v24 v25
# C3 x10 v26 v27
# C4 x13 v28 v29
# C5  x7 v30 v31

# Clamp v6, (v4), (v5)
# unused v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Clamp C pointers
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        # Load params
        LDR s6, [x8]

        CMP x0, 4                // if mr < 4
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2
        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x13, x10, x13, LS   //   c4 = c3
        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x7, x13, x7, LO     //   c5 = c4

        LDP x8, x11, [sp]        // load cn_stride, a_offset

        # Save x20-x23 on stack
        STP x20, x21, [sp, -32]!
        STP x22, x23, [sp, 16]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV x9, x3               // p = ks
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 320]
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

1:
        # Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        CMP x14, x12             // if a0 == zero
        ADD x14, x14, x11        // a0 += a_offset
        CSEL x14, x12, x14, EQ   //   a0 = zero, else a0 += a_offset
        CMP x15, x12             // if a1 == zero
        ADD x15, x15, x11        // a1 += a_offset
        CSEL x15, x12, x15, EQ   //   a1 = zero, else a1 += a_offset
        CMP x20, x12             // if a2 == zero
        ADD x20, x20, x11        // a2 += a_offset
        CSEL x20, x12, x20, EQ   //   a2 = zero, else a2 += a_offset
        CMP x21, x12             // if a3 == zero
        ADD x21, x21, x11        // a3 += a_offset
        CSEL x21, x12, x21, EQ   //   a3 = zero, else a3 += a_offset
        CMP x22, x12             // if a4 == zero
        ADD x22, x22, x11        // a4 += a_offset
        CSEL x22, x12, x22, EQ   //   a4 = zero, else a4 += a_offset
        CMP x23, x12             // if a5 == zero
        ADD x23, x23, x11        // a5 += a_offset
        CSEL x23, x12, x23, EQ   //   a5 = zero, else a5 += a_offset
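        # Note on the CMP/ADD/CSEL sequences above: they implement the IGEMM
        # zero-pointer convention. Rows of the indirection buffer that point
        # at the shared zero buffer are used as-is, while every other row
        # pointer is rebased by a_offset. A rough C equivalent (a sketch with
        # illustrative names, not the actual XNNPACK source):
        #
        #   const uint16_t* a0 = (const uint16_t*) a[0];
        #   if (a0 != zero) {
        #     a0 = (const uint16_t*) ((uintptr_t) a0 + a_offset);
        #   }
        #
        # and likewise for a1..a5.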
        # Are there at least 2 halffloats (4 bytes)?
        SUBS x0, x2, 4           // k = kc - 4
        B.LO 4f

        .p2align 3
        # Main loop - 2 halffloats of A (4 bytes)
        # 24 FMA + 6 ld32 A + 4 LDR B
2:
        LDR s0, [x14], 4         // A0
        LDR q16, [x5], 16        // B
        LDR q17, [x5], 16        // B
        LDR s1, [x15], 4         // A1
        LDR s2, [x20], 4         // A2
        LDR s3, [x21], 4         // A3
        LDR s4, [x22], 4         // A4
        LDR s5, [x23], 4         // A5
        LDR q18, [x5], 16        // B
        LDR q19, [x5], 16        // B
        SUBS x0, x0, 4
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]
        B.HS 2b

        # Is there a remainder? - 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 4f

3:
        # ks loop
        SUBS x9, x9, 48          // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        DUP v4.8h, v6.h[0]
        DUP v5.8h, v6.h[1]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v21.8h, v21.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v23.8h, v23.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v25.8h, v25.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v27.8h, v27.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v29.8h, v29.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMAX v31.8h, v31.8h, v4.8h
        SUBS x1, x1, 16
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v21.8h, v21.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v23.8h, v23.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v25.8h, v25.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v27.8h, v27.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v29.8h, v29.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h
        FMIN v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO 5f

        ST1 {v30.16b, v31.16b}, [x7], x8
        ST1 {v28.16b, v29.16b}, [x13], x8
        ST1 {v26.16b, v27.16b}, [x10], x8
        ST1 {v24.16b, v25.16b}, [x17], x8
        ST1 {v22.16b, v23.16b}, [x16], x8
        ST1 {v20.16b, v21.16b}, [x6], x8

        SUB x4, x4, x3           // a -= ks

        # nc loop
        B.HI 0b

        # Restore x20-x23 from stack
        LDP x22, x23, [sp, 16]
        LDP x20, x21, [sp], 32
        RET

4:
        # Remainder - 1 halffloat of A (2 bytes)
        LDR h0, [x14], 2         // A0
        LDR q16, [x5], 16        // B
        LDR q17, [x5], 16        // B
        LDR h1, [x15], 2         // A1
        LDR h2, [x20], 2         // A2
        LDR h3, [x21], 2         // A3
        LDR h4, [x22], 2         // A4
        LDR h5, [x23], 2         // A5
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        B 3b
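        # Note on the tail below: an nc remainder of 1-15 halffloats is stored
        # by testing the low bits of nc (x1): bit 3 selects an 8-element store,
        # bit 2 a 4-element store, bit 1 a 2-element store, and bit 0 the last
        # halffloat. After each partial store, MOV/DUP shifts the surviving
        # lanes down so the next test always starts at lane 0. A rough C
        # equivalent for one output row (a sketch, not the actual XNNPACK
        # source):
        #
        #   if (nc & 8) { store 8 halffloats; c0 += 8; shift accumulator down 8 lanes; }
        #   if (nc & 4) { store 4 halffloats; c0 += 4; shift accumulator down 4 lanes; }
        #   if (nc & 2) { store 2 halffloats; c0 += 2; shift accumulator down 2 lanes; }
        #   if (nc & 1) { store the last halffloat; }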
        # Store odd width
5:
        TBZ x1, 3, 6f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
6:
        TBZ x1, 2, 7f
        STR d30, [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

7:
        TBZ x1, 1, 8f
        STR s30, [x7], 4
        STR s28, [x13], 4
        DUP s30, v30.s[1]
        DUP s28, v28.s[1]
        STR s26, [x10], 4
        STR s24, [x17], 4
        DUP s26, v26.s[1]
        DUP s24, v24.s[1]
        STR s22, [x16], 4
        STR s20, [x6], 4
        DUP s22, v22.s[1]
        DUP s20, v20.s[1]

8:
        TBZ x1, 0, 9f
        STR h30, [x7]
        STR h28, [x13]
        STR h26, [x10]
        STR h24, [x17]
        STR h22, [x16]
        STR h20, [x6]
9:
        # Restore x20-x23 from stack
        LDP x22, x23, [sp, 16]
        LDP x20, x21, [sp], 32
        RET

END_FUNCTION xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
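# A rough sketch of how this ukernel is invoked, matching the signature
# documented at the top of this file. The argument derivations follow the
# usual XNNPACK igemm conventions (kc and ks are scaled to bytes by the
# caller) and the buffer names are illustrative, not the actual call site:
#
#   xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32(
#       mr,                          // rows of C in this tile, 1..6
#       nc,                          // columns of C in this tile
#       kc * sizeof(uint16_t),       // channels, in bytes
#       ks * 6 * sizeof(void*),      // indirection bytes consumed per tile
#       indirection_buffer, packed_weights, c, cm_stride, cn_stride,
#       a_offset, zero_buffer, &params);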