// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/4x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Syntax: GNU as for AArch64, preprocessed by cpp (this is a .S file).
#
# Summary: for each block of up to 8 output columns, the four accumulators
# are initialised with 8 halffloats read from w, then every halffloat of the
# four A rows is multiplied by the next 8 halffloats of w and accumulated
# (FMLA by element). The results are clamped with FMAX against v4 and FMIN
# against v5, then stored through c0..c3.
#
# NOTE(review): the kc < 8 path at label 3 assumes kc is a nonzero multiple
# of 2 bytes (with bit 2 of kc clear it runs the 1-halffloat remainder
# without testing bit 1). Confirm that callers guarantee this.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Only x0-x14, v0-v5, v20-v24, v26, v28 and v30 are written here, so nothing
# is saved or restored and the stack is only read.

# A pointers
#  x3 a0
# x11 a1
# x12 a2
#  x4 a3 / a_stride

# C pointers
#  x6 c0
#  x9 c1
# x10 c2
#  x7 c3 / cm_stride

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# B   v20 v21 v22 v23
# C   v24
# C   v26
# C   v28
# C   v30

# Clamp v4 (operand of FMAX), v5 (operand of FMIN)
# Unused v6-v19, v25, v27, v29, v31

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

        # Load cn_stride, params pointer
        LDP     x14, x8, [sp]

        # Load params values: two consecutive halffloats at [x8], the first
        # replicated across v4 and the second across v5
        LD2R    {v4.8h, v5.8h}, [x8]

        # Clamp A and C pointers: a row index that is not below mr reuses the
        # pointers of the previous row
        CMP     x0, 2                   // if mr < 2
        ADD     x11, x3, x4             // a1 = a0 + a_stride
        ADD     x9, x6, x7              // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO        //   a1 = a0
        CSEL    x9, x6, x9, LO          //   c1 = c0

        ADD     x12, x11, x4            // a2 = a1 + a_stride
        ADD     x10, x9, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x12, x11, x12, LS       //   a2 = a1
        CSEL    x10, x9, x10, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x12, x4             // a3 = a2 + a_stride (x4 stops being a_stride)
        ADD     x7, x10, x7             // c3 = c2 + cm_stride (x7 stops being cm_stride)
        CSEL    x4, x12, x4, LO         //   a3 = a2
        CSEL    x7, x10, x7, LO         //   c3 = c2

0:
        # Load initial bias from w into accumulators
        LDR     q24, [x5], 16
        MOV     v26.16b, v24.16b
        MOV     v28.16b, v24.16b
        MOV     v30.16b, v24.16b

        # Is there at least 4 halffloats (8 bytes)?
        # From here on x0 no longer holds mr; it is the k counter.
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    3f

        # Main loop - 4 halffloats of A (8 bytes)
1:
        LDR     d0, [x3], 8
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     d1, [x11], 8
        LDR     d2, [x12], 8
        LDR     d3, [x4], 8
        LDR     q22, [x5], 16
        LDR     q23, [x5], 16
        SUBS    x0, x0, 8               // k -= 8; these flags feed B.HS below
        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v26.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        FMLA    v24.8h, v21.8h, v0.h[1]
        FMLA    v26.8h, v21.8h, v1.h[1]
        FMLA    v28.8h, v21.8h, v2.h[1]
        FMLA    v30.8h, v21.8h, v3.h[1]

        FMLA    v24.8h, v22.8h, v0.h[2]
        FMLA    v26.8h, v22.8h, v1.h[2]
        FMLA    v28.8h, v22.8h, v2.h[2]
        FMLA    v30.8h, v22.8h, v3.h[2]
        FMLA    v24.8h, v23.8h, v0.h[3]
        FMLA    v26.8h, v23.8h, v1.h[3]
        FMLA    v28.8h, v23.8h, v2.h[3]
        FMLA    v30.8h, v23.8h, v3.h[3]
        B.HS    1b                      // loop while the subtraction did not borrow

        # x0 is now negative, but only multiples of 8 were subtracted from kc,
        # so bits 0-2 of x0 still equal bits 0-2 of kc.
        # Is there a remainder? - 2 halffloats of A (4 bytes)
        TBNZ    x0, 2, 4f
        # Is there a remainder? - 1 halffloat of A (2 bytes)
        TBNZ    x0, 1, 5f
2:
        # Clamp
        FMAX    v24.8h, v24.8h, v4.8h
        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO and B.HI below
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h

        # Store full 4 x 8, unless fewer than 8 columns were left
        B.LO    6f

        # Each C pointer advances by cn_stride; each A pointer is rewound by
        # kc so that the next pass over label 0 rereads the same A rows.
        ST1     {v24.16b}, [x6], x14
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v26.16b}, [x9], x14
        SUB     x11, x11, x2            // a1 -= kc
        ST1     {v28.16b}, [x10], x14
        SUB     x12, x12, x2            // a2 -= kc
        ST1     {v30.16b}, [x7], x14
        SUB     x4, x4, x2              // a3 -= kc

        # ST1 and SUB leave the flags alone, so this still tests nc - 8 > 0
        B.HI    0b
        RET

3:
        # kc < 8 bytes: go straight to the 1-halffloat remainder when bit 2
        # of k is clear, otherwise fall into the 2-halffloat remainder
        TBZ     x0, 2, 5f
4:
        # Remainder - 2 halffloats of A (4 bytes)
        LDR     s0, [x3], 4
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     s1, [x11], 4
        LDR     s2, [x12], 4
        LDR     s3, [x4], 4

        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v26.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]

        FMLA    v24.8h, v21.8h, v0.h[1]
        FMLA    v26.8h, v21.8h, v1.h[1]
        FMLA    v28.8h, v21.8h, v2.h[1]
        FMLA    v30.8h, v21.8h, v3.h[1]
        TBZ     x0, 1, 2b               // no trailing single halffloat: clamp and store

5:
        # Remainder - 1 halffloat of A (2 bytes)
        LDR     h0, [x3], 2
        LDR     q20, [x5], 16
        LDR     h1, [x11], 2
        LDR     h2, [x12], 2
        LDR     h3, [x4], 2
        FMLA    v24.8h, v20.8h, v0.h[0]
        FMLA    v26.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        B       2b

        # Store odd width
        # x1 holds nc - 8 here, so its bits 0-2 equal those of the remaining
        # column count. After each partial store the upper part of every
        # accumulator is moved down to lane 0 for the next, narrower store.
6:
        TBZ     x1, 2, 7f               // 4 halffloats (8 bytes) per row?
        STR     d24, [x6], 8
        STR     d26, [x9], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x10], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

7:
        TBZ     x1, 1, 8f               // 2 halffloats (4 bytes) per row?
        STR     s24, [x6], 4
        STR     s26, [x9], 4
        DUP     s24, v24.s[1]
        DUP     s26, v26.s[1]
        STR     s28, [x10], 4
        STR     s30, [x7], 4
        DUP     s28, v28.s[1]
        DUP     s30, v30.s[1]

8:
        TBZ     x1, 0, 9f               // 1 halffloat (2 bytes) per row?
        STR     h24, [x6]
        STR     h26, [x9]
        STR     h28, [x10]
        STR     h30, [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif