// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/4x16-aarch64-neonfp16arith-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Target: AArch64 with half-precision NEON arithmetic (FMLA/FMAX/FMIN on .8h).
// Syntax: GNU assembler, run through the C preprocessor.
//
// void xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32(
//     size_t mr,                x0
//     size_t nc,                x1
//     size_t kc,                x2 / x0
//     const void*restrict a,    x3
//     size_t a_stride,          x4
//     const void*restrict w,    x5
//     void*restrict c,          x6
//     size_t cm_stride,         x7
//     size_t cn_stride,         [sp] -> x14
//     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
//
// Computes up to 4 rows x 16 columns of half-precision output per pass:
// accumulators start from 16 bias values read from w, are updated with
// A[row][k] * B[k][0..15] for every k, are clamped, and are then stored to c.
//
// Units, as used by the code below:
//   kc                   - bytes of one A row (the loop consumes 4 bytes = 2 halffloats per trip)
//   nc                   - output columns (decremented by 16 per stored tile)
//   a_stride, cm_stride,
//   cn_stride            - bytes (added directly to pointers)
// NOTE(review): the code appears to assume kc is a non-zero multiple of 2 bytes;
// confirm against the caller.
//
// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This kernel touches none of them: only x0-x14, v0-v5 and v20-v31 are written.
//
// Register usage
//   A0   x3   v0
//   A1   x11  v1
//   A2   x12  v2
//   A3   x4   v3      (x4 arrives as a_stride and is overwritten by the A3 pointer)
//
//   B    x5   v20 v21 v22 v23
//
//   C0   x6   v24 v25
//   C1   x9   v26 v27
//   C2   x10  v28 v29
//   C3   x7   v30 v31  (x7 arrives as cm_stride and is overwritten by the C3 pointer)
//
//   Clamp     v4 (lower bound, used by fmax), v5 (upper bound, used by fmin)
//   k counter x0      (mr is dead once the row pointers have been set up)

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32

        // Fetch the two stack-passed arguments: x14 = cn_stride, x8 = params.
        ldp     x14, x8, [sp]

        // Broadcast two consecutive halffloats from params:
        // v4 = first value in every lane, v5 = second value in every lane.
        ld2r    {v4.8h, v5.8h}, [x8]

        // Build the row pointers. Rows at index >= mr alias the previous row,
        // so they re-read the same A data and re-write the same C location.
        cmp     x0, 2                   // flags <- mr ? 2 (also used by the LS selects below)
        add     x11, x3, x4             // a1 = a0 + a_stride
        add     x9, x6, x7              // c1 = c0 + cm_stride
        csel    x11, x3, x11, lo        // mr < 2:  a1 = a0
        csel    x9, x6, x9, lo          // mr < 2:  c1 = c0

        add     x12, x11, x4            // a2 = a1 + a_stride   (add leaves flags untouched)
        add     x10, x9, x7             // c2 = c1 + cm_stride
        csel    x12, x11, x12, ls       // mr <= 2: a2 = a1
        csel    x10, x9, x10, ls        // mr <= 2: c2 = c1

        cmp     x0, 4                   // flags <- mr ? 4
        add     x4, x12, x4             // a3 = a2 + a_stride   (a_stride is consumed here)
        add     x7, x10, x7             // c3 = c2 + cm_stride  (cm_stride is consumed here)
        csel    x4, x12, x4, lo         // mr < 4:  a3 = a2
        csel    x7, x10, x7, lo         // mr < 4:  c3 = c2

        // ---- One pass per tile of 16 output columns ----
0:
        // Seed the accumulators: read 16 bias halffloats from w and copy them
        // into all four rows.
        ldr     q24, [x5], 16
        ldr     q25, [x5], 16
        mov     v26.16b, v24.16b
        mov     v28.16b, v24.16b
        mov     v30.16b, v24.16b
        mov     v27.16b, v25.16b
        mov     v29.16b, v25.16b
        mov     v31.16b, v25.16b

        // k = kc - 4. If fewer than 4 bytes (2 halffloats) of A are available,
        // skip the main loop and handle the single halffloat at label 3.
        subs    x0, x2, 4
        b.lo    3f

        .p2align 3
        // ---- Main loop: 2 halffloats (4 bytes) of each A row per trip ----
1:
        ldr     s0, [x3], 4             // A0[k], A0[k+1]
        ldr     q20, [x5], 16           // B[k][0..7]
        ldr     q21, [x5], 16           // B[k][8..15]
        ldr     s1, [x11], 4            // A1[k], A1[k+1]
        ldr     s2, [x12], 4            // A2[k], A2[k+1]
        ldr     s3, [x4], 4             // A3[k], A3[k+1]
        ldr     q22, [x5], 16           // B[k+1][0..7]
        ldr     q23, [x5], 16           // B[k+1][8..15]
        subs    x0, x0, 4               // k -= 4; flags are consumed by b.hs below
        // Step k: acc[row] += B[k] * A[row][k]
        fmla    v24.8h, v20.8h, v0.h[0]
        fmla    v25.8h, v21.8h, v0.h[0]
        fmla    v26.8h, v20.8h, v1.h[0]
        fmla    v27.8h, v21.8h, v1.h[0]
        fmla    v28.8h, v20.8h, v2.h[0]
        fmla    v29.8h, v21.8h, v2.h[0]
        fmla    v30.8h, v20.8h, v3.h[0]
        fmla    v31.8h, v21.8h, v3.h[0]

        // Step k+1: acc[row] += B[k+1] * A[row][k+1]
        fmla    v24.8h, v22.8h, v0.h[1]
        fmla    v25.8h, v23.8h, v0.h[1]
        fmla    v26.8h, v22.8h, v1.h[1]
        fmla    v27.8h, v23.8h, v1.h[1]
        fmla    v28.8h, v22.8h, v2.h[1]
        fmla    v29.8h, v23.8h, v2.h[1]
        fmla    v30.8h, v22.8h, v3.h[1]
        fmla    v31.8h, v23.8h, v3.h[1]
        b.hs    1b                      // loop while the subtraction did not borrow

        // x0 is now negative. Bit 1 set means 2 bytes (1 halffloat) are left over.
        tbnz    x0, 1, 3f

2:
        // ---- Clamp: acc = min(max(acc, v4), v5) ----
        fmax    v24.8h, v24.8h, v4.8h
        subs    x1, x1, 16              // nc -= 16; flags feed b.lo 4f and b.hi 0b
        fmax    v25.8h, v25.8h, v4.8h
        fmax    v26.8h, v26.8h, v4.8h
        fmax    v27.8h, v27.8h, v4.8h
        fmax    v28.8h, v28.8h, v4.8h
        fmax    v29.8h, v29.8h, v4.8h
        fmax    v30.8h, v30.8h, v4.8h
        fmax    v31.8h, v31.8h, v4.8h
        fmin    v24.8h, v24.8h, v5.8h
        fmin    v25.8h, v25.8h, v5.8h
        fmin    v26.8h, v26.8h, v5.8h
        fmin    v27.8h, v27.8h, v5.8h
        fmin    v28.8h, v28.8h, v5.8h
        fmin    v29.8h, v29.8h, v5.8h
        fmin    v30.8h, v30.8h, v5.8h
        fmin    v31.8h, v31.8h, v5.8h

        // Fewer than 16 columns were left: take the partial-width store path.
        b.lo    4f

        // ---- Store a full 4 x 16 tile ----
        // Each C pointer advances by cn_stride; each A pointer is rewound by kc
        // so the next column tile re-reads the same A rows.
        st1     {v24.16b, v25.16b}, [x6], x14
        sub     x3, x3, x2              // a0 -= kc
        st1     {v26.16b, v27.16b}, [x9], x14
        sub     x11, x11, x2            // a1 -= kc
        st1     {v28.16b, v29.16b}, [x10], x14
        sub     x12, x12, x2            // a2 -= kc
        st1     {v30.16b, v31.16b}, [x7], x14
        sub     x4, x4, x2              // a3 -= kc

        b.hi    0b                      // columns remain (nc was > 16): next tile

        ret

        // ---- Remainder: 1 halffloat (2 bytes) of each A row ----
3:
        ldr     h0, [x3], 2
        ldr     q20, [x5], 16
        ldr     q21, [x5], 16
        ldr     h1, [x11], 2
        ldr     h2, [x12], 2
        ldr     h3, [x4], 2
        fmla    v24.8h, v20.8h, v0.h[0]
        fmla    v25.8h, v21.8h, v0.h[0]
        fmla    v26.8h, v20.8h, v1.h[0]
        fmla    v27.8h, v21.8h, v1.h[0]
        fmla    v28.8h, v20.8h, v2.h[0]
        fmla    v29.8h, v21.8h, v2.h[0]
        fmla    v30.8h, v20.8h, v3.h[0]
        fmla    v31.8h, v21.8h, v3.h[0]
        b       2b                      // rejoin at the clamp

        // ---- Store a partial tile (fewer than 16 columns) ----
        // x1 holds nc - 16 here; its low four bits equal those of the remaining
        // column count, so bits 3..0 select stores of 8, 4, 2 and 1 columns.
        // After each store the not-yet-written lanes are shifted down to lane 0.
4:
        tbz     x1, 3, 5f
        str     q24, [x6], 16           // 8 columns per row
        mov     v24.16b, v25.16b
        str     q26, [x9], 16
        mov     v26.16b, v27.16b
        str     q28, [x10], 16
        mov     v28.16b, v29.16b
        str     q30, [x7], 16
        mov     v30.16b, v31.16b

5:
        tbz     x1, 2, 6f
        str     d24, [x6], 8            // 4 columns per row
        str     d26, [x9], 8
        dup     d24, v24.d[1]
        dup     d26, v26.d[1]
        str     d28, [x10], 8
        str     d30, [x7], 8
        dup     d28, v28.d[1]
        dup     d30, v30.d[1]

6:
        tbz     x1, 1, 7f
        str     s24, [x6], 4            // 2 columns per row
        str     s26, [x9], 4
        dup     s24, v24.s[1]
        dup     s26, v26.s[1]
        str     s28, [x10], 4
        str     s30, [x7], 4
        dup     s28, v28.s[1]
        dup     s30, v30.s[1]

7:
        tbz     x1, 0, 8f
        str     h24, [x6]               // final single column per row
        str     h26, [x9]
        str     h28, [x10]
        str     h30, [x7]
8:
        ret

END_FUNCTION xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif